├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── python-publish.yml │ └── python-test.yml ├── .gitignore ├── .readthedocs.yml ├── AUTHORS.md ├── CHANGE.txt ├── CONTRIBUTE.md ├── CONTRIBUTE_CH.md ├── EduNLP ├── Formula │ ├── Formula.py │ ├── README.md │ ├── __init__.py │ ├── ast │ │ ├── __init__.py │ │ ├── ast.py │ │ ├── katex.py │ │ └── readme.md │ └── viz │ │ ├── __init__.py │ │ ├── m_viz.py │ │ ├── tree_viz.py │ │ ├── utils.py │ │ └── viz.py ├── I2V │ ├── __init__.py │ └── i2v.py ├── ModelZoo │ ├── __init__.py │ ├── base_model.py │ ├── bert │ │ ├── __init__.py │ │ └── bert.py │ ├── disenqnet │ │ ├── __init__.py │ │ ├── disenqnet.py │ │ ├── modules.py │ │ └── utils.py │ ├── quesnet │ │ ├── __init__.py │ │ ├── modules.py │ │ ├── quesnet.py │ │ └── util.py │ ├── rnn │ │ ├── __init__.py │ │ ├── harnn.py │ │ └── rnn.py │ └── utils │ │ ├── __init__.py │ │ ├── data.py │ │ ├── device.py │ │ ├── downstream_output.py │ │ ├── masker.py │ │ ├── modules.py │ │ ├── padder.py │ │ └── torch_utils.py ├── Pipeline │ ├── __init__.py │ ├── base.py │ ├── components.py │ ├── knowledge_prediction.py │ ├── mappings.py │ └── property_prediction.py ├── Pretrain │ ├── __init__.py │ ├── bert_vec.py │ ├── disenqnet_vec.py │ ├── elmo_vec.py │ ├── gensim_vec.py │ ├── hugginface_utils.py │ ├── pretrian_utils.py │ └── quesnet_vec.py ├── SIF │ ├── __init__.py │ ├── constants.py │ ├── parser │ │ ├── __init__.py │ │ └── parser.py │ ├── segment │ │ ├── __init__.py │ │ └── segment.py │ ├── sif.py │ └── tokenization │ │ ├── __init__.py │ │ ├── formula │ │ ├── __init__.py │ │ ├── ast_token.py │ │ ├── formula.py │ │ └── linear_token.py │ │ ├── text │ │ ├── __init__.py │ │ ├── stopwords.py │ │ └── tokenization.py │ │ └── tokenization.py ├── Tokenizer │ ├── __init__.py │ └── tokenizer.py ├── Vector │ ├── __init__.py │ ├── bert_vec.py │ ├── const.py │ ├── disenqnet │ │ ├── __init__.py │ │ └── disenqnet.py │ ├── elmo_vec.py │ ├── embedding.py │ ├── gensim_vec.py │ ├── meta.py │ ├── quesnet │ │ ├── __init__.py │ │ └── quesnet.py │ ├── rnn │ │ ├── __init__.py │ │ └── rnn.py │ └── t2v.py ├── __init__.py ├── constant.py ├── main.py ├── meta_data │ └── sif_stopwords.txt └── utils │ ├── __init__.py │ ├── data.py │ ├── image.py │ ├── log.py │ └── path.py ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── asset └── _static │ ├── d2v.png │ ├── d2v_bow_tfidf.png │ ├── d2v_general.png │ ├── d2v_stem_tf.png │ ├── data.png │ ├── formula.png │ ├── i2v.png │ ├── item.png │ ├── item_figure.png │ ├── item_formula.png │ ├── parse.png │ ├── prepare_dataset.jpg │ ├── seg.png │ ├── sif.png │ ├── sif_addition.png │ ├── tokenizer.png │ ├── w2v_stem_text.png │ └── w2v_stem_tf.png ├── docs ├── EduNLP.png ├── Makefile ├── README.md ├── SIF4TI_CH.md ├── make.bat ├── requirements.txt ├── source │ ├── _static │ │ ├── EduNLP.png │ │ ├── formula.png │ │ ├── formulagroup.png │ │ ├── pipeline.png │ │ └── 流程图.png │ ├── api │ │ ├── ModelZoo.rst │ │ ├── formula.rst │ │ ├── i2v.rst │ │ ├── index.rst │ │ ├── pipeline.rst │ │ ├── pretrain.rst │ │ ├── sif.rst │ │ ├── tokenizer.rst │ │ ├── utils.rst │ │ └── vector.rst │ ├── conf.py │ ├── index.rst │ └── tutorial │ │ ├── en │ │ ├── index.rst │ │ ├── parse │ │ │ ├── FormulaSyntaxStructureParsing.rst │ │ │ └── TextSyntaxStructureParsing.rst │ │ ├── pipeline.rst │ │ ├── pretrain.rst │ │ ├── pretrain │ │ │ ├── loading.rst │ │ │ ├── pub.rst │ │ │ └── start.rst │ │ ├── seg.rst │ │ ├── seg │ │ │ ├── 
SemanticComponentSegmentation.rst │ │ │ └── StructuralComponentSegmentation.rst │ │ ├── sif.rst │ │ ├── tokenization.rst │ │ ├── tokenization │ │ │ ├── GensimSegTokenizer.rst │ │ │ ├── GensimWordTokenizer.rst │ │ │ ├── PureTextTokenizer.rst │ │ │ └── TextTokenizer.rst │ │ ├── tokenize.rst │ │ ├── tokenize │ │ │ ├── Sentence Segmentation.rst │ │ │ ├── Tokenization.rst │ │ │ └── WordSegmentation.rst │ │ ├── vectorization.rst │ │ └── vectorization │ │ │ ├── WithPre-trainedModel.rst │ │ │ └── WithoutPre-trainedModel.rst │ │ └── zh │ │ ├── formula.rst │ │ ├── index.rst │ │ ├── pipeline.rst │ │ ├── pretrain.rst │ │ ├── seg.rst │ │ ├── sif.rst │ │ ├── tokenization.rst │ │ ├── tokenize.rst │ │ └── vectorization.rst └── tutorial.ipynb ├── examples ├── downstream │ ├── difficulty_prediction │ │ ├── difficulty_prediction.ipynb │ │ └── utils.py │ ├── discrimination_prediction │ │ ├── discrimination_prediction.ipynb │ │ └── utils.py │ ├── knowledge_prediction │ │ ├── konwledge_prediction.ipynb │ │ └── utils.py │ ├── paper_segmentation │ │ ├── load_data.py │ │ ├── model.py │ │ ├── paper_segmentation.ipynb │ │ ├── samples │ │ │ └── train │ │ │ │ └── math │ │ │ │ └── paper_1.txt │ │ ├── trainer.py │ │ └── utils.py │ ├── quality_evaluation │ │ ├── quality_evaluation.ipynb │ │ └── train.py │ └── similarity_prediction │ │ └── similarity_prediction.ipynb ├── formula │ ├── formula.ipynb │ ├── formula.py │ └── tree.ipynb ├── i2v │ ├── get_pretrained_i2v.ipynb │ ├── get_pretrained_i2v_d2v_w2v.ipynb │ ├── i2v.ipynb │ ├── i2v_bert.ipynb │ ├── i2v_d2v.ipynb │ ├── i2v_disenq.ipynb │ ├── i2v_elmo.ipynb │ ├── i2v_quesnet.ipynb │ └── i2v_w2v.ipynb ├── pipeline │ └── pipeline.ipynb ├── pretrain │ ├── bert.ipynb │ ├── disenq.ipynb │ ├── elmo.ipynb │ ├── gensim │ │ ├── d2v_bow_tfidf.ipynb │ │ ├── d2v_general.ipynb │ │ ├── d2v_stem_tf.ipynb │ │ ├── w2v_stem_text.ipynb │ │ └── w2v_stem_tf.ipynb │ ├── hugginface_tokenizer.ipynb │ ├── prepare_dataset.ipynb │ ├── pretrained_tokenizer.ipynb │ ├── quesnet.ipynb │ ├── rnn │ │ └── rnn.py │ └── seg_token │ │ ├── d2v.ipynb │ │ ├── d2v_d1.ipynb │ │ └── d2v_d2.ipynb ├── sif │ ├── item.json │ ├── parse │ │ └── parse.ipynb │ ├── sci4sif.py │ ├── seg │ │ └── seg.ipynb │ ├── sif4sci.ipynb │ ├── sif_addition.ipynb │ ├── sif_check.ipynb │ └── tokenize │ │ └── tokenization.ipynb ├── t2v │ ├── get_pretrained_t2v.ipynb │ ├── t2v.ipynb │ ├── t2v_bert.ipynb │ ├── t2v_d2v.ipynb │ ├── t2v_disenq.ipynb │ ├── t2v_elmo.ipynb │ ├── t2v_quesnet.ipynb │ └── t2v_w2v.ipynb ├── test_model │ └── w2v │ │ └── gensim_luna_stem_t_sg_100.kv ├── tokenizer │ ├── all_tokenize.ipynb │ ├── test_stopwords.txt │ └── tokenizer.ipynb └── utils │ └── data.ipynb ├── pytest.ini ├── scripts └── extlib │ └── katex2python.py ├── setup.cfg ├── setup.py ├── static └── test_data │ ├── quesnet_img │ └── 000004d6-0479-11ec-829b-797d5eb43535.png │ └── standard_luna_data.json └── tests ├── __init__.py ├── test_ast.py ├── test_formula.py ├── test_i2v ├── __init__.py └── test_pretrained.py ├── test_main.py ├── test_model_zoo └── test_rnn.py ├── test_pipeline ├── conftest.py └── test_pipelines.py ├── test_pretrain ├── __init__.py ├── conftest.py ├── test_hugginface_utils.py ├── test_pretrain_utils.py ├── test_pretrained_bert.py ├── test_pretrained_disenqnet.py ├── test_pretrained_elmo.py └── test_pretrained_quesnet.py ├── test_sif ├── __init__.py ├── conftest.py ├── test_parser.py ├── test_segement.py ├── test_sif.py └── test_tokenization.py ├── test_tokenizer ├── __init__.py └── test_tokenizer.py ├── test_utils └── test_modules.py └── 
test_vec ├── __init__.py ├── conftest.py ├── test_t2v.py └── test_vec.py
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: 'Bug, needs triage'

---
## 🐛 Description
(A clear and concise description of what the bug is.)

### Error Message
(Paste the complete error message. Please also include the stack trace by setting the environment variable `DMLC_LOG_STACK_TRACE_DEPTH=100` before running your script.)

## To Reproduce
(If you developed your own code, please provide a short script that reproduces the error. For existing examples, please provide a link.)

### Steps to reproduce
(Paste the commands you ran that produced the error.)

1.
2.

## What have you tried to solve it?

1.
2.

## Environment

<details>
<summary>Environment Information</summary>

**Operating System:** ...

**Python Version:** (e.g., python3.6, anaconda/python3.7, venv/python3.8)

</details>
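(If helpful, a minimal snippet — purely illustrative, not part of the template — for collecting the details above:)

```python
import platform
import sys

print("Operating System:", platform.platform())
print("Python Version:", sys.version)
```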

## Additional context
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation.md:
--------------------------------------------------------------------------------
---
name: 📚 Documentation
about: Update API documentation or add data analysis
---

## 📚 Documentation
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: 'Feature request'

---

## Description
(A clear and concise description of what the feature is.)
- If the proposal is about a new dataset, provide a description of what the dataset is and
  attach a basic data analysis with it.
- If the proposal is about an API, provide mock examples if possible.

## References
- list references and related literature
- list known implementations
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
Thanks for sending a pull request!
Please make sure you click the link above to view the [contribution guidelines](../CONTRIBUTE.md),
then fill out the blanks below.

## Description ##
(A brief description of what this PR is about)

### What does this implement/fix? Explain your changes.
...

#### Pull request type
- [ ] [DATASET] Add a new dataset
- [ ] [BUGFIX] Bugfix
- [ ] [FEATURE] New feature (non-breaking change which adds functionality)
- [ ] [BREAKING] Breaking change (fix or feature that would cause existing functionality to not work as expected)
- [ ] [STYLE] Code style update (formatting, renaming)
- [ ] [REFACTOR] Refactoring (no functional changes, no API changes)
- [ ] [BUILD] Build-related changes
- [ ] [DOC] Documentation content changes
- [ ] [OTHER] Other (please describe):


#### Changes
- Feature1, tests, (and when applicable, API doc)
- Feature2, tests, (and when applicable, API doc)

or

- Fix1, tests
- Fix2, tests

### Does this close any currently open issues?
...

### Any relevant logs, error output, etc?
...

## Checklist ##
Before you submit a pull request, please make sure you have done the following:

### Essentials ###
- [ ] PR's title starts with a category (e.g. [BUGFIX], [FEATURE], [BREAKING], [DOC], etc.)
- [ ] Changes are complete (i.e. I finished coding on this PR)
- [ ] All changes have test coverage and all tests pass
- [ ] Code is well-documented (extended the README / documentation, if necessary)
- [ ] If this PR is your first one, add your name and GitHub account to [AUTHORS.md](../AUTHORS.md)

## Comments ##
- If this change is backward incompatible, explain why it must be made.
- Interesting edge cases to note here
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
# This workflow will upload a Python package using Twine when a release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

name: Upload Python Package

on:
  release:
    types: [created]

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python
      uses: actions/setup-python@v2
      with:
        python-version: '3.x'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install setuptools wheel twine
    - name: Build and publish
      env:
        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
      run: |
        python setup.py sdist bdist_wheel
        twine upload dist/*
--------------------------------------------------------------------------------
/.github/workflows/python-test.yml:
--------------------------------------------------------------------------------

name: test

on: [push, pull_request]

jobs:
  build:

    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        python-version: [3.6, 3.7, 3.8, 3.9]
        include:
          - os: "ubuntu-latest"
          - os: "ubuntu-20.04"
            python-version: "3.6"

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        pip install -e .[test,full]
        pip install codecov
    - name: Test with pytest
      run: |
        pytest
        codecov
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
**/_build/
**/_build/*

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# IDE
.idea/
.vscode/
.DS_Store

# Pyre type checker
.pyre/

# User Definition
data/
deprecated/
tmp*/
jieba.cache
*.kv
*.zip
examples/test_model
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/source/conf.py

# Build documentation with MkDocs
#mkdocs:
#  configuration: mkdocs.yml

# Optionally build your docs in additional formats such as PDF and ePub
formats: []

# Optionally set the version of Python and requirements
# required to build your docs
python:
  version: 3.7
  install:
    - requirements: docs/requirements.txt
    - method: pip
      path: .
      extra_requirements:
        - full
--------------------------------------------------------------------------------
/AUTHORS.md:
--------------------------------------------------------------------------------
# AUTHORS

[Shiwei Tong*](https://github.com/tswsxk)

[Rui Lv](https://github.com/karin0018)

[Fangzhou Yao](https://github.com/fannazya)

[Jinze Wu](https://github.com/hxwujinze)

[Xin Wang](https://github.com/WangXin1198)

[Longhu Qin](https://github.com/KenelmQLH)

[Pingzhi Li](https://github.com/pingzhiLi)

[Meikai Bao](https://github.com/BAOOOOOM)

[Yuting Ning](https://github.com/nnnyt)

[Jundong Wu](https://github.com/wintermelon008)

[Shangzi Xue](https://github.com/ShangziXue)

The starred contributors are the corresponding authors.
--------------------------------------------------------------------------------
/CHANGE.txt:
--------------------------------------------------------------------------------
v1.0.0
1. Support CUDA for I2V and T2V.
2. Add demos for downstream tasks including knowledge & difficulty & discrimination prediction, similarity prediction and paper segmentation.
3. Refactor quesnet for pretraining and vectorization.
4. Update documents about tutorials and API.

v0.0.9
1. Refactor tokenizers: Basic Tokenizer and Pretrained Tokenizer
2. Refactor model structures following huggingface styles for Elmo, BERT, DisenQNet and QuesNet
3. Add PreprocessingPipeline and Pipeline
4. Add downstream tasks: knowledge prediction and property prediction
5. Fix a bug in RNN which caused ELMo not to converge
6. Move all the test models to modelhub
7. Update test data files

v0.0.8
1. add Elmo
2. add DisenQNet
3. add QuesNet
4. add tal-edu-bert
5. add dynamic mapping table from modelhub
6. fix cuda error
7. update pretrained models

v0.0.7:
1. add BERT and pretrained model (luna_bert)
2. speed up the process in sif
3. handle OOV in word2vec
4. add English tutorials
5. add api docs and prettify tutorials
6. fix the np.error in gensim_vec.W2V.infer_vector
7. fix the parameters lost in tokenization

v0.0.6:
1. dev: add half-pretrained rnn model
2. important!!!: rename TextTokenizer to PureTextTokenizer, and add a new tokenizer named TextTokenizer (the two have similar but not the same behaviours).
3. sif: add $\textf{}$ syntax
4. add two pretrained w2v models: w2v_sci_300 and w2v_lit_300

v0.0.5:
1. fix the missing stopwords.txt when using pip install

v0.0.4:
1. fix the project errors

v0.0.3:
1. update formula ast: supporting more symbols and functions defined in katex
2. add token-to-vector tools, including word2vec and doc2vec using gensim
3. sci4sif supports tokenization grouped by segments
4. add special tokens: \SIFTag and \SIFSep
5. add item-to-vector tools
6. add an interface for getting pretrained models, where the supported model names can be accessed by `edunlp i2v` in the command console

v0.0.2:
1. fix potential ModuleNotFoundError

v0.0.1:
1. Add Formula class to parse latex formula, which will generate the abstract syntax tree.
2. Add SIF v0.0.2.
3. Add sif4sci function which serves as a preprocess function for downstream tasks.
--------------------------------------------------------------------------------
/CONTRIBUTE_CH.md:
--------------------------------------------------------------------------------
# Contribution Guidelines

[English version](CONTRIBUTE.md)

## Getting Started

First of all, thank you for your interest in EduNLP and for helping to make it better!
Before you start contributing, please note the following:
1. If you would like us to implement a new feature:
   - You can tell us what you want through an issue, and we will promptly discuss its design and implementation.
   - Once we agree that the plan looks good, you can expect the new feature to be available soon.
2. If you want to offer a solution or bug fix for an open issue:
   - First, search for your problem in the [EduNLP issue list](https://github.com/bigdata-ustc/EduNLP/issues).
   - Then pick a specific issue and comment on it to offer your solution or bug fix.
   - If you need more details about a specific issue, please ask us.

Once you have implemented and tested your idea or bug fix, please submit it to [EduNLP](https://github.com/bigdata-ustc/EduNLP) via a Pull Request:
1. First, fork this repository into your own branch
2. Make your changes. Note: we strongly recommend that you follow our [commit format conventions](CONTRIBUTE_CH.md#commit-format)
3. Pass the code tests with 100% test coverage; see [here](tests/test_sif) for an example
4. Submit a Pull Request to [EduNLP](https://github.com/bigdata-ustc/EduNLP). Note: we provide a standard PR template that you should fill in carefully; see [here](https://github.com/bigdata-ustc/EduNLP/pull/1) for a standard, well-formed PR

Below are some useful tips for different kinds of contributions:

### Adding a new dataset or data analysis

For new datasets or data analysis, please move to [EduData](https://github.com/bigdata-ustc/EduData).

#### Docstring style

Please use the Numpy docstring style:

```
What the function does

Parameters
----------
param_1: type, optional or not
    description
param_2: type, optional or not
    description
...

Returns
-------
variable: type
    description

See Also (optional)
--------
similar function: what the similar function does

Examples (optional)
--------
>>> how to use it
```

### Commit Format

#### commit format

```
[<type>](<scope>) <subject>
```

#### type
- `feat`: a new feature.
- `fix/to`: fixing a bug, either one found in Q&A or one you found in your own use.
  - `fix`: produces a diff and fixes the problem. **Suitable when a single commit fixes the problem directly.**
  - `to`: produces a diff but does not fully fix the problem. **Suitable for multiple commits.** Use `fix` for the final commit that resolves the problem.
- `docs`: documentation.
- `style`: formatting (changes that do not affect how the code runs).
- `refactor`: refactoring (code changes that neither add a feature nor fix a bug).
- `perf`: optimization, e.g., improving performance or user experience.
- `test`: adding tests.
- `chore`: changes to the build process or auxiliary tools.
- `revert`: rolling back to a previous version.
- `merge`: merging code.
- `sync`: syncing bugs from the main line or a branch.
- `arch`: changes to project files or tooling.

#### scope (optional)

The scope describes the area the commit affects, e.g., the data layer, the control layer, or the view layer, varying from project to project.

For example, in Angular it can be location, browser, compile, rootScope, ngHref, ngClick, ngView, etc. If your change affects more than one scope, you can use `*` instead.

#### subject (required)

The subject is a short description of the purpose of the commit, no more than 50 characters.

Do not end it with a period or any other punctuation.

#### Example

- **[docs] update the README.md**

```sh
git commit -m "[docs] update the README.md"
```

## FAQ

Q: I have tested my code carefully on my local machine and passed the code checks, but the CI step still reports an error?
A: This can happen for two reasons:
1. The online CI system differs from your own local system;
2. It may be caused by network problems; if so, you can check the CI log files.
--------------------------------------------------------------------------------
/EduNLP/Formula/README.md:
--------------------------------------------------------------------------------
0: no edge 1: self 2: younger sibling 2: elder sibling 3: child 4: parent 5: cross-tree
--------------------------------------------------------------------------------
/EduNLP/Formula/__init__.py:
--------------------------------------------------------------------------------
from .Formula import Formula, FormulaGroup, link_formulas
from .ast import link_variable
from .Formula import CONST_MATHORD
--------------------------------------------------------------------------------
/EduNLP/Formula/ast/__init__.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/5/20 @ tongshiwei

from .ast import str2ast, get_edges, ast, link_variable, katex_parse
--------------------------------------------------------------------------------
/EduNLP/Formula/ast/readme.md:
--------------------------------------------------------------------------------
katex version: 0.13.11
katex github: https://github.com/KaTeX/KaTeX
node types can be found in https://github.com/KaTeX/KaTeX/blob/master/src/parseNode.js
symbol types can be found in https://github.com/KaTeX/KaTeX/blob/master/src/symbols.js
--------------------------------------------------------------------------------
/EduNLP/Formula/viz/__init__.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/3/8 @ tongshiwei

import warnings
# warnings.warn("Do not use this package")
from .tree_viz import TreePlotter, ForestPlotter
--------------------------------------------------------------------------------
/EduNLP/Formula/viz/m_viz.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/3/8 @ tongshiwei

import matplotlib.pyplot as plt
from sklearn.tree._export import _MPLTreeExporter
from sklearn.tree._reingold_tilford import buchheim, Tree
from matplotlib.text import Annotation


class TreePlotter(_MPLTreeExporter):
    def recurse(self, node, ax, scale_x, scale_y, height, depth=0):
        kwargs = dict(bbox=self.bbox_args, ha='center', va='center',
                      zorder=100 - 10 * depth, xycoords='axes pixels')

        if self.fontsize is not None:
            kwargs['fontsize'] = self.fontsize

        # offset things by .5 to center them in plot
        xy = ((node.x + .5) * scale_x, height - (node.y + .5) * scale_y)

        if self.max_depth is None or depth <= self.max_depth:
            # if self.filled:
            #     kwargs['bbox']['fc'] = self.get_fill_color(tree,
            #                                                node.tree.node_id)
            if node.parent is None:
                # root
                ax.annotate(node.tree.label, xy, **kwargs)
            else:
                xy_parent = ((node.parent.x + .5) * scale_x,
                             height - (node.parent.y + .5) * scale_y)
                kwargs["arrowprops"] = self.arrow_args
                ax.annotate(node.tree.label, xy_parent, xy, **kwargs)
            for child in node.children:
                self.recurse(child, ax, scale_x, scale_y, height,
                             depth=depth + 1)

        else:
            xy_parent = ((node.parent.x + .5) * scale_x,
                         height - (node.parent.y + .5) * scale_y)
            kwargs["arrowprops"] = self.arrow_args
            kwargs['bbox']['fc'] = 'grey'
\n", xy_parent, xy, **kwargs) 43 | 44 | def _make_forest(self, ast): 45 | forest = [] 46 | for node in ast: 47 | if node["structure"]["father"] is None: 48 | return Tree() 49 | else: 50 | pass 51 | 52 | return Tree(name, node_id, *children) 53 | 54 | def export(self, formula_ast, ax=None): 55 | self.filled = False 56 | 57 | if ax is None: 58 | ax = plt.gca() 59 | ax.clear() 60 | ax.set_axis_off() 61 | # my_tree = self._make_tree(0, decision_tree.tree_, 62 | # decision_tree.criterion) 63 | my_tree = self._make_forest(formula_ast) 64 | draw_tree = buchheim(my_tree) 65 | 66 | # important to make sure we're still 67 | # inside the axis after drawing the box 68 | # this makes sense because the width of a box 69 | # is about the same as the distance between boxes 70 | max_x, max_y = draw_tree.max_extents() + 1 71 | ax_width = ax.get_window_extent().width 72 | ax_height = ax.get_window_extent().height 73 | 74 | scale_x = ax_width / max_x 75 | scale_y = ax_height / max_y 76 | 77 | self.recurse(draw_tree, ax, 78 | scale_x, scale_y, ax_height) 79 | 80 | anns = [ann for ann in ax.get_children() 81 | if isinstance(ann, Annotation)] 82 | 83 | # update sizes of all bboxes 84 | renderer = ax.figure.canvas.get_renderer() 85 | 86 | for ann in anns: 87 | ann.update_bbox_position_size(renderer) 88 | 89 | if self.fontsize is None: 90 | # get figure to data transform 91 | # adjust fontsize to avoid overlap 92 | # get max box width and height 93 | extents = [ann.get_bbox_patch().get_window_extent() 94 | for ann in anns] 95 | max_width = max([extent.width for extent in extents]) 96 | max_height = max([extent.height for extent in extents]) 97 | # width should be around scale_x in axis coordinates 98 | size = anns[0].get_fontsize() * min(scale_x / max_width, 99 | scale_y / max_height) 100 | for ann in anns: 101 | ann.set_fontsize(size) 102 | 103 | return anns 104 | -------------------------------------------------------------------------------- /EduNLP/I2V/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/1 @ tongshiwei 3 | 4 | from .i2v import I2V, get_pretrained_i2v 5 | from .i2v import D2V, W2V, Elmo, Bert, DisenQ, QuesNet 6 | -------------------------------------------------------------------------------- /EduNLP/ModelZoo/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * 2 | from .bert import * 3 | from .rnn import * 4 | from .disenqnet import * 5 | from .quesnet import * 6 | -------------------------------------------------------------------------------- /EduNLP/ModelZoo/base_model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import json 3 | import os 4 | from pathlib import Path 5 | import torch 6 | from transformers import PretrainedConfig 7 | # import logging 8 | from ..utils import logger 9 | 10 | 11 | class BaseModel(nn.Module): 12 | base_model_prefix = '' 13 | 14 | def __init__(self): 15 | super(BaseModel, self).__init__() 16 | self.config = PretrainedConfig() 17 | 18 | def forward(self, *input): 19 | raise NotImplementedError 20 | 21 | def save_pretrained(self, output_dir): 22 | if not os.path.exists(output_dir): 23 | os.makedirs(output_dir, exist_ok=True) 24 | model_path = os.path.join(output_dir, 'pytorch_model.bin') 25 | model_path = Path(model_path) 26 | torch.save(self.state_dict(), model_path.open('wb')) 27 | self.save_config(output_dir) 28 | 29 | @classmethod 30 | def 
    def from_pretrained(cls, pretrained_model_path, *args, **kwargs):
        config_path = os.path.join(pretrained_model_path, "config.json")
        model_path = os.path.join(pretrained_model_path, "pytorch_model.bin")
        model = cls.from_config(config_path, *args, **kwargs)
        loaded_state_dict = torch.load(model_path, map_location=torch.device('cpu'))
        loaded_keys = loaded_state_dict.keys()
        expected_keys = model.state_dict().keys()

        prefix = cls.base_model_prefix

        if set(loaded_keys) == set(expected_keys):
            # same architecture
            model.load_state_dict(loaded_state_dict)
        else:
            has_prefix_module = any(s.startswith(prefix) for s in loaded_keys)
            expects_prefix_module = any(s.startswith(prefix) for s in expected_keys)

            new_loaded_state_dict = {}
            if expects_prefix_module and not has_prefix_module:
                # add prefix
                for key in loaded_keys:
                    new_loaded_state_dict['.'.join([prefix, key])] = loaded_state_dict[key]
            if has_prefix_module and not expects_prefix_module:
                # remove prefix
                for key in loaded_keys:
                    if key.startswith(prefix):
                        new_loaded_state_dict['.'.join(key.split('.')[1:])] = loaded_state_dict[key]
            if has_prefix_module and expects_prefix_module:
                # both have prefix, only load the base encoder
                for key in loaded_keys:
                    if key.startswith(prefix):
                        new_loaded_state_dict[key] = loaded_state_dict[key]
            loaded_state_dict = new_loaded_state_dict
            model.load_state_dict(loaded_state_dict, strict=False)
            loaded_keys = loaded_state_dict.keys()
            missing_keys = set(expected_keys) - set(loaded_keys)
            if len(missing_keys) == 0:
                logger.info(
                    f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
                    f" {pretrained_model_path}.\nIf your task is similar to the task the model of the checkpoint"
                    f" was trained on, you can already use {model.__class__.__name__} for predictions without further"
                    " training."
                )
            elif len(missing_keys) > 0:
                logger.warning(
                    f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
                    f" {pretrained_model_path} and are newly initialized: {missing_keys}\nYou should probably"
                    " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
                )
        return model

    def save_config(self, config_dir):
        config_path = os.path.join(config_dir, "config.json")
        with open(config_path, "w", encoding="utf-8") as wf:
            json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2)

    @classmethod
    def from_config(cls, config_path, *args, **kwargs):
        raise NotImplementedError
--------------------------------------------------------------------------------
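The prefix-handling logic above lets a checkpoint saved from a bare encoder be loaded into a task head (and vice versa). A minimal sketch of the subclass contract — `TinyModel` and its config keys are hypothetical, not part of the repo:

```python
import json
import torch.nn as nn
from transformers import PretrainedConfig
from EduNLP.ModelZoo.base_model import BaseModel


class TinyModel(BaseModel):
    base_model_prefix = 'tiny'

    def __init__(self, in_dim=4, out_dim=2):
        super(TinyModel, self).__init__()
        self.linear = nn.Linear(in_dim, out_dim)
        # the config carries whatever from_config needs to rebuild the model
        self.config = PretrainedConfig(in_dim=in_dim, out_dim=out_dim)

    def forward(self, x):
        return self.linear(x)

    @classmethod
    def from_config(cls, config_path, *args, **kwargs):
        with open(config_path, encoding="utf-8") as f:
            cfg = json.load(f)
        return cls(in_dim=cfg["in_dim"], out_dim=cfg["out_dim"])


model = TinyModel()
model.save_pretrained("tmp_tiny_model")               # writes pytorch_model.bin + config.json
restored = TinyModel.from_pretrained("tmp_tiny_model")
```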
/EduNLP/ModelZoo/bert/__init__.py:
--------------------------------------------------------------------------------
from .bert import *
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/disenqnet/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from .disenqnet import *
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/disenqnet/utils.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import torch
from torch import nn
from torch.nn import functional as F


def get_mask(seq_len, lengths):
    device = lengths.device
    # batch_size
    batch_size = lengths.size(0)
    # seq_len
    pos_index = torch.arange(seq_len).to(device)
    # batch_size * seq_len
    mask = pos_index.unsqueeze(0).expand(batch_size, -1) >= lengths.unsqueeze(-1)
    return mask


def shuffle(real):
    # |0 1 2 3| => |1 2 3 0|
    device = real.device
    batch_size = real.size(0)
    shuffled_index = (torch.arange(batch_size) + 1) % batch_size
    shuffled_index = shuffled_index.to(device)
    shuffled = real.index_select(dim=0, index=shuffled_index)
    return shuffled


def spectral_norm(w, n_iteration=5):
    device = w.device
    # (o, i)
    # bias: (o,) -> (o, 1)
    if w.dim() == 1:
        w = w.unsqueeze(-1)
    out_dim, in_dim = w.size()
    # (i, o)
    wt = w.transpose(0, 1)
    # (1, i)
    u = torch.ones(1, in_dim).to(device)
    for _ in range(n_iteration):
        # (1, i) * (i, o) -> (1, o)
        v = torch.mm(u, wt)
        v = v / v.norm(p=2)
        # (1, o) * (o, i) -> (1, i)
        u = torch.mm(v, w)
        u = u / u.norm(p=2)
    # (1, i) * (i, o) * (o, 1) -> (1, 1)
    sn = torch.mm(torch.mm(u, wt), v.transpose(0, 1)).sum() ** 0.5
    return sn
--------------------------------------------------------------------------------
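A quick sketch exercising the helpers above (values illustrative): `get_mask` marks padding positions, and `spectral_norm` estimates a weight matrix's largest singular value by power iteration.

```python
import torch
from EduNLP.ModelZoo.disenqnet.utils import get_mask, spectral_norm

lengths = torch.tensor([2, 4])
mask = get_mask(seq_len=4, lengths=lengths)  # True where position >= length, i.e., padding
# tensor([[False, False,  True,  True],
#         [False, False, False, False]])

w = torch.randn(8, 16)
sn = spectral_norm(w)  # scalar tensor, roughly the largest singular value of w
```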
/EduNLP/ModelZoo/quesnet/__init__.py:
--------------------------------------------------------------------------------
from .quesnet import QuesNet, QuesNetForPreTraining
from .modules import AE, ImageAE, MetaAE
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/quesnet/modules.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn


class FeatureExtractor(nn.Module):
    def __init__(self, feat_size=512):
        super(FeatureExtractor, self).__init__()
        self.feat_size = feat_size

    def make_batch(self, data, device, pretrain=False):
        """Make batch from input data (python data / np arrays -> tensors)"""
        raise NotImplementedError

    def load_emb(self, emb):
        pass

    def forward(self, *input):
        raise NotImplementedError


class AE(nn.Module):
    factor = 1

    def enc(self, item, *args, **kwargs):
        return self.encoder(item, *args, **kwargs)

    def dec(self, item, *args, **kwargs):
        return self.decoder(item, *args, **kwargs)

    def loss(self, item, emb=None):
        if emb is None:
            emb = self(item)
            out = self.dec(emb)
        else:
            out = self.dec(emb)

        return self.recons_loss(out, item)

    def forward(self, item):
        return self.enc(item)


class ImageAE(AE):
    def __init__(self, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.recons_loss = nn.MSELoss()
        self._encoder = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=3),
            nn.ReLU(True),
            nn.MaxPool2d(2, stride=2),
            nn.Conv2d(16, 32, 3, stride=2),
            nn.ReLU(True),
            nn.MaxPool2d(2, stride=1),
            nn.Conv2d(32, emb_size, 3, stride=2)
        )
        self._decoder = nn.Sequential(
            nn.ConvTranspose2d(emb_size // self.factor, 32, 3, stride=2),
            nn.ReLU(True),
            nn.ConvTranspose2d(32, 16, 5, stride=3, padding=1),
            nn.ReLU(True),
            nn.ConvTranspose2d(16, 8, 5, stride=3),
            nn.ReLU(True),
            nn.ConvTranspose2d(8, 1, 2, stride=2, padding=1),
            nn.Sigmoid()
        )

    def encoder(self, item, detach_tensor=False):
        return self._encoder(item).detach().view(item.size(0), -1) if detach_tensor else self._encoder(item).view(
            item.size(0), -1)

    def decoder(self, emb, detach_tensor=False):
        return self._decoder(emb[:, :, None, None]).detach() if detach_tensor else self._decoder(emb[:, :, None, None])


class MetaAE(AE):
    def __init__(self, meta_size, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.meta_size = meta_size
        self.recons_loss = nn.BCEWithLogitsLoss()
        self.encoder = nn.Sequential(nn.Linear(meta_size, emb_size),
                                     nn.ReLU(True))
        # error: inplace
        # nn.Linear(emb_size, emb_size)
        self.decoder = nn.Sequential(nn.Linear(emb_size // self.factor,
                                               emb_size),
                                     nn.ReLU(True),
                                     nn.Linear(emb_size, meta_size))
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/quesnet/util.py:
--------------------------------------------------------------------------------
import torch
from torch.nn.utils.rnn import pack_padded_sequence


def argsort(seq):
    return sorted(range(len(seq)), key=seq.__getitem__)


class SeqBatch:
    def __init__(self, seqs, dtype=None, device=None):
        self.dtype = dtype
        self.device = device
        self.seqs = seqs

        if not seqs:
            self.lens = [0]
        else:
            self.lens = [len(x) for x in seqs]

        self.ind = argsort(self.lens)[::-1]
        self.inv = argsort(self.ind)
        self.lens.sort(reverse=True)
        self._prefix = [0]
        self._index = {}
        c = 0

        for i in range(self.lens[0]):
            for j in range(len(self.lens)):
                if self.lens[j] <= i:
                    break
                self._index[i, j] = c
                c += 1

    def packed(self):
        ind = torch.tensor(self.ind, dtype=torch.long, device=self.device)
        if not ind.numel() or ind.max() >= self.padded()[0].size(1):
            return None, None
        padded = self.padded()[0].index_select(1, ind)
        return pack_padded_sequence(padded, torch.tensor(self.lens))

    def padded(self, max_len=None, batch_first=False):
        if not self.seqs:
            return torch.empty((0, 0), dtype=self.dtype, device=self.device), \
                torch.empty((0, 0), dtype=torch.bool, device=self.device)

        seqs = [torch.tensor(s, dtype=self.dtype, device=self.device)
                if not isinstance(s, torch.Tensor) else s
                for s in self.seqs]
        if max_len is None:
            max_len = self.lens[0]
        seqs = [s[:max_len] for s in seqs]
        mask = [[1] * len(s) + [0] * (max_len - len(s)) for s in seqs]

        trailing_dims = seqs[0].size()[1:]
        if batch_first:
            out_dims = (len(seqs), max_len) + trailing_dims
        else:
            out_dims = (max_len, len(seqs)) + trailing_dims

        padded = seqs[0].new(*out_dims).fill_(0)
        for i, tensor in enumerate(seqs):
            length = tensor.size(0)
            # use index notation to prevent duplicate references to the tensor
            if batch_first:
                padded[i, :length, ...] = tensor
            else:
                padded[:length, i, ...] = tensor
        return padded, torch.tensor(mask).byte().to(self.device)

    def index(self, item):
        return self._index[item[0], self.inv[item[1]]]

    def invert(self, batch, dim=0):
        return batch.index_select(dim, torch.tensor(self.inv, device=self.device))
--------------------------------------------------------------------------------
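A small usage sketch of `SeqBatch` (shapes illustrative): it pads in the original order but packs in length-sorted order, and `invert` restores the original order.

```python
import torch
from EduNLP.ModelZoo.quesnet.util import SeqBatch

batch = SeqBatch([[1, 2, 3], [4, 5]], dtype=torch.long)
padded, mask = batch.padded(batch_first=True)  # padded and mask both have shape (2, 3)
packed = batch.packed()                        # PackedSequence, longest sequence first
```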
/EduNLP/ModelZoo/rnn/__init__.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/7/12 @ tongshiwei

from .rnn import *
from .harnn import *
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/utils/__init__.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/7/12 @ tongshiwei

from .padder import PadSequence, pad_sequence
from .device import set_device
from .masker import Masker
from .data import load_items
from .modules import MLP, TextCNN
from .torch_utils import *
from .downstream_output import *
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/utils/data.py:
--------------------------------------------------------------------------------
import json


def load_items(data_path):
    _data = []
    with open(data_path, encoding="utf-8") as f:
        for line in f.readlines():
            _data.append(json.loads(line))
    return _data
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/utils/device.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/8/2 @ tongshiwei
import logging
import torch
from torch.nn import DataParallel


def set_device(_net, ctx, *args, **kwargs):  # pragma: no cover
    """code from longling v1.3.26"""
    if ctx == "cpu":
        return _net.cpu()
    elif any(map(lambda x: x in ctx, ["cuda", "gpu"])):
        if not torch.cuda.is_available():
            try:
                torch.ones((1,), device=torch.device("cuda:0"))
            except AssertionError as e:
                raise TypeError("no cuda detected, only cpu is supported, the detailed error msg: %s" % str(e))
        if torch.cuda.device_count() >= 1:
            if ":" in ctx:
                ctx_name, device_ids = ctx.split(":")
                assert ctx_name in ["cuda", "gpu"], "the device should be 'cpu', 'cuda' or 'gpu', now is %s" % ctx
                device_ids = [int(i) for i in device_ids.strip().split(",")]
                try:
                    if not isinstance(_net, DataParallel):
                        return DataParallel(_net, device_ids).cuda()
                    return _net.cuda(device_ids)
                except AssertionError as e:
                    logging.error(device_ids)
                    raise e
            elif ctx in ["cuda", "gpu"]:
                if not isinstance(_net, DataParallel):
                    _net = DataParallel(_net)
                return _net.cuda()
            else:
                raise TypeError("the device should be 'cpu', 'cuda' or 'gpu', now is %s" % ctx)
        else:
            logging.error(torch.cuda.device_count())
            raise TypeError("no gpu can be used, use cpu")
    else:
        if not isinstance(_net, DataParallel):
            return DataParallel(_net, device_ids=ctx).cuda()
        return _net.cuda(ctx)
--------------------------------------------------------------------------------
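A hedged usage sketch for `set_device`, using the device strings the function parses above:

```python
import torch.nn as nn
from EduNLP.ModelZoo.utils import set_device

net = nn.Linear(4, 2)
net = set_device(net, "cpu")         # returns the module on CPU
# net = set_device(net, "cuda:0")    # single GPU: wrapped in DataParallel, moved to cuda
# net = set_device(net, "cuda:0,1")  # multiple GPUs
```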
/EduNLP/ModelZoo/utils/downstream_output.py:
--------------------------------------------------------------------------------
import torch
from transformers.modeling_outputs import ModelOutput


class PropertyPredictionOutput(ModelOutput):
    loss: torch.FloatTensor = None
    logits: torch.FloatTensor = None


class KnowledgePredictionOutput(ModelOutput):
    loss: torch.FloatTensor = None
    logits: torch.FloatTensor = None
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/utils/masker.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/8/3 @ tongshiwei

from copy import deepcopy
import numpy as np


class Masker(object):
    """

    Parameters
    ----------
    mask: int or str
        the token used to replace a masked position, e.g., 0 or "[MASK]"
    per: float
        the proportion of each sequence to mask
    seed:
        the random seed for the underlying numpy generator

    Examples
    ---------
    >>> masker = Masker(per=0.5, seed=10)
    >>> items = [[1, 1, 3, 4, 6], [2], [5, 9, 1, 4]]
    >>> masked_seq, mask_label = masker(items)
    >>> masked_seq
    [[1, 1, 0, 0, 6], [2], [0, 9, 0, 4]]
    >>> mask_label
    [[0, 0, 1, 1, 0], [0], [1, 0, 1, 0]]
    >>> items = [[1, 2, 3], [1, 1, 0], [2, 0, 0]]
    >>> masked_seq, mask_label = masker(items, [3, 2, 1])
    >>> masked_seq
    [[1, 0, 3], [0, 1, 0], [2, 0, 0]]
    >>> mask_label
    [[0, 1, 0], [1, 0, 0], [0, 0, 0]]
    >>> masker = Masker(mask="[MASK]", per=0.5, seed=10)
    >>> items = [["a", "b", "c"], ["d", "[PAD]", "[PAD]"], ["hello", "world", "[PAD]"]]
    >>> masked_seq, mask_label = masker(items, length=[3, 1, 2])
    >>> masked_seq
    [['a', '[MASK]', 'c'], ['d', '[PAD]', '[PAD]'], ['hello', '[MASK]', '[PAD]']]
    >>> mask_label
    [[0, 1, 0], [0, 0, 0], [0, 1, 0]]

    Returns
    ----------
    tuple
        the masked sequences and the corresponding 0/1 mask labels
    """
    def __init__(self, mask: (int, str, ...) = 0, per=0.2, seed=None):
        self.seed = np.random.default_rng(seed)
        self.per = per
        self.mask = mask

    def __call__(self, seqs, length=None, *args, **kwargs) -> tuple:
        seqs = deepcopy(seqs)
        masked_list = []
        if length is None:
            length = [len(seq) for seq in seqs]
        for seq, _length in zip(seqs, length):
            masked = self.seed.choice(len(seq) - 1, size=int(_length * self.per), replace=False)
            _masked_list = [0] * len(seq)
            for _masked in masked:
                seq[_masked] = self.mask
                _masked_list[_masked] = 1
            masked_list.append(_masked_list)
        return seqs, masked_list
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/utils/modules.py:
--------------------------------------------------------------------------------
import torch
from torch import nn
from torch.nn import functional as F


class MLP(nn.Module):
    def __init__(self, in_dim, n_classes, hidden_dim, dropout, n_layers=2, act=F.leaky_relu):
        super(MLP, self).__init__()
        self.l_in = nn.Linear(in_dim, hidden_dim)
        self.l_hs = nn.ModuleList(nn.Linear(hidden_dim, hidden_dim) for _ in range(n_layers - 2))
        self.l_out = nn.Linear(hidden_dim, n_classes)
        self.dropout = nn.Dropout(p=dropout)
        self.act = act

    def forward(self, input):
        hidden = self.act(self.l_in(self.dropout(input)))
        for l_h in self.l_hs:
            hidden = self.act(l_h(self.dropout(hidden)))
        output = self.l_out(self.dropout(hidden))
        return output


class TextCNN(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super(TextCNN, self).__init__()
        kernel_sizes = [2, 3, 4, 5]
        channel_dim = hidden_dim // len(kernel_sizes)
        self.min_seq_len = max(kernel_sizes)
        self.convs = nn.ModuleList([nn.Conv1d(embed_dim, channel_dim, k_size) for k_size in kernel_sizes])

    def forward(self, embed):
        if embed.size(1) < self.min_seq_len:
            device = embed.device
            pad = torch.zeros(embed.size(0), self.min_seq_len - embed.size(1), embed.size(-1)).to(device)
            embed = torch.cat((embed, pad), dim=1)
        # (b, s, d) => (b, d, s) => (b, d', s') => (b, d', 1) => (b, d')
        # batch_size * dim * seq_len
        hidden = [F.leaky_relu(conv(embed.transpose(1, 2))) for conv in self.convs]
        # batch_size * dim
        hidden = [F.max_pool1d(h, kernel_size=h.size(2)).squeeze(-1) for h in hidden]
        hidden = torch.cat(hidden, dim=-1)
        return hidden
--------------------------------------------------------------------------------
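Neither module above ships with a usage example; a minimal sketch (dimensions illustrative):

```python
import torch
from EduNLP.ModelZoo.utils import MLP, TextCNN

mlp = MLP(in_dim=16, n_classes=3, hidden_dim=32, dropout=0.1)
logits = mlp(torch.randn(8, 16))     # (batch, n_classes) -> (8, 3)

cnn = TextCNN(embed_dim=16, hidden_dim=32)
hidden = cnn(torch.randn(8, 2, 16))  # sequences shorter than the widest kernel are zero-padded; (8, 32)
```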
/EduNLP/ModelZoo/utils/padder.py:
--------------------------------------------------------------------------------
# coding: utf-8
# 2021/7/12 @ tongshiwei

__all__ = ["PadSequence", "pad_sequence"]


class PadSequence(object):
    """
    Pad the sequence.

    Pad the sequence to the given `length` by inserting `pad_val`. If `clip` is set,
    a sequence that is longer than `length` will be clipped.

    Parameters
    ----------
    length : int
        The maximum length to pad/clip the sequence
    pad_val : number
        The pad value. Default 0
    clip : bool
        Whether to clip a sequence that is longer than `length`

    Returns
    -------
    ret : list of number
        the padded (and possibly clipped) sequence
    """
    def __init__(self, length, pad_val=0, clip=True):
        self._length = length
        self._pad_val = pad_val
        self._clip = clip

    def __call__(self, sample: list):
        sample_length = len(sample)
        if sample_length >= self._length:
            if self._clip and sample_length > self._length:
                return sample[:self._length]
            else:
                return sample
        else:
            return sample + [
                self._pad_val for _ in range(self._length - sample_length)
            ]


def pad_sequence(sequence: list, max_length=None, pad_val=0, clip=True):
    """

    Parameters
    ----------
    sequence
    max_length
    pad_val
    clip

    Returns
    -------
    Modified list: list
        padding the sequences to the same size.

    Examples
    --------
    >>> seq = [[4, 3, 3], [2], [3, 3, 2]]
    >>> pad_sequence(seq)
    [[4, 3, 3], [2, 0, 0], [3, 3, 2]]
    >>> pad_sequence(seq, pad_val=1)
    [[4, 3, 3], [2, 1, 1], [3, 3, 2]]
    >>> pad_sequence(seq, max_length=2)
    [[4, 3], [2, 0], [3, 3]]
    >>> pad_sequence(seq, max_length=2, clip=False)
    [[4, 3, 3], [2, 0], [3, 3, 2]]
    """
    padder = PadSequence(max([len(seq) for seq in sequence]) if max_length is None else max_length, pad_val, clip)
    return [padder(seq) for seq in sequence]
--------------------------------------------------------------------------------
/EduNLP/ModelZoo/utils/torch_utils.py:
--------------------------------------------------------------------------------
import torch


def sequence_mask(lengths, max_len=None):
    """Same as tf.sequence_mask: returns a mask tensor representing the first N positions of each cell.

    Parameters
    ----------
    lengths : torch.Tensor
        integer tensor, all its values <= max_len.
    max_len : int, optional
        size of the last dimension of the returned tensor. Default is the maximum value in lengths.

    Returns
    -------
    torch.Tensor
        A boolean mask tensor of shape lengths.shape + (max_len,)

    Examples:
    ---------
    >>> sequence_mask(torch.tensor([1, 3, 2]), 5)
    tensor([[ True, False, False, False, False],
            [ True,  True,  True, False, False],
            [ True,  True, False, False, False]])
    >>> sequence_mask(torch.tensor([[1, 3],[2,0]]))
    tensor([[[ True, False, False],
             [ True,  True,  True]],
    <BLANKLINE>
            [[ True,  True, False],
             [False, False, False]]])
    """

    lengths_shape = lengths.shape  # torch.size() is a tuple
    lengths = lengths.reshape(-1)

    batch_size = lengths.numel()
    max_len = max_len or int(lengths.max())
    lengths_shape += (max_len,)

    return (torch.arange(0, max_len, device=lengths.device)
            .type_as(lengths)
            .unsqueeze(0).expand(batch_size, max_len)
            .lt(lengths.unsqueeze(1))).reshape(lengths_shape)


def gather_nd(params, indices):
    """Gather slices from params according to indices, like tf.gather_nd.

    Parameters
    ----------
    params : torch.Tensor
        the source tensor to gather from
    indices : torch.Tensor
        an integer tensor whose last dimension indexes into params

    Returns
    -------
    torch.Tensor
        the gathered tensor of shape indices.shape[:-1] + params.shape[indices.shape[-1]:]

    Examples:
    ---------
    >>> gather_nd(
    ...     params=torch.tensor([[1, 2, 3],
    ...                          [4, 5, 6]]),
    ...     indices=torch.tensor([[1],
    ...                           [0]]))
    tensor([[4, 5, 6],
            [1, 2, 3]])
    """
    newshape = indices.shape[:-1] + params.shape[indices.shape[-1]:]
    indices = indices.view(-1, indices.shape[-1]).tolist()
    out = torch.cat([params.__getitem__(tuple(i)) for i in indices])
    return out.reshape(newshape)
--------------------------------------------------------------------------------
/EduNLP/Pipeline/__init__.py:
--------------------------------------------------------------------------------
from .base import Pipeline, PreProcessingPipeline
from .mappings import TASK_MAPPING, TOKENIZER_MAPPING_NAMES
from .property_prediction import PropertyPredictionPipeline
from .knowledge_prediction import KnowledgePredictionPipeline
from ..Pretrain import PretrainedEduTokenizer
from ..ModelZoo.base_model import BaseModel
from ..Vector.t2v import get_pretrained_model_info
from ..constant import MODEL_DIR
from EduData import get_data
from typing import Optional, Union, List

__all__ = ["pipeline"]

SUPPORTED_TASKS = {
    "pre-process": {
        "impl": Pipeline,
        "default": None
    },
    "property-prediction": {
        "impl": PropertyPredictionPipeline,
        "default": "elmo_for_property_prediction_test_256"
    },
    "knowledge-prediction": {
        "impl": KnowledgePredictionPipeline,
        "default": "elmo_for_knowledge_prediction_test_256"
    }
}


def pipeline(
    task: str = None,
    model: Optional[Union[BaseModel, str]] = None,
    tokenizer: Optional[PretrainedEduTokenizer] = None,
    pipeline_class: Optional[Pipeline] = None,
    preprocess: Optional[List] = None,
    **kwargs
):
    """
    Parameters
    ----------
    task: str, required
        the task name, e.g., "pre-process", "property-prediction" or "knowledge-prediction"
    model: BaseModel or str, optional
        a model instance, or the name of a pretrained model
    tokenizer: PretrainedEduTokenizer, optional
        the tokenizer matching the model
    pipeline_class: Pipeline, optional
        to specify the Pipeline class
    preprocess: list, optional
        a list of names of pre-process pipes

    Examples
    ----------
    >>> processor = pipeline(task="property-prediction")  # doctest: +SKIP
    >>> item = "如图所示,则三角形ABC的面积是_。"
    >>> processor(item)  # doctest: +SKIP
    """
    if preprocess is None and task is None and model is None:
        raise RuntimeError("Please specify at least the model to use or the task to do!")
    elif model is None and tokenizer is not None:
        raise RuntimeError("Specifying a tokenizer without a model is not allowed!")
    elif task is None and model is not None:
        raise RuntimeError("Please specify the task.")
    elif task is None:
        task = "pre-process"

    if task == "pre-process":
        return PreProcessingPipeline(pipe_names=preprocess)

    if task in SUPPORTED_TASKS:
        targeted_task = SUPPORTED_TASKS[task]
    else:
        raise KeyError(f"Unknown task {task}")
    if pipeline_class is None:
        pipeline_class = targeted_task["impl"]
    if model is None or isinstance(model, str):
        # TODO: 1. waiting for ModelHub and TEST
        #       2. Check if the specified model and task are matched
        # pretrained_name = targeted_task["default"] if model is None else model
        # model_url, model_name, *args = get_pretrained_model_info(pretrained_name)
        # model_path = get_data(model_url, MODEL_DIR)
        # model = TASK_MAPPING[task][model_name].from_pretrained(model_path)
        # tokenizer = TOKENIZER_MAPPING_NAMES[model_name].from_pretrained(model_path)
        pass
    elif isinstance(model, BaseModel) and isinstance(tokenizer, PretrainedEduTokenizer):
        model, tokenizer = model, tokenizer
    elif model is not None and tokenizer is not None:
        raise KeyError(f"Unknown model and tokenizer: {model} and {tokenizer}")

    return pipeline_class(model=model, task=task, tokenizer=tokenizer, preproc_pipe_names=preprocess, **kwargs)
--------------------------------------------------------------------------------
/EduNLP/Pipeline/components.py:
--------------------------------------------------------------------------------
from ..utils import dict2str4sif
from ..SIF import is_sif, to_sif, sif4sci
from ..SIF.segment import seg, SegmentList
from ..Tokenizer import PureTextTokenizer
from ..SIF.tokenization.text import tokenize


class BasePipe:
    def __init__(self, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs

    def __call__(self, input_):
        raise NotImplementedError


class IsSifPipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(IsSifPipe, self).__init__(*args, **kwargs)

    def __call__(self, input_):
        print(is_sif(input_, *self.args, **self.kwargs))
        return input_


class ToSifPipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(ToSifPipe, self).__init__(*args, **kwargs)

    def __call__(self, input_):
        return to_sif(input_, *self.args, **self.kwargs)


class Dict2Str4SifPipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(Dict2Str4SifPipe, self).__init__(*args, **kwargs)

    def __call__(self, input_):
        return dict2str4sif(input_, *self.args, **self.kwargs)


class Sif4SciPipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(Sif4SciPipe, self).__init__(*args, **kwargs)

    def __call__(self, input_):
        return sif4sci(input_, *self.args, **self.kwargs)


class SegPipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(SegPipe, self).__init__(*args, **kwargs)

    def __call__(self, input_):
        return seg(input_, *self.args, **self.kwargs)


class SegDescribePipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(SegDescribePipe, self).__init__(*args, **kwargs)

    def __call__(self, input_: SegmentList):
        print(input_.describe())
        return input_


class SegFilterPipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(SegFilterPipe, self).__init__(*args, **kwargs)

    def __call__(self, input_: SegmentList):
        input_.filter(*self.args, **self.kwargs)
        return input_


class TokenizePipe(BasePipe):
    def __init__(self, *args, **kwargs):
        super(TokenizePipe, self).__init__(*args, **kwargs)

    def __call__(self, input_):
        return tokenize(input_, *self.args, **self.kwargs)


PREPROCESSING_PIPES = {
    'dict2str4sif': Dict2Str4SifPipe,
    'is_sif': IsSifPipe,
    'to_sif': ToSifPipe,
    'sif4sci': Sif4SciPipe,
    'seg': SegPipe,
    'seg_describe': SegDescribePipe,
    'seg_filter': SegFilterPipe,
    'tokenize': TokenizePipe,
}
--------------------------------------------------------------------------------
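A small illustrative sketch of the registry above — instantiating a pipe by name and applying it to a raw item (the item string is made up):

```python
from EduNLP.Pipeline.components import PREPROCESSING_PIPES

pipe = PREPROCESSING_PIPES["to_sif"]()  # equivalent to constructing ToSifPipe()
item = pipe("the raw item text")        # calls to_sif(...) under the hood
```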
'seg_describe': SegDescribePipe, 91 | 'seg_filter': SegFilterPipe, 92 | 'tokenize': TokenizePipe, 93 | } 94 | -------------------------------------------------------------------------------- /EduNLP/Pipeline/knowledge_prediction.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .base import Pipeline, GenericTensor 4 | from typing import Dict, Optional, Union 5 | from torch import sigmoid 6 | 7 | 8 | class KnowledgePredictionPipeline(Pipeline): 9 | def __init__(self, **kwargs): 10 | super(KnowledgePredictionPipeline, self).__init__(**kwargs) 11 | 12 | def _sanitize_parameters(self, **pipeline_parameters): 13 | tokenize_params, forward_params, postprocess_params = pipeline_parameters, {}, {} 14 | return tokenize_params, forward_params, postprocess_params 15 | 16 | def _tokenize(self, input_, **tokenize_parameters) -> Dict[str, GenericTensor]: 17 | return self.tokenizer(input_, **tokenize_parameters) 18 | 19 | def _forward(self, model_inputs, **forward_params): 20 | return self.model(**model_inputs) 21 | 22 | def postprocess(self, model_outputs, **postprocess_params): 23 | if 'num_classes_list' not in dir(self.model) or 'num_total_classes' not in dir(self.model): 24 | raise ValueError('model is not for knowledge prediction: ', self.model) 25 | outputs = model_outputs["logits"][0] 26 | start_idx = 0 27 | knowledge_list = [] 28 | for num_classes in self.model.num_classes_list: 29 | level_prediction = torch.argmax(outputs[start_idx:start_idx + num_classes]) + start_idx 30 | knowledge_list.append(level_prediction) 31 | start_idx += num_classes 32 | outputs = outputs.detach().numpy() 33 | dict_knowledge = { 34 | "knowledge_list": knowledge_list, 35 | "knowledge_scores": outputs.tolist(), 36 | } 37 | return dict_knowledge 38 | -------------------------------------------------------------------------------- /EduNLP/Pipeline/mappings.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from ..Pretrain import ElmoTokenizer, BertTokenizer, QuesNetTokenizer, DisenQTokenizer 3 | from ..ModelZoo.rnn import ElmoLMForPropertyPrediction, ElmoLMForKnowledgePrediction 4 | from ..ModelZoo.bert import BertForPropertyPrediction, BertForKnowledgePrediction 5 | 6 | TOKENIZER_MAPPING_NAMES = OrderedDict( 7 | [ 8 | ("elmo", ElmoTokenizer), 9 | ("bert", BertTokenizer), 10 | ("quesnet", QuesNetTokenizer), 11 | ("disenq", DisenQTokenizer) 12 | ] 13 | ) 14 | 15 | MODEL_FOR_PROPERTY_PREDICTION_MAPPING_NAMES = OrderedDict( 16 | [ 17 | ("elmo", ElmoLMForPropertyPrediction), 18 | ("bert", BertForPropertyPrediction), 19 | ] 20 | ) 21 | 22 | MODEL_FOR_KNOWLEDGE_PREDICTION_MAPPING_NAMES = OrderedDict( 23 | [ 24 | ("elmo", ElmoLMForKnowledgePrediction), 25 | ("bert", BertForKnowledgePrediction) 26 | ] 27 | ) 28 | 29 | TASK_MAPPING = { 30 | "property-prediction": MODEL_FOR_PROPERTY_PREDICTION_MAPPING_NAMES, 31 | "knowledge-prediction": MODEL_FOR_KNOWLEDGE_PREDICTION_MAPPING_NAMES 32 | } 33 | -------------------------------------------------------------------------------- /EduNLP/Pipeline/property_prediction.py: -------------------------------------------------------------------------------- 1 | from .base import Pipeline, GenericTensor 2 | from typing import Dict, Optional, Union 3 | 4 | 5 | class PropertyPredictionPipeline(Pipeline): 6 | def __init__(self, **kwargs): 7 | super(PropertyPredictionPipeline, self).__init__(**kwargs) 8 | 9 | def _sanitize_parameters(self, **pipeline_parameters): 
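        # route all user-supplied kwargs to the tokenize step; _forward and postprocess take no extra parameters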
10 | tokenize_params, forward_params, postprocess_params = pipeline_parameters, {}, {} 11 | return tokenize_params, forward_params, postprocess_params 12 | 13 | def _tokenize(self, input_, **tokenize_parameters) -> Dict[str, GenericTensor]: 14 | return self.tokenizer(input_, **tokenize_parameters) 15 | 16 | def _forward(self, model_inputs, **forward_params): 17 | return self.model(**model_inputs) 18 | 19 | def postprocess(self, model_outputs, **postprocess_params): 20 | outputs = model_outputs["logits"] 21 | outputs = outputs.detach().numpy() 22 | dict_property = {"property": outputs.item()} 23 | return dict_property 24 | -------------------------------------------------------------------------------- /EduNLP/Pretrain/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/29 @ tongshiwei 3 | 4 | from .gensim_vec import train_vector, GensimWordTokenizer, GensimSegTokenizer 5 | from .elmo_vec import * 6 | from .bert_vec import * 7 | from .quesnet_vec import QuesNetTokenizer, pretrain_quesnet, Question 8 | from .disenqnet_vec import * 9 | from .pretrian_utils import * 10 | from .hugginface_utils import * 11 | -------------------------------------------------------------------------------- /EduNLP/SIF/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/16 @ tongshiwei 3 | 4 | from .sif import is_sif, to_sif, sif4sci 5 | from .tokenization import link_formulas 6 | from .constants import * 7 | -------------------------------------------------------------------------------- /EduNLP/SIF/constants.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/18 @ tongshiwei 3 | 4 | TEXT_SYMBOL = "[TEXT]" 5 | FORMULA_SYMBOL = "[FORMULA]" 6 | FIGURE_SYMBOL = "[FIGURE]" 7 | QUES_MARK_SYMBOL = "[MARK]" 8 | TAG_SYMBOL = "[TAG]" 9 | SEP_SYMBOL = "[SEP]" 10 | TEXT_BEGIN = r"[TEXT_BEGIN]" 11 | TEXT_END = r"[TEXT_END]" 12 | FORMULA_BEGIN = r"[FORMULA_BEGIN]" 13 | FORMULA_END = r"[FORMULA_END]" 14 | 15 | EDU_SPYMBOLS = [ 16 | TEXT_SYMBOL, FORMULA_SYMBOL, FIGURE_SYMBOL, 17 | QUES_MARK_SYMBOL, TAG_SYMBOL, SEP_SYMBOL, 18 | TEXT_BEGIN, TEXT_END, 19 | FORMULA_BEGIN, FORMULA_END 20 | ] 21 | 22 | 23 | class Symbol(str): 24 | pass 25 | -------------------------------------------------------------------------------- /EduNLP/SIF/parser/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/6/02 @ fannazya 3 | 4 | from .parser import (Parser) 5 | -------------------------------------------------------------------------------- /EduNLP/SIF/segment/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/18 @ tongshiwei 3 | 4 | from .segment import (SegmentList, TextSegment, FigureFormulaSegment, LatexFormulaSegment, FigureSegment, 5 | QuesMarkSegment, Figure, TagSegment, SepSegment, seg) 6 | -------------------------------------------------------------------------------- /EduNLP/SIF/tokenization/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/18 @ tongshiwei 3 | 4 | from .tokenization import tokenize, link_formulas 5 | -------------------------------------------------------------------------------- /EduNLP/SIF/tokenization/formula/__init__.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/18 @ tongshiwei 3 | 4 | from .formula import tokenize 5 | from .ast_token import traversal_formula 6 | -------------------------------------------------------------------------------- /EduNLP/SIF/tokenization/formula/ast_token.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | import networkx as nx 4 | from EduNLP.Formula import Formula 5 | 6 | 7 | # def inorder_traversal(ast: nx.DiGraph): 8 | # visit = set() 9 | # nodes = [] 10 | # 11 | # def _inorder_traversal(_node): 12 | # if _node in visit: 13 | # return 14 | # successors = list(ast.successors(_node)) 15 | # if successors: 16 | # if len(successors) <= 2: 17 | # _inorder_traversal(successors[0]) 18 | # nodes.append(_node) 19 | # visit.add(_node) 20 | # if len(successors) == 2: 21 | # _inorder_traversal(successors[1]) 22 | # else: 23 | # nodes.append(_node) 24 | # for successor in successors: 25 | # if successor in visit: 26 | # continue 27 | # _inorder_traversal(successor) 28 | # else: 29 | # nodes.append(_node) 30 | # 31 | # for node in ast.nodes: 32 | # if node in visit or list(ast.predecessors(node)): 33 | # continue 34 | # _inorder_traversal(node) 35 | # return nodes 36 | 37 | def traversal_formula(ast, ord2token=False, var_numbering=False, strategy="post", *args, **kwargs): 38 | """ 39 | The part will run only when the return type is list. And it provides two strategy: post and linear. 40 | Besides, tokens list will append node follow its type. 41 | """ 42 | tokens = [] 43 | if strategy == "post": 44 | order = nx.dfs_postorder_nodes(ast) 45 | elif strategy == "linear": # pragma: no cover 46 | order = ast.nodes 47 | else: # pragma: no cover 48 | raise ValueError("Unknown traversal strategy: %s" % strategy) 49 | for i in order: 50 | node = ast.nodes[i] 51 | if node.get("type", "ignore") == "ignore": 52 | continue 53 | if ord2token is True and node["type"] in ["mathord", "textord", "text"]: 54 | if var_numbering is True and node["type"] == "mathord": 55 | tokens.append("%s_%s" % (node["type"], node.get("var", "con"))) 56 | else: 57 | tokens.append(node["type"]) 58 | else: 59 | tokens.append(node["text"]) 60 | return tokens 61 | 62 | 63 | def ast_tokenize(formula, ord2token=False, var_numbering=False, return_type="formula", *args, **kwargs): 64 | """ 65 | According to return type, tokenizing formula by different methods. 
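    Supported return_type values are "formula" (the default, a Formula object), "ast" (the AST graph)
    and "list" (a token sequence); ord2token and var_numbering only take effect when return_type is "list".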
66 | 67 | Parameters 68 | ---------- 69 | formula 70 | ord2token 71 | var_numbering 72 | return_type 73 | args 74 | kwargs 75 | 76 | Returns 77 | ------- 78 | 79 | Examples 80 | -------- 81 | >>> ast_tokenize(r"{x + y}^\\frac{\\pi}{2} + 1 = x", return_type="list") 82 | ['x', '+', 'y', '{ }', '\\\\pi', '{ }', '2', '{ }', '\\\\frac', '\\\\supsub', '+', '1', '=', 'x'] 83 | >>> ast_tokenize(r"{x + y}^\\frac{\\pi}{2} + 1 = x", return_type="list", ord2token=True) 84 | ['mathord', '+', 'mathord', '{ }', 'mathord', '{ }', 'textord', '{ }', '\\\\frac', '\\\\supsub', '+', 'textord', \ 85 | '=', 'mathord'] 86 | >>> ast_tokenize(r"{x + y}^\\frac{\\pi}{2} + 1 = x", return_type="list", ord2token=True, var_numbering=True) 87 | ['mathord_0', '+', 'mathord_1', '{ }', 'mathord_con', '{ }', 'textord', '{ }', '\\\\frac', '\\\\supsub', \ 88 | '+', 'textord', '=', 'mathord_0'] 89 | >>> len(ast_tokenize(r"{x + y}^\\frac{\\pi}{2} + 1 = x", return_type="ast").nodes) 90 | 14 91 | >>> ast_tokenize(r"{x + y}^\\frac{\\pi}{2} + 1 = x") 92 | 93 | """ 94 | if return_type == "list": 95 | ast = Formula(formula, variable_standardization=True).ast_graph 96 | return traversal_formula(ast, ord2token=ord2token, var_numbering=var_numbering) 97 | elif return_type == "formula": 98 | return Formula(formula) 99 | elif return_type == "ast": 100 | return Formula(formula).ast_graph 101 | else: 102 | raise ValueError() 103 | 104 | 105 | if __name__ == '__main__': 106 | print(ast_tokenize(r"{x + y}^\frac{\pi}{2} + 1 = x", return_type="list", ord2token=True, var_numbering=True)) 107 | -------------------------------------------------------------------------------- /EduNLP/SIF/tokenization/formula/formula.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/18 @ tongshiwei 3 | 4 | import warnings 5 | 6 | from .linear_token import linear_tokenize 7 | from .ast_token import ast_tokenize 8 | 9 | 10 | def tokenize(formula, method="linear", errors="raise", **kwargs): 11 | """ 12 | The total function to tokenize formula by linear or ast. 
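    Any extra keyword arguments are forwarded unchanged to linear_tokenize or ast_tokenize.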
13 | 
14 |     Parameters
15 |     ----------
16 |     formula: str, the latex formula to tokenize
17 |     method: str, "linear" (default) or "ast"
18 |     errors: how to handle an exception raised during ast tokenization
19 |         "coerce": fall back to linear_tokenize
20 |         "raise": raise the exception
21 |     kwargs
22 | 
23 |     Returns
24 |     -------
25 | 
26 |     Examples
27 |     --------
28 |     >>> tokenize(r"\\frac{\\pi}{x + y} + 1 = x")
29 |     ['\\\\frac', '{', '\\\\pi', '}', '{', 'x', '+', 'y', '}', '+', '1', '=', 'x']
30 |     >>> tokenize(r"\\frac{\\pi}{x + y} + 1 = x", method="ast", ord2token=True)
31 | 
32 |     >>> tokenize(r"\\frac{\\pi}{x + y} + 1 = x", method="ast", ord2token=True, return_type="list")
33 |     ['mathord', '{ }', 'mathord', '+', 'mathord', '{ }', '\\\\frac', '+', 'textord', '=', 'mathord']
34 |     """
35 |     if method == "linear":
36 |         return linear_tokenize(formula, **kwargs)
37 |     elif method == "ast":
38 |         try:
39 |             return ast_tokenize(formula, **kwargs)
40 |         except TypeError as e:  # pragma: no cover
41 |             if errors == "coerce":
42 |                 warnings.warn("A type error is detected, linear tokenize is applied")
43 |                 return linear_tokenize(formula)
44 |             else:
45 |                 raise e
46 |     else:
47 |         raise TypeError("Unknown method type: %s" % method)
48 | 
--------------------------------------------------------------------------------
/EduNLP/SIF/tokenization/text/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | # 2021/5/18 @ tongshiwei
 3 | from .tokenization import tokenize
 4 | 
--------------------------------------------------------------------------------
/EduNLP/SIF/tokenization/text/stopwords.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | # 2021/5/18 @ tongshiwei
 3 | 
 4 | import os
 5 | from EduNLP.utils import abs_current_dir, path_append
 6 | 
 7 | DEFAULT_FILEPATH = os.path.abspath(
 8 |     path_append(abs_current_dir(__file__), "..", "..", "..", "meta_data", "sif_stopwords.txt")
 9 | )
10 | 
11 | 
12 | def get_stopwords(filepath=DEFAULT_FILEPATH):
13 |     _stopwords = set()
14 |     with open(filepath, encoding="utf-8") as f:
15 |         for line in f:
16 |             _stopwords.add(line.strip())
17 | 
18 |     return _stopwords
19 | 
20 | 
21 | DEFAULT_STOPWORDS = get_stopwords()
22 | 
--------------------------------------------------------------------------------
/EduNLP/SIF/tokenization/text/tokenization.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | # 2021/5/18 @ tongshiwei
 3 | import logging
 4 | import jieba
 5 | from .stopwords import DEFAULT_STOPWORDS
 6 | 
 7 | jieba.setLogLevel(logging.INFO)
 8 | 
 9 | 
10 | def is_chinese(word):
11 |     """Check whether a char or string consists entirely of Chinese characters."""
12 |     for char in word:
13 |         if char < u'\u4e00' or char > u'\u9fa5':
14 |             return False
15 |     return True
16 | 
17 | 
18 | def tokenize(text, granularity="word", stopwords="default"):
19 |     """
20 |     Using the jieba library to tokenize an item by word or char.
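    Tokens appearing in the stopword set are removed in both modes; pass stopwords=None to keep every token.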
21 | 
22 |     Parameters
23 |     ----------
24 |     text: str
25 |     granularity: str, "word" or "char"
26 |     stopwords: str, None or set
27 | 
28 |     Returns
29 |     -------
30 | 
31 |     Examples
32 |     --------
33 |     >>> tokenize("三角函数是基本初等函数之一")
34 |     ['三角函数', '初等', '函数']
35 |     >>> tokenize("三角函数是基本初等函数之一", granularity="char")
36 |     ['三', '角', '函', '数', '初', '等', '函', '数']
37 |     """
38 |     stopwords = DEFAULT_STOPWORDS if stopwords == "default" else stopwords
39 |     stopwords = stopwords if stopwords is not None else {}
40 |     if granularity == "word":
41 |         return [token for token in jieba.cut(text) if token not in stopwords and token.strip()]
42 |     elif granularity == "char":
43 |         jieba_tokens = [token for token in jieba.cut(text) if token not in stopwords and token.strip()]
44 |         # Use jieba_tokens to handle sentences that mix Chinese and English.
45 |         split_tokens = []
46 |         for token in jieba_tokens:
47 |             if is_chinese(token):
48 |                 split_tokens.extend(list(token))
49 |             else:
50 |                 split_tokens.append(token)
51 |         return split_tokens
52 |     else:
53 |         raise TypeError("Unknown granularity %s" % granularity)
54 | 
--------------------------------------------------------------------------------
/EduNLP/Tokenizer/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | # 2021/8/1 @ tongshiwei
 3 | 
 4 | from .tokenizer import *
 5 | 
--------------------------------------------------------------------------------
/EduNLP/Vector/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | # 2021/5/29 @ tongshiwei
 3 | 
 4 | from .gensim_vec import W2V, D2V, BowLoader, TfidfLoader
 5 | from .const import *
 6 | from .rnn import RNNModel
 7 | from .t2v import T2V, get_pretrained_t2v, get_pretrained_model_info, get_all_pretrained_models
 8 | from .embedding import Embedding
 9 | from .bert_vec import BertModel
10 | from .quesnet import QuesNetModel
11 | from .disenqnet import DisenQModel
12 | from .elmo_vec import ElmoModel
13 | 
--------------------------------------------------------------------------------
/EduNLP/Vector/bert_vec.py:
--------------------------------------------------------------------------------
 1 | # from transformers import BertModel as HFBertModel
 2 | from transformers import AutoModel
 3 | from .meta import Vector
 4 | import torch
 5 | 
 6 | 
 7 | class BertModel(Vector):
 8 |     """
 9 |     Examples
10 |     --------
11 |     >>> from EduNLP.Pretrain import BertTokenizer
12 |     >>> tokenizer = BertTokenizer("bert-base-chinese", add_special_tokens=False)
13 |     >>> model = BertModel("bert-base-chinese")
14 |     >>> item = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束",
15 |     ... 
"有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束"] 16 | >>> inputs = tokenizer(item, return_tensors='pt') 17 | >>> output = model(inputs) 18 | >>> output.shape 19 | torch.Size([2, 14, 768]) 20 | >>> tokens = model.infer_tokens(inputs) 21 | >>> tokens.shape 22 | torch.Size([2, 12, 768]) 23 | >>> tokens = model.infer_tokens(inputs, return_special_tokens=True) 24 | >>> tokens.shape 25 | torch.Size([2, 14, 768]) 26 | >>> item = model.infer_vector(inputs) 27 | >>> item.shape 28 | torch.Size([2, 768]) 29 | """ 30 | 31 | def __init__(self, pretrained_dir, device="cpu"): 32 | self.device = device 33 | self.model = AutoModel.from_pretrained(pretrained_dir).to(self.device) 34 | self.model.eval() 35 | 36 | def __call__(self, items: dict): 37 | self.cuda_tensor(items) 38 | tokens = self.model(**items).last_hidden_state 39 | return tokens 40 | 41 | def infer_vector(self, items: dict, pooling_strategy='CLS', **kwargs) -> torch.Tensor: 42 | vector = self(items) 43 | if pooling_strategy == 'CLS': 44 | return vector[:, 0, :] 45 | elif pooling_strategy == 'average': 46 | # the average of word embedding of the last layer 47 | # batch_size, sent_len, embedding_dim 48 | mask = items['attention_mask'].unsqueeze(-1).expand(vector.size()) 49 | mul_mask = vector * mask 50 | # batch_size, embedding_dim 51 | return mul_mask.sum(1) / (mask.sum(1) + 1e-10) 52 | 53 | def infer_tokens(self, items: dict, return_special_tokens=False, **kwargs) -> torch.Tensor: 54 | tokens = self(items) 55 | if return_special_tokens: 56 | # include embedding of [CLS] and [SEP] 57 | return tokens 58 | else: 59 | # ignore embedding of [CLS] and [SEP] 60 | return tokens[:, 1:-1, :] 61 | 62 | @property 63 | def vector_size(self): 64 | return self.model.config.hidden_size 65 | -------------------------------------------------------------------------------- /EduNLP/Vector/const.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/7/12 @ tongshiwei 3 | 4 | UNK = "[UNK]" 5 | PAD = "[PAD]" 6 | -------------------------------------------------------------------------------- /EduNLP/Vector/disenqnet/__init__.py: -------------------------------------------------------------------------------- 1 | from .disenqnet import DisenQModel 2 | -------------------------------------------------------------------------------- /EduNLP/Vector/disenqnet/disenqnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from EduNLP.ModelZoo.disenqnet.disenqnet import DisenQNet 3 | from EduNLP.Vector.meta import Vector 4 | 5 | 6 | class DisenQModel(Vector): 7 | def __init__(self, pretrained_dir, device="cpu"): 8 | """ 9 | Parameters 10 | ---------- 11 | pretrained_dir: str 12 | the dirname to pretrained model 13 | device: str 14 | cpu or cuda, default is cpu 15 | """ 16 | self.device = device 17 | self.model = DisenQNet.from_pretrained(pretrained_dir).to(self.device) 18 | self.model.eval() 19 | 20 | def __call__(self, items: dict): 21 | self.cuda_tensor(items) 22 | outputs = self.model(**items) 23 | return outputs.embeded, outputs.k_hidden, outputs.i_hidden 24 | 25 | def infer_vector(self, items: dict, vector_type=None, **kwargs) -> torch.Tensor: 26 | """ 27 | Parameters 28 | ---------- 29 | vector_type: str 30 | choose the type of items tensor to return. 
31 |             Default is None, which means return both (k_hidden, i_hidden);
32 |             when vector_type="k", return k_hidden;
33 |             when vector_type="i", return i_hidden.
34 |         """
35 |         _, k_hidden, i_hidden = self(items)
36 |         if vector_type is None:
37 |             return k_hidden, i_hidden
38 |         elif vector_type == "k":
39 |             return k_hidden
40 |         elif vector_type == "i":
41 |             return i_hidden
42 |         else:
43 |             raise KeyError("vector_type must be one of (None, 'k', 'i')")
44 | 
45 |     def infer_tokens(self, items: dict, **kwargs) -> torch.Tensor:
46 |         """
47 |         get tokens embedding with DisenQModel
48 |         Parameters
49 |         ----------
50 |         items: dict
51 |             {'content_idx': tensor(),'content_len': tensor()}, the tokens about question after tokenizer processing
52 | 
53 |         Returns:
54 |             torch.Tensor: token embedding
55 |         """
56 |         embeded, _, _ = self(items)
57 |         return embeded
58 | 
59 |     @property
60 |     def vector_size(self):
61 |         return self.model.hidden_size
62 | 
--------------------------------------------------------------------------------
/EduNLP/Vector/elmo_vec.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from EduNLP.ModelZoo.rnn import ElmoLM
 3 | from .meta import Vector
 4 | 
 5 | 
 6 | class ElmoModel(Vector):
 7 |     def __init__(self, pretrained_dir: str, device="cpu"):
 8 |         """
 9 |         Parameters
10 |         ----------
11 |         pretrained_dir: str
12 |         """
13 |         super(ElmoModel, self).__init__()
14 |         self.device = device
15 |         self.model = ElmoLM.from_pretrained(pretrained_dir).to(device)
16 |         self.model.eval()
17 | 
18 |     def __call__(self, items: dict):
19 |         self.cuda_tensor(items)
20 |         outputs = self.model(**items)
21 |         return outputs
22 | 
23 |     def infer_vector(self, items: dict, **kwargs) -> torch.Tensor:
24 |         """
25 |         get sentence vector embedding with ElmoModel
26 |         Parameters
27 |         ----------
28 |         items: dict, {'seq_idx': tensor(),'seq_len':tensor()}, the tokens about question after tokenizer processing
29 | 
30 |         Returns:
31 |             torch.Tensor: sentence embedding
32 |         """
33 |         outputs = self(items)
34 |         item_embeds = torch.cat(
35 |             (outputs.forward_output[torch.arange(len(items["seq_len"])), torch.tensor(items["seq_len"]) - 1],
36 |              outputs.backward_output[torch.arange(len(items["seq_len"])), 0]),
37 |             dim=-1)
38 |         return item_embeds
39 | 
40 |     def infer_tokens(self, items, **kwargs) -> torch.Tensor:
41 |         """
42 |         get tokens embedding with ElmoModel
43 |         Parameters
44 |         ----------
45 |         items: dict, {'seq_idx': tensor()}, the tokens about question after tokenizer processing
46 | 
47 |         Returns:
48 |             torch.Tensor: token embedding
49 |         """
50 |         outputs = self(items)
51 |         forward_hiddens = outputs.forward_output
52 |         backward_hiddens = outputs.backward_output
53 |         return torch.cat((forward_hiddens, backward_hiddens), dim=-1)
54 | 
55 |     @property
56 |     def vector_size(self):
57 |         return 2 * self.model.hidden_size
58 | 
--------------------------------------------------------------------------------
/EduNLP/Vector/embedding.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | # 2021/7/12 @ tongshiwei
 3 | 
 4 | from typing import List
 5 | import torch
 6 | from .gensim_vec import W2V
 7 | from .const import PAD
 8 | from EduNLP.ModelZoo import pad_sequence, set_device
 9 | 
10 | 
11 | class Embedding(object):
12 |     def __init__(self, w2v: (W2V, tuple, list, dict, None), freeze=True, device=None, **kwargs):
13 |         if w2v is None:
14 |             self.w2v = None
15 |         elif isinstance(w2v, (tuple, list)):
16 |             self.w2v = W2V(*w2v)
17 |         elif isinstance(w2v, dict):
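            # unpack the dict entries as keyword arguments to W2V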
18 | self.w2v = W2V(**w2v) 19 | elif isinstance(w2v, W2V): 20 | self.w2v = w2v 21 | else: 22 | raise TypeError("w2v argument must be one of W2V, tuple, list, dict or None, now is %s" % type(w2v)) 23 | 24 | if self.w2v is not None: 25 | self.vocab_size = len(self.w2v) 26 | self.embedding_dim = self.w2v.vector_size 27 | else: 28 | self.vocab_size = kwargs["vocab_size"] 29 | self.embedding_dim = kwargs["embedding_dim"] 30 | 31 | self.embedding = torch.nn.Embedding(self.vocab_size, self.embedding_dim) 32 | 33 | self.pad_val = 0 34 | if self.w2v is not None: 35 | self.embedding.from_pretrained(torch.Tensor(self.w2v.vectors), freeze) 36 | self.pad_val = self.w2v.constants[PAD] 37 | self.key_to_index = self.w2v.key_to_index if w2v is not None else lambda x: x 38 | 39 | if device is not None: 40 | self.set_device(device) 41 | 42 | def __call__(self, items: List[List[str]], indexing=True, padding=True, vectorization=True, *args, 43 | **kwargs) -> tuple: 44 | 45 | items, item_len = self.indexing(items, padding=padding, indexing=indexing) 46 | items = self.infer_token_vector(items, indexing=False)[0] if vectorization else items 47 | return items, item_len 48 | 49 | def infer_token_vector(self, items: List[List[str]], indexing=True) -> tuple: 50 | items, item_len = self.indexing(items, padding=True, indexing=indexing) 51 | item_embedding = self.embedding(torch.LongTensor(items)) 52 | return item_embedding, item_len 53 | 54 | def indexing(self, items: List[List[str]], padding=False, indexing=True) -> tuple: 55 | """ 56 | 57 | Parameters 58 | ---------- 59 | items: list of list of str(word/token) 60 | padding: bool 61 | whether padding the returned list with default pad_val to make all item in items have the same length 62 | indexing: bool 63 | 64 | Returns 65 | ------- 66 | token_idx: list of list of int 67 | the list of the tokens of each item 68 | token_len: list of int 69 | the list of the length of tokens of each item 70 | """ 71 | items_idx = [[self.key_to_index(word) for word in item] for item in items] if indexing else items 72 | item_len = [len(_idx) for _idx in items_idx] 73 | padded_items_idx = pad_sequence(items_idx, pad_val=self.pad_val) if padding is True else items_idx 74 | return padded_items_idx, item_len 75 | 76 | def set_device(self, device): 77 | self.embedding = set_device(self.embedding, device) 78 | return self 79 | -------------------------------------------------------------------------------- /EduNLP/Vector/meta.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/7/13 @ tongshiwei 3 | import torch 4 | 5 | 6 | class Vector(object): 7 | def infer_vector(self, items, *args, **kwargs) -> ...: 8 | pass 9 | 10 | def infer_tokens(self, items, *args, **kwargs) -> ...: 11 | pass 12 | 13 | @property 14 | def vector_size(self): 15 | raise NotImplementedError 16 | 17 | @property 18 | def is_frozen(self): # pragma: no cover 19 | return True 20 | 21 | def freeze(self, *args, **kwargs): # pragma: no cover 22 | pass 23 | 24 | def cuda_tensor(self, items: dict): 25 | for k, v in items.items(): 26 | if isinstance(v, torch.Tensor): 27 | items[k] = v.to(self.device) 28 | -------------------------------------------------------------------------------- /EduNLP/Vector/quesnet/__init__.py: -------------------------------------------------------------------------------- 1 | from .quesnet import QuesNetModel 2 | -------------------------------------------------------------------------------- /EduNLP/Vector/quesnet/quesnet.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from typing import Union 3 | from EduNLP.ModelZoo.quesnet import QuesNet 4 | from EduNLP.Pretrain import Question, QuesNetTokenizer 5 | from EduNLP.Vector.meta import Vector 6 | 7 | 8 | class QuesNetModel(Vector): 9 | def __init__(self, pretrained_dir, device="cpu", **kwargs): 10 | """ 11 | Parameters 12 | ---------- 13 | pretrained_dir: str 14 | the dirname to pretrained model 15 | device: str 16 | cpu or cuda, default is cpu 17 | img_dir: str 18 | image dir 19 | """ 20 | self.device = torch.device(device) 21 | self.model = QuesNet.from_pretrained(pretrained_dir).to(self.device) 22 | self.model.eval() 23 | 24 | def __call__(self, items: dict): 25 | """ get question embedding with quesnet 26 | 27 | Parameters 28 | ---------- 29 | items: 30 | encodes from tokenizer 31 | """ 32 | qs = [Question("", items['seq_idx'][i], 33 | [0], [[0], [0], [0]], items['meta_idx'][i]) for i in range(len(items['seq_idx']))] 34 | outputs = self.model(self.model.make_batch(qs, device=self.device)) 35 | return outputs.hidden, outputs.embeded 36 | 37 | def infer_vector(self, items: Union[dict, list], **kwargs) -> torch.Tensor: 38 | """ get question embedding with quesnet 39 | 40 | Parameters 41 | ---------- 42 | items: 43 | encodes from tokenizer 44 | """ 45 | return self(items)[0] 46 | 47 | def infer_tokens(self, items: Union[dict, list], **kwargs) -> torch.Tensor: 48 | """ get token embeddings with quesnet 49 | 50 | Parameters 51 | ---------- 52 | items: 53 | encodes from tokenizer 54 | Returns 55 | ------- 56 | torch.Tensor 57 | word_embs + meta_emb 58 | """ 59 | vector = self(items)[1] 60 | """ Please note that output vector is like 0 0 seq_idx(text with image) 0 meta_idx 0 0""" 61 | return vector[:, 2:-2, :] 62 | 63 | @property 64 | def vector_size(self): 65 | return self.model.feat_size 66 | -------------------------------------------------------------------------------- /EduNLP/Vector/rnn/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/7/12 @ tongshiwei 3 | 4 | from .rnn import RNNModel 5 | -------------------------------------------------------------------------------- /EduNLP/Vector/rnn/rnn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/7/12 @ tongshiwei 3 | 4 | import torch 5 | from ..gensim_vec import W2V 6 | from ..embedding import Embedding 7 | from ..meta import Vector 8 | from EduNLP.ModelZoo import rnn, set_device 9 | from baize.torch import save_params 10 | 11 | 12 | class RNNModel(Vector): 13 | """ 14 | Examples 15 | -------- 16 | >>> model = RNNModel("BiLSTM", None, 2, vocab_size=4, embedding_dim=3) 17 | >>> seq_idx = [[1, 2, 3], [1, 2, 0], [3, 0, 0]] 18 | >>> output, hn = model(seq_idx, indexing=False, padding=False) 19 | >>> seq_idx = [[1, 2, 3], [1, 2], [3]] 20 | >>> output, hn = model(seq_idx, indexing=False, padding=True) 21 | >>> output.shape 22 | torch.Size([3, 3, 4]) 23 | >>> hn.shape 24 | torch.Size([2, 3, 2]) 25 | >>> tokens = model.infer_tokens(seq_idx, indexing=False) 26 | >>> tokens.shape 27 | torch.Size([3, 3, 4]) 28 | >>> tokens = model.infer_tokens(seq_idx, agg="mean", indexing=False) 29 | >>> tokens.shape 30 | torch.Size([3, 4]) 31 | >>> item = model.infer_vector(seq_idx, indexing=False) 32 | >>> item.shape 33 | torch.Size([3, 4]) 34 | >>> item = model.infer_vector(seq_idx, agg="mean", indexing=False) 35 | >>> item.shape 36 | 
torch.Size([3, 2]) 37 | >>> item = model.infer_vector(seq_idx, agg=None, indexing=False) 38 | >>> item.shape 39 | torch.Size([2, 3, 2]) 40 | """ 41 | 42 | def __init__(self, rnn_type, w2v: (W2V, tuple, list, dict, None), hidden_size, 43 | freeze_pretrained=True, model_params=None, device=None, 44 | **kwargs): 45 | self.embedding = Embedding(w2v, freeze_pretrained, **kwargs) 46 | for key in ["vocab_size", "embedding_dim"]: 47 | if key in kwargs: 48 | kwargs.pop(key) 49 | self.rnn = rnn.LM( 50 | rnn_type, 51 | self.embedding.vocab_size, 52 | self.embedding.embedding_dim, 53 | hidden_size=hidden_size, 54 | embedding=self.embedding.embedding, 55 | model_params=model_params, 56 | **kwargs 57 | ) 58 | self.bidirectional = self.rnn.rnn.bidirectional 59 | self.hidden_size = self.rnn.hidden_size 60 | self.freeze_pretrained = freeze_pretrained 61 | if device is not None: 62 | self.set_device(device) 63 | 64 | def __call__(self, items, indexing=True, padding=True, **kwargs): 65 | seq_idx, seq_len = self.embedding(items, indexing=indexing, padding=padding, vectorization=False) 66 | 67 | tokens, item = self.rnn(torch.LongTensor(seq_idx), torch.LongTensor(seq_len)) 68 | 69 | return tokens, item 70 | 71 | def infer_vector(self, items, agg: (int, str, None) = -1, indexing=True, padding=True, *args, 72 | **kwargs) -> torch.Tensor: 73 | vector = self(items, indexing=indexing, padding=padding, **kwargs)[1] 74 | if agg is not None: 75 | if agg == -1: 76 | return torch.reshape(vector, (vector.shape[1], -1)) 77 | return eval("torch.%s" % agg)(vector, dim=0) 78 | return vector 79 | 80 | def infer_tokens(self, items, agg=None, *args, **kwargs) -> torch.Tensor: 81 | tokens = self(items, **kwargs)[0] 82 | if agg is not None: 83 | return eval("torch.%s" % agg)(tokens, dim=1) 84 | return tokens 85 | 86 | @property 87 | def vector_size(self) -> int: 88 | return self.hidden_size * (1 if self.bidirectional is False else 2) 89 | 90 | def set_device(self, device): 91 | self.rnn = set_device(self.rnn, device) 92 | 93 | def save(self, filepath, save_embedding=False): 94 | save_params(filepath, self.rnn, select=None if save_embedding is True else '^(?!.*embedding)') 95 | return filepath 96 | 97 | def freeze(self, *args, **kwargs): 98 | return self.eval() 99 | 100 | @property 101 | def is_frozen(self): 102 | return not self.rnn.training 103 | 104 | def eval(self): 105 | self.rnn.eval() 106 | return self 107 | 108 | def train(self, mode=True): 109 | self.rnn.train(mode) 110 | return self 111 | -------------------------------------------------------------------------------- /EduNLP/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import logger 2 | from .I2V import get_pretrained_i2v 3 | -------------------------------------------------------------------------------- /EduNLP/constant.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/1 @ tongshiwei 3 | 4 | import os 5 | from os.path import expanduser, join 6 | 7 | ROOT = os.environ.get("EDUNLPPATH", join(expanduser("~"), ".EduNLP")) 8 | MODEL_DIR = os.environ.get("EDUNLPMODELPATH", join(ROOT, "model")) 9 | -------------------------------------------------------------------------------- /EduNLP/main.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/2 @ tongshiwei 3 | 4 | import fire 5 | 6 | 7 | from EduNLP.Vector.t2v import get_all_pretrained_models 8 | 9 | 10 | def list_i2v(): 11 | 
print("\n".join(get_all_pretrained_models())) 12 | 13 | 14 | def cli(): # pragma: no cover 15 | fire.Fire({"i2v": list_i2v}) 16 | -------------------------------------------------------------------------------- /EduNLP/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | 4 | from .path import abs_current_dir, path_append 5 | from .image import image2base64 6 | from .log import logger 7 | from .data import dict2str4sif 8 | -------------------------------------------------------------------------------- /EduNLP/utils/data.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/30 @ tongshiwei 3 | 4 | from contextlib import contextmanager 5 | 6 | ann_format = r"$\SIFTag{{{}}}$" 7 | ann_begin_format = r"$\SIFTag{{{}_begin}}$" 8 | ann_end_format = r"$\SIFTag{{{}_end}}$" 9 | ann_list_no_format = r"$\SIFTag{{list_{}}}$" 10 | 11 | 12 | @contextmanager 13 | def add_annotation(key, tag_mode, tar: list, key_as_tag=True): 14 | """add tag""" 15 | if key_as_tag is True: 16 | if tag_mode == "delimiter": 17 | tar.append(ann_begin_format.format(key)) 18 | elif tag_mode == "head": 19 | tar.append(ann_format.format(key)) 20 | yield 21 | if key_as_tag is True: 22 | if tag_mode == "delimiter": 23 | tar.append(ann_end_format.format(key)) 24 | elif tag_mode == "tail": 25 | tar.append(ann_format.format(key)) 26 | 27 | 28 | def dict2str4sif(obj: dict, key_as_tag=True, tag_mode="delimiter", add_list_no_tag=True, keys=None) -> str: 29 | r""" 30 | The function aims to transfer dictionary format item to string format item. 31 | 32 | Parameters 33 | ---------- 34 | obj 35 | key_as_tag 36 | tag_mode 37 | delimiter: add $\SIFTag{key_begin}$ in the head and add $\SIFTag{key_end}$ at the end 38 | head: add $\SIFTag{key}$ in the head 39 | tail: add $\SIFTag{key}$ at the end 40 | add_list_no_tag 41 | keys 42 | 43 | Examples 44 | ------- 45 | >>> item = { 46 | ... "stem": r"若复数$z=1+2 i+i^{3}$,则$|z|=$", 47 | ... "options": ['0', '1', r'$\sqrt{2}$', '2'], 48 | ... } 49 | >>> item 50 | {'stem': '若复数$z=1+2 i+i^{3}$,则$|z|=$', 'options': ['0', '1', '$\\sqrt{2}$', '2']} 51 | >>> dict2str4sif(item) # doctest: +ELLIPSIS 52 | '$\\SIFTag{stem_begin}$...$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$...$\\SIFTag{options_end}$' 53 | >>> dict2str4sif(item, add_list_no_tag=True) # doctest: +ELLIPSIS 54 | '...$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1...$\\SIFTag{options_end}$' 55 | >>> dict2str4sif(item, tag_mode="head") # doctest: +ELLIPSIS 56 | '$\\SIFTag{stem}$...$\\SIFTag{options}$...' 
57 | >>> dict2str4sif(item, tag_mode="tail") # doctest: +ELLIPSIS 58 | '若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem}$...2$\\SIFTag{options}$' 59 | >>> dict2str4sif(item, add_list_no_tag=False) # doctest: +ELLIPSIS 60 | '...$\\SIFTag{options_begin}$0$\\SIFSep$1$\\SIFSep$...$\\SIFTag{options_end}$' 61 | >>> dict2str4sif(item, key_as_tag=False) 62 | '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' 63 | """ 64 | ret = [] 65 | keys = obj.keys() if keys is None else keys 66 | for key in keys: 67 | _obj = [] 68 | value = obj[key] 69 | with add_annotation(key, tag_mode, _obj, key_as_tag): 70 | if isinstance(value, str): 71 | _obj.append(value) 72 | elif isinstance(value, (list, dict)): 73 | value = value.values() if isinstance(value, dict) else value 74 | for i, v in enumerate(value): 75 | v = str(v) 76 | if key_as_tag is True and add_list_no_tag is True: 77 | _obj.append(ann_list_no_format.format(i)) 78 | else: 79 | if i > 0: 80 | _obj.append(r"$\SIFSep$") 81 | _obj.append(v) 82 | else: # pragma: no cover 83 | raise TypeError("Cannot handle %s" % type(value)) 84 | ret.append("".join(_obj)) 85 | return str("".join(ret)) 86 | -------------------------------------------------------------------------------- /EduNLP/utils/image.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | 4 | import base64 5 | from io import BytesIO 6 | 7 | 8 | def image2base64(img): 9 | buffered = BytesIO() 10 | img.save(buffered, format="png") 11 | img_str = base64.b64encode(buffered.getvalue()) 12 | return img_str.decode("utf-8") 13 | -------------------------------------------------------------------------------- /EduNLP/utils/log.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/29 @ tongshiwei 3 | import logging 4 | 5 | 6 | def get_logger(): 7 | _logger = logging.getLogger("EduNLP") 8 | _logger.setLevel(logging.INFO) 9 | _logger.propagate = False 10 | ch = logging.StreamHandler() 11 | ch.setFormatter(logging.Formatter('[%(name)s, %(levelname)s] %(message)s')) 12 | ch.setLevel(logging.INFO) 13 | _logger.addHandler(ch) 14 | return _logger 15 | 16 | 17 | logger = get_logger() 18 | -------------------------------------------------------------------------------- /EduNLP/utils/path.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | 4 | import os 5 | from pathlib import PurePath 6 | 7 | 8 | def abs_current_dir(filepath): 9 | """ 10 | 获取文件所在目录的绝对路径 11 | 12 | Example 13 | ------- 14 | .. code :: 15 | 16 | abs_current_dir(__file__) 17 | 18 | """ 19 | return os.path.abspath(os.path.dirname(filepath)) 20 | 21 | 22 | def path_append(path, *addition, to_str=False): 23 | """ 24 | 路径合并函数 25 | 26 | Examples 27 | -------- 28 | .. 
code-block:: python 29 | 30 | path_append("../", "../data", "../dataset1/", "train", to_str=True) 31 | '../../data/../dataset1/train' 32 | 33 | Parameters 34 | ---------- 35 | path: str or PurePath 36 | addition: list(str or PurePath) 37 | to_str: bool 38 | Convert the new path to str 39 | Returns 40 | ------- 41 | 42 | """ 43 | path = PurePath(path) 44 | if addition: 45 | for a in addition: 46 | path = path / a 47 | if to_str: 48 | return str(path) 49 | return path 50 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include EduNLP/meta_data * -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | VERSION=`ls dist/*.tar.gz | sed "s/dist\/EduNLP-\(.*\)\.tar\.gz/\1/g"` 2 | 3 | ifdef ENVPIP 4 | PIP = $(ENVPIP) 5 | else 6 | PIP = pip3 7 | endif 8 | 9 | ifdef ENVPYTHON 10 | PYTHON = $(ENVPYTHON) 11 | else 12 | PYTHON = python3 13 | endif 14 | 15 | ifdef ENVPYTEST 16 | PYTEST = $(ENVPYTEST) 17 | else 18 | PYTEST = pytest 19 | endif 20 | 21 | help: 22 | 23 | @echo "install install EduNLP" 24 | @echo "test run test" 25 | @echo "release publish to PyPI and release in github" 26 | @echo "release_test publish to TestPyPI" 27 | @echo "clean remove all build, test, coverage and Python artifacts" 28 | @echo "clean-build remove build artifacts" 29 | @echo "clean-pyc remove Python file artifacts" 30 | @echo "clean-test remove test and coverage artifacts" 31 | 32 | .PHONY: install, test, build, release, release_test, version, .test, .build, clean 33 | 34 | install: 35 | @echo "install EduNLP" 36 | $(PIP) install -e . --user 37 | 38 | test: 39 | @echo "run test" 40 | $(PYTEST) 41 | 42 | build: test, clean 43 | $(PYTHON) setup.py bdist_wheel sdist 44 | 45 | .test: 46 | $(PYTEST) > /dev/null 47 | 48 | .build: clean 49 | $(PYTHON) setup.py bdist_wheel sdist > /dev/null 50 | 51 | version: .build 52 | @echo $(VERSION) 53 | 54 | release: test, build 55 | @echo "publish to pypi and release in github" 56 | @echo "version $(VERSION)" 57 | 58 | -@twine upload dist/* && git tag "v$(VERSION)" 59 | git push && git push --tags 60 | 61 | release_test: test, build 62 | @echo "publish to test pypi" 63 | @echo "version $(VERSION)" 64 | 65 | -@twine upload --repository test dist/* 66 | 67 | clean: clean-build clean-pyc clean-test 68 | 69 | clean-build: 70 | rm -rf build/* 71 | rm -rf dist/* 72 | rm -rf .eggs/* 73 | find . -name '*.egg-info' -exec rm -fr {} + 74 | find . -name '*.egg' -exec rm -f {} + 75 | 76 | clean-pyc: 77 | find . -name '*.pyc' -exec rm -f {} + 78 | find . -name '*.pyo' -exec rm -f {} + 79 | find . -name '*~' -exec rm -f {} + 80 | find . -name '__pycache__' -exec rm -rf {} + 81 | 82 | clean-test: 83 | rm -f .coverage -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

 2 | 
 3 | 
 4 | 
 5 | # EduNLP
 6 | 
 7 | [![VERSION](https://img.shields.io/pypi/pyversions/EduNLP)](https://pypi.python.org/pypi/EduNLP)
 8 | [![PyPI](https://img.shields.io/pypi/v/EduNLP.svg)](https://pypi.python.org/pypi/EduNLP)
 9 | [![test](https://github.com/bigdata-ustc/EduNLP/actions/workflows/python-test.yml/badge.svg?branch=master)](https://github.com/bigdata-ustc/EduNLP/actions/workflows/python-test.yml)
10 | [![codecov](https://codecov.io/gh/bigdata-ustc/EduNLP/branch/master/graph/badge.svg?token=B7gscOGQLD)](https://codecov.io/gh/bigdata-ustc/EduNLP)
11 | [![Documentation Status](https://readthedocs.org/projects/edunlp/badge/?version=latest)](https://edunlp.readthedocs.io/en/latest/?badge=latest)
12 | [![Download](https://img.shields.io/pypi/dm/EduNLP.svg?style=flat)](https://pypi.python.org/pypi/EduNLP)
13 | [![License](https://img.shields.io/github/license/bigdata-ustc/EduNLP)](LICENSE)
14 | [![DOI](https://zenodo.org/badge/332661206.svg)](https://zenodo.org/badge/latestdoi/332661206)
15 | 
16 | 
17 | EduNLP is a library for advanced Natural Language Processing in Python and is one of the projects in the [EduX](https://github.com/bigdata-ustc/EduX) plan of [BDAA](https://github.com/bigdata-ustc). It's built on the very latest research, and was designed from day one to be used in real educational products.
18 | 
19 | EduNLP now comes with pretrained pipelines and currently supports segmentation, tokenization and vectorization. It supports a variety of preprocessing methods for NLP in educational scenarios, such as formula parsing and multi-modal segmentation.
20 | 
21 | EduNLP is commercial open-source software, released under the [Apache-2.0 license](LICENSE).
22 | 
23 | ## Quickstart
24 | 
25 | ### Installation
26 | 
27 | Clone the repository and install with pip:
28 | ``` sh
29 | # basic installation
30 | pip install .
31 | 
32 | # full installation
33 | pip install .[full]
34 | ```
35 | or install from PyPI:
36 | ```
37 | # basic installation
38 | pip install EduNLP
39 | 
40 | # full installation
41 | pip install EduNLP[full]
42 | ```
43 | 
44 | ### Usage
45 | 
46 | ```python
47 | from EduNLP import get_pretrained_i2v
48 | i2v = get_pretrained_i2v("d2v_all_300", "./model")
49 | item_vector, token_vector = i2v(["the content of item 1", "the content of item 2"])
50 | ```
51 | 
52 | ### Tutorial
53 | 
54 | For more details, please refer to the full documentation ([latest](https://edunlp.readthedocs.io/en/latest) | [stable](https://edunlp.readthedocs.io/en/stable)).
55 | 
56 | ### Resource
57 | We will continuously publish new datasets in [Standard Item Format (SIF)](https://github.com/bigdata-ustc/EduNLP/blob/master/docs/SIF4TI_CH.md) to encourage relevant research. The data resources can be accessed via another EduX project, [EduData](https://github.com/bigdata-ustc/EduData).
58 | 
59 | ## Contribute
60 | 
61 | EduNLP is still under development. More algorithms and features are going to be added and we always welcome contributions to help make EduNLP better. If you would like to contribute, please follow this [guideline](CONTRIBUTE.md) ([Chinese version](CONTRIBUTE_CH.md)).
62 | 63 | ## Citation 64 | 65 | If this repository is helpful for you, please cite our work 66 | 67 | ``` 68 | @misc{bigdata2021edunlp, 69 | title={EduNLP}, 70 | author={bigdata-ustc}, 71 | publisher = {GitHub}, 72 | journal = {GitHub repository}, 73 | year = {2021}, 74 | howpublished = {\url{https://github.com/bigdata-ustc/EduNLP}}, 75 | } 76 | ``` 77 | -------------------------------------------------------------------------------- /asset/_static/d2v.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/d2v.png -------------------------------------------------------------------------------- /asset/_static/d2v_bow_tfidf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/d2v_bow_tfidf.png -------------------------------------------------------------------------------- /asset/_static/d2v_general.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/d2v_general.png -------------------------------------------------------------------------------- /asset/_static/d2v_stem_tf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/d2v_stem_tf.png -------------------------------------------------------------------------------- /asset/_static/data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/data.png -------------------------------------------------------------------------------- /asset/_static/formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/formula.png -------------------------------------------------------------------------------- /asset/_static/i2v.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/i2v.png -------------------------------------------------------------------------------- /asset/_static/item.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/item.png -------------------------------------------------------------------------------- /asset/_static/item_figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/item_figure.png -------------------------------------------------------------------------------- /asset/_static/item_formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/item_formula.png 
-------------------------------------------------------------------------------- /asset/_static/parse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/parse.png -------------------------------------------------------------------------------- /asset/_static/prepare_dataset.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/prepare_dataset.jpg -------------------------------------------------------------------------------- /asset/_static/seg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/seg.png -------------------------------------------------------------------------------- /asset/_static/sif.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/sif.png -------------------------------------------------------------------------------- /asset/_static/sif_addition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/sif_addition.png -------------------------------------------------------------------------------- /asset/_static/tokenizer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/tokenizer.png -------------------------------------------------------------------------------- /asset/_static/w2v_stem_text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/w2v_stem_text.png -------------------------------------------------------------------------------- /asset/_static/w2v_stem_tf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/asset/_static/w2v_stem_tf.png -------------------------------------------------------------------------------- /docs/EduNLP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/docs/EduNLP.png -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | EduNLP document and tutorial folder 2 | =================================== 3 | 4 | Requirements 5 | ------------ 6 | 7 | See the requirements `docs_deps` in `setup.py`: 8 | 9 | ```sh 10 | pip install -e .[doc] 11 | ``` 12 | 13 | Build documents 14 | --------------- 15 | 16 | First, clean up existing files: 17 | 18 | ``` 19 | make clean 20 | ``` 21 | 22 | Then build: 23 | 24 | ``` 25 | make html 26 | ``` 27 | 28 | Render locally 29 | -------------- 30 | 31 | ``` 32 | cd build/html 33 | python3 -m http.server 8000 34 | ``` 35 | -------------------------------------------------------------------------------- /docs/SIF4TI_CH.md: -------------------------------------------------------------------------------- 1 | # 标准项目格式 2 | 3 | version: 0.2 4 | 5 | 为了后续研究和使用的方便,我们需要一个统一的试题语法标准。 6 | 7 | ## 语法规则 8 | 1. 题目文本中只允许出现中文字符、中英文标点和换行符。 9 | 2. 使用 \$\SIFBlank\$ 替换横线,对于选择题中的括号使用 \$\SIFChoice\$ 替换。 10 | 3. 图片 ID 以公式的形式嵌入文本中:`$\FigureID{ uuid }$` 或用 base64 编码表示,特别的,内容为公式的图片用`$\FormFigureID{ uuid }$`表示。 11 | 4. 文本标注格式:统一用 `$\textf{item,CHAR_EN}$` 表示,目前定义的有:b-加粗,i-斜体,u-下划线,w-下划波浪线,d-加点,t-标题。标注可以混用,按字母顺序排序,例如:$\textf{EduNLP, biu}$ 表示 ***EduNLP*** 12 | 5. 其余诸如,英文字母、罗马字符、数字等数学符号一律需要使用 latex 格式表示,即嵌在 `$$` 之中。 13 | 6. 分子式的录入标准暂且参考 [INCHI](https://zh.wikipedia.org/wiki/%E5%9B%BD%E9%99%85%E5%8C%96%E5%90%88%E7%89%A9%E6%A0%87%E8%AF%86) 14 | 7. 目前对 latex 内部语法没有要求。 15 | 16 | ``` 17 | 1. Item -> CHARACTER|EN_PUN_LIST|CH_PUN_LIST|FORMULA|QUES_MARK 18 | 2. EN_PUN_LIST -> [',', '.', '?', '!', ':', ';', '\'', '\"', '(', ')', ' ','_','/','|','\\','<','>','[',']','-'] 19 | 3. CH_PUN_LIST -> [',', '。', '!', '?', ':',';', '‘', '’', '“', '”', '(', ')', ' ', '、','《','》','—','.'] 20 | 4. FORMULA -> $latex formula$ | $\FormFigureID{UUID}$ | $\FormFigureBase64{BASE64}$ 21 | 5. FIGURE -> $\FigureID{UUID}$ | $\FigureBase64{BASE64}$ 22 | 6. UUID -> [a-zA-Z\-0-9]+ 23 | 7. CHARACTER -> CHAR_EN | CHAR_CH 24 | 8. CHAR_EN -> [a-zA-Z]+ 25 | 9. CHAR_CH -> [\u4e00-\u9fa5]+ 26 | 10. DIGITAL -> [0-9]+ 27 | 11. QUES_MARK -> $\SIFBlank$ | $\SIFChoice$ 28 | ``` 29 | 30 | ### 注意事项 31 | 1. 保留字符与转义 32 | 2. 数字 33 | 3. 选空与填空 34 | 4. 对于单个的数字或字符也需要添加 `$$`(目前能实现自动校验) 35 | 5. latex 公式中尽量不出现中文:(`\text{这里出现中文}`) 36 | 6. MySql 数据库导入数据时会自动忽略一个 `\`,所以录入的公式需要进一步处理为 `\\` 37 | 38 | ## 示例 39 | 40 | 标准形式: 41 | 42 | 1. `若$x,y$满足约束条件$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,则$z=x+7 y$的最大值$\\SIFUnderline$'` 43 | 44 | 2. `已知函数$f(x)=|3 x+1|-2|x|$画出$y=f(x)$的图像求不等式$f(x)>f(x+1)$的解集$\\PictureID{3bf2ddf4-8af1-11eb-b750-b46bfc50aa29}$$\\PictureID{59b8bd14-8af1-11eb-93a5-b46bfc50aa29}$$\\PictureID{63118b3a-8b75-11eb-a5c0-b46bfc50aa29}$$\\PictureID{6a006179-8b76-11eb-b386-b46bfc50aa29}$$\\PictureID{088f15eb-8b7c-11eb-a86f-b46bfc50aa29}$` 45 | 46 | 非标准形式: 47 | 48 | 1. 字母、数字和数学符号连续混合出现: 49 | 例如: 50 | `完成下面的2x2列联表,` 51 | `(单位:m3)` 52 | `则输出的n=` 53 | 54 | 2. 特殊的数学符号没有用 latex 公式表示: 55 | 例如: 56 | `命题中真命题的序号是 ①` 57 | `AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.若D为AC的中点` 58 | 59 | 3. 
出现以 unicode 编码写成的字符 60 | 例如:`则$a$的取值范围是(\u3000\u3000)` 61 | 62 | 63 | ## Change Log 64 | 65 | 2021-05-18 66 | 67 | 修改: 68 | 1. 原用 \$\SIFUnderline\$ 和 \$\SIFBracket\$ 来替换填空题中的横线和选择题中的括号,现分别用 \$\SIFBlank\$ 和 \$\SIFChoice\$ 替换。 69 | 2. 原统一用`$\PictureID{ uuid }$`表示图片,现使用`$\FigureID{ uuid }$`,其中对于数据公式,用`$\FormFigureID{ uuid }$`来表示。 70 | 71 | 2021-06-28 72 | 73 | 添加: 74 | 1. 注明 `$$` 之中不能出现换行符。 75 | 2. 添加文本标注格式说明。 76 | 77 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx_rtd_theme 3 | sphinx_toggleprompt 4 | sphinx-gallery>=0.6 5 | nbsphinx 6 | m2r2 7 | -------------------------------------------------------------------------------- /docs/source/_static/EduNLP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/docs/source/_static/EduNLP.png -------------------------------------------------------------------------------- /docs/source/_static/formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/docs/source/_static/formula.png -------------------------------------------------------------------------------- /docs/source/_static/formulagroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/docs/source/_static/formulagroup.png -------------------------------------------------------------------------------- /docs/source/_static/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/docs/source/_static/pipeline.png -------------------------------------------------------------------------------- /docs/source/_static/流程图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/docs/source/_static/流程图.png -------------------------------------------------------------------------------- /docs/source/api/ModelZoo.rst: 
-------------------------------------------------------------------------------- 1 | EduNLP.ModelZoo 2 | ================== 3 | 4 | base_model 5 | ----------- 6 | 7 | .. automodule:: EduNLP.ModelZoo.base_model 8 | :members: 9 | 10 | :: 11 | Parameter notes for the methods above: 12 | 13 | save_pretrained(output_dir): 14 | output_dir: str 15 | The path where you want to save your model 16 | 17 | classmethod from_pretrained(pretrained_model_path, *args, **kwargs): 18 | pretrained_model_path: str 19 | The path to load your checkpoint from 20 | 21 | save_config(config_dir): 22 | config_dir: str 23 | The path where you want to save the config file 24 | 25 | @classmethod 26 | from_config(config_path, *args, **kwargs): 27 | config_path: str 28 | The path to load the config file from 29 | 30 | 31 | 32 | rnn 33 | ----------- 34 | 35 | .. automodule:: EduNLP.ModelZoo.rnn 36 | :members: 37 | :imported-members: 38 | 39 | :: 40 | Additional parameter notes: 41 | @classmethod from_config(config_path, **kwargs): 42 | config_path: str 43 | The path to load the config file from 44 | 45 | 46 | 47 | disenqnet 48 | ----------- 49 | 50 | .. automodule:: EduNLP.ModelZoo.disenqnet 51 | :members: 52 | :imported-members: 53 | 54 | :: 55 | Additional parameter notes: 56 | @classmethod from_config(config_path, **kwargs): 57 | config_path: str 58 | The path to load the config file from 59 | 60 | quesnet 61 | ----------- 62 | 63 | .. automodule:: EduNLP.ModelZoo.quesnet 64 | :members: 65 | :imported-members: 66 | 67 | utils 68 | ----------- 69 | 70 | .. automodule:: EduNLP.ModelZoo.utils 71 | :members: 72 | :imported-members: 73 | -------------------------------------------------------------------------------- /docs/source/api/formula.rst: -------------------------------------------------------------------------------- 1 | EduNLP.Formula 2 | ======================= 3 | 4 | .. automodule:: EduNLP.Formula.Formula 5 | :members: 6 | :imported-members: 7 | 8 | .. automodule:: EduNLP.Formula.ast 9 | :members: 10 | :imported-members: 11 | -------------------------------------------------------------------------------- /docs/source/api/i2v.rst: -------------------------------------------------------------------------------- 1 | EduNLP.I2V 2 | ============ 3 | 4 | .. automodule:: EduNLP.I2V.i2v 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/api/index.rst: -------------------------------------------------------------------------------- 1 | EduNLP 2 | ====== 3 | 4 | SIF 5 | ---------------------- 6 | .. automodule:: EduNLP.SIF.sif 7 | :members: 8 | :imported-members: 9 | 10 | EduNLP.Formula 11 | --------------------- 12 | 13 | .. automodule:: EduNLP.Formula.ast 14 | :members: 15 | :imported-members: 16 | 17 | EduNLP.I2V 18 | ----------------- 19 | 20 | .. automodule:: EduNLP.I2V.i2v 21 | :members: 22 | :imported-members: 23 | 24 | EduNLP.Pretrain 25 | ------------------- 26 | 27 | .. automodule:: EduNLP.Pretrain 28 | :members: 29 | :imported-members: 30 | 31 | EduNLP.Tokenizer 32 | ---------------------- 33 | 34 | .. automodule:: EduNLP.Tokenizer 35 | :members: 36 | :imported-members: 37 | 38 | Vector 39 | --------------- 40 | 41 | .. automodule:: EduNLP.Vector 42 | :members: 43 | :imported-members: 44 | 45 | 46 | Pipeline 47 | --------------- 48 | 49 | ..
automodule:: EduNLP.Pipeline 50 | :members: 51 | :imported-members: 52 | -------------------------------------------------------------------------------- /docs/source/api/pipeline.rst: -------------------------------------------------------------------------------- 1 | EduNLP.Pipeline 2 | ================== 3 | 4 | Pipeline 5 | ---------------------------------------------------------- 6 | 7 | .. automodule:: EduNLP.Pipeline.base 8 | :members: 9 | 10 | 11 | Components 12 | ---------------------------------------------------------- 13 | 14 | .. automodule:: EduNLP.Pipeline.components 15 | :members: 16 | 17 | 18 | Property prediction 19 | ---------------------------------------------------------- 20 | 21 | .. automodule:: EduNLP.Pipeline.property_prediction 22 | :members: 23 | 24 | Knowledge prediction 25 | ---------------------------------------------------------- 26 | 27 | .. automodule:: EduNLP.Pipeline.knowledge_prediction 28 | :members: 29 | -------------------------------------------------------------------------------- /docs/source/api/pretrain.rst: -------------------------------------------------------------------------------- 1 | EduNLP.Pretrain 2 | ================== 3 | 4 | EduNLP.Pretrain.pretrian_utils 5 | --------------------------------------------------------------- 6 | .. automodule:: EduNLP.Pretrain.pretrian_utils 7 | :members: 8 | 9 | 10 | EduNLP.Pretrain.hugginface_utils 11 | --------------------------------------------------------------- 12 | 13 | .. automodule:: EduNLP.Pretrain.hugginface_utils 14 | :members: 15 | 16 | 17 | EduNLP.Pretrain.gensim_vec 18 | --------------------------------------------------------------- 19 | 20 | .. automodule:: EduNLP.Pretrain.gensim_vec 21 | :members: 22 | 23 | EduNLP.Pretrain.elmo_vec 24 | --------------------------------------------------------------- 25 | 26 | .. automodule:: EduNLP.Pretrain.elmo_vec 27 | :members: 28 | 29 | EduNLP.Pretrain.bert_vec 30 | --------------------------------------------------------------- 31 | 32 | .. automodule:: EduNLP.Pretrain.bert_vec 33 | :members: 34 | 35 | EduNLP.Pretrain.disenqnet_vec 36 | --------------------------------------------------------------- 37 | 38 | .. automodule:: EduNLP.Pretrain.disenqnet_vec 39 | :members: 40 | 41 | EduNLP.Pretrain.quesnet_vec 42 | --------------------------------------------------------------- 43 | 44 | .. automodule:: EduNLP.Pretrain.quesnet_vec 45 | :members: -------------------------------------------------------------------------------- /docs/source/api/sif.rst: -------------------------------------------------------------------------------- 1 | EduNLP.SIF 2 | ============== 3 | 4 | SIF 5 | ---------- 6 | .. automodule:: EduNLP.SIF.sif 7 | :members: 8 | :imported-members: 9 | 10 | 11 | Parser 12 | -------- 13 | .. automodule:: EduNLP.SIF.parser 14 | :members: 15 | :imported-members: 16 | 17 | Segment 18 | ---------- 19 | .. automodule:: EduNLP.SIF.segment.segment 20 | :members: 21 | :imported-members: 22 | 23 | 24 | Tokenization 25 | --------------- 26 | 27 | tokenize 28 | ^^^^^^^^^^ 29 | .. automodule:: EduNLP.SIF.tokenization.tokenization 30 | :members: 31 | :imported-members: 32 | 33 | text 34 | ^^^^^^ 35 | .. automodule:: EduNLP.SIF.tokenization.text 36 | :members: 37 | :imported-members: 38 | 39 | 40 | formula 41 | ^^^^^^^^^ 42 | .. automodule:: EduNLP.SIF.tokenization.formula.formula 43 | :members: 44 | :imported-members: 45 | 46 | .. automodule:: EduNLP.SIF.tokenization.formula.ast_token 47 | :members: 48 | :imported-members: 49 | 50 | .. 
automodule:: EduNLP.SIF.tokenization.formula.linear_token 51 | :members: 52 | :imported-members: 53 | -------------------------------------------------------------------------------- /docs/source/api/tokenizer.rst: -------------------------------------------------------------------------------- 1 | EduNLP.Tokenizer 2 | ===================================== 3 | 4 | .. automodule:: EduNLP.Tokenizer 5 | :members: 6 | :imported-members: 7 | 8 | AstFormulaTokenizer parameters 9 | ####################################### 10 | 11 | :: 12 | Parameters 13 | ---------- 14 | symbol : str, optional 15 | Elements to symbolize before tokenization, by default "gmas" 16 | figures : _type_, optional 17 | Info for figures in items, by default None 18 | 19 | 20 | CharTokenizer parameters 21 | ####################################### 22 | 23 | :: 24 | Tokenize text char by char, e.g. "题目内容" -> ["题", "目", "内", "容"] 25 | 26 | Parameters 27 | ---------- 28 | stop_words : str, optional 29 | stop words to skip, by default "default" 30 | 31 | 32 | CustomTokenizer parameters 33 | ####################################### 34 | 35 | :: 36 | Tokenize SIF items by customized configuration 37 | 38 | Parameters 39 | ---------- 40 | symbol : str, optional 41 | Elements to symbolize before tokenization, by default "gmas" 42 | figures : _type_, optional 43 | Info for figures in items, by default None 44 | kwargs: additional configuration for SIF items, 45 | including text_params, formula_params and figure_params; more details can be found in `EduNLP.SIF.sif4sci` 46 | 47 | 48 | PureTextTokenizer parameters 49 | ####################################### 50 | 51 | :: 52 | 53 | Treat all elements in a SIF item as pure text. Specifically, formulas are also tokenized as text. 54 | 55 | Parameters 56 | ---------- 57 | handle_figure_formula : str, optional 58 | whether to skip or symbolize special formulas ($\\FormFigureID{…}$ and $\\FormFigureBase64{…}$), 59 | by default skip 60 | 61 | SpaceTokenizer parameters 62 | ####################################### 63 | 64 | :: 65 | 66 | Tokenize text by space, e.g. "题目 内容" -> ["题目", "内容"] 67 | 68 | Parameters 69 | ---------- 70 | stop_words : str, optional 71 | stop words to skip, by default "default" 72 | 73 | 74 | EduNLP.Tokenizer.get_tokenizer parameters 75 | ########################################## 76 | 77 | :: 78 | Parameters 79 | ---------- 80 | name: str 81 | the name of the tokenizer, e.g. text, pure_text. 82 | args: 83 | positional arguments passed to the tokenizer 84 | kwargs: 85 | keyword arguments passed to the tokenizer 86 | Returns 87 | ------- 88 | tokenizer: Tokenizer -------------------------------------------------------------------------------- /docs/source/api/utils.rst: -------------------------------------------------------------------------------- 1 | EduNLP.utils 2 | ==================== 3 | 4 | .. automodule:: EduNLP.utils 5 | :members: 6 | :imported-members: 7 | -------------------------------------------------------------------------------- /docs/source/api/vector.rst: -------------------------------------------------------------------------------- 1 | EduNLP.Vector 2 | ========================== 3 | 4 | 5 | EduNLP.Vector.t2v 6 | ------------------------- 7 | 8 | .. automodule:: EduNLP.Vector.t2v 9 | :members: 10 | 11 | 12 | EduNLP.Vector.disenqnet 13 | ------------------------- 14 | 15 | .. automodule:: EduNLP.Vector.disenqnet.disenqnet 16 | :members: 17 | 18 | EduNLP.Vector.quesnet 19 | ------------------------- 20 | 21 | ..
automodule:: EduNLP.Vector.quesnet.quesnet 22 | :members: 23 | 24 | EduNLP.Vector.elmo_vec 25 | ------------------------- 26 | 27 | .. automodule:: EduNLP.Vector.elmo_vec 28 | :members: 29 | 30 | 31 | EduNLP.Vector.gensim_vec 32 | ------------------------- 33 | 34 | .. automodule:: EduNLP.Vector.gensim_vec 35 | :members: 36 | 37 | 38 | EduNLP.Vector.embedding 39 | ------------------------- 40 | 41 | .. automodule:: EduNLP.Vector.embedding 42 | :members: 43 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/index.rst: -------------------------------------------------------------------------------- 1 | Get Started 2 | =============== 3 | 4 | * `Standard Item Format `_ 5 | 6 | * `Syntax Parsing `_ 7 | 8 | * `Component Segmentation `_ 9 | 10 | * `Tokenization `_ 11 | 12 | * `Pre-training `_ 13 | 14 | * `Vectorization `_ 15 | 16 | * `Pipeline `_ 17 | 18 | Main process 19 | --------------- 20 | 21 | .. figure:: ../../_static/pipeline.png 22 | 23 | * `Component Segmentation `_ : Segment items in SIF format according to their types, so that elements of different types (text, formulas, pictures, etc.) can be tokenized separately. 24 | 25 | * `Syntax Parsing `_ : Parse different components in different ways (formula parsing, text parsing, etc.), serving the later tokenization step. 26 | 27 | * `Tokenization `_: Further process the results of component segmentation and syntax parsing to obtain the multi-modal token sequence of the item. 28 | 29 | * `Vectorization `_: Feed the list of tokenized items into pre-trained models to get the corresponding item vectors. 30 | 31 | * **Downstream**: Apply the obtained vectors to downstream tasks. 32 | 33 | Examples 34 | --------- 35 | 36 | To help you quickly understand the functions of this project, this section only shows the usage of the common function interfaces. Intermediate function modules (such as parse, formula, segment, etc.) and more fine-grained interface methods are not shown; for further study, please refer to the relevant documentation. 37 | 38 | ------------------------------------------------------------ 39 | 40 | .. nbgallery:: 41 | :caption: This is a thumbnail gallery: 42 | :name: start_galler 43 | :glob: 44 | 45 | Tokenization <../../build/blitz/sif/sif4sci.ipynb> 46 | 47 | Vectorization <../../build/blitz/i2v/get_pretrained_i2v.ipynb> 48 | 49 | Pipeline <../../build/blitz/pipeline/pipeline.ipynb> 50 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/parse/TextSyntaxStructureParsing.rst: -------------------------------------------------------------------------------- 1 | Text syntax structure parsing 2 | -------------------------------- 3 | 4 | This section is mainly implemented by the EduNLP.SIF.parser module. Its main function is to extract letters and numbers in the text and convert them into standard format. 5 | 6 | This module is mainly used as an *intermediate module* to parse the input text. In general, users do not call this module directly. 7 | 8 | Introduction of Main Content 9 | +++++++++++++++++++++++++++++++++++++ 10 | 11 | 1. Determine the type of the incoming text, in the following order: 12 | 13 | * is_chinese: its function is to match Chinese characters [\u4e00-\u9fa5]. 14 | 15 | * is_alphabet: its function is to match letters outside formulas.
Only letters between two Chinese characters will be corrected (wrapped with $$); all other cases are regarded as formulas that do not conform to latex syntax. 16 | 17 | * is_number: its function is to match numbers outside formulas. Only numbers between two Chinese characters will be corrected; all other cases are regarded as formulas that do not conform to latex syntax. 18 | 19 | 2. Match latex formulas 20 | 21 | * If Chinese characters appear in latex, print a warning only once. 22 | 23 | * Use the _is_formula_legal function to check the completeness and parsability of latex formulas, and report an error for formulas that do not conform to latex syntax. 24 | 25 | Input 26 | >>>>>>> 27 | 28 | Type: str 29 | 30 | Content: question text 31 | 32 | :: 33 | 34 | >>> text1 = '生产某种零件的A工厂25名工人的日加工零件数_ _' 35 | >>> text2 = 'X的分布列为( )' 36 | >>> text3 = '① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D' 37 | >>> text4 = '支持公式如$\\frac{y}{x}$,$\\SIFBlank$,$\\FigureID{1}$,不支持公式如$\\frac{ \\dddot y}{x}$' 38 | 39 | Parsing 40 | >>>>>>>>>>>>>>>>>>>> 41 | 42 | :: 43 | 44 | >>> text_parser1 = Parser(text1) 45 | >>> text_parser2 = Parser(text2) 46 | >>> text_parser3 = Parser(text3) 47 | >>> text_parser4 = Parser(text4) 48 | 49 | Parsing results 50 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 51 | 52 | - Try to convert the text to standard format 53 | 54 | :: 55 | 56 | >>> text_parser1.description_list() 57 | >>> print('text_parser1.text:',text_parser1.text) 58 | text_parser1.text: 生产某种零件的$A$工厂$25$名工人的日加工零件数$\SIFBlank$ 59 | >>> text_parser2.description_list() 60 | >>> print('text_parser2.text:',text_parser2.text) 61 | text_parser2.text: $X$的分布列为$\SIFChoice$ 62 | 63 | - Determine whether the text has syntax errors 64 | 65 | :: 66 | 67 | >>> text_parser3.description_list() 68 | >>> print('text_parser3.error_flag: ',text_parser3.error_flag) 69 | text_parser3.error_flag: 1 70 | >>> text_parser4.description_list() 71 | >>> print('text_parser4.fomula_illegal_flag: ',text_parser4.fomula_illegal_flag) 72 | text_parser4.fomula_illegal_flag: 1 73 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/pipeline.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Pipeline 3 | ======== 4 | 5 | .. nbgallery:: 6 | :caption: This is a thumbnail gallery: 7 | :name: pipeline_gallery 8 | :glob: 9 | 10 | Pipeline <../../build/blitz/pipeline/pipeline.ipynb> 11 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/pretrain/loading.rst: -------------------------------------------------------------------------------- 1 | Load models 2 | ---------------- 3 | 4 | Pass the path of the obtained model to the I2V module to load it.
5 | 6 | Examples: 7 | 8 | :: 9 | 10 | >>> model_path = "../test_model/d2v/test_gensim_luna_stem_tf_d2v_256.bin" 11 | >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) 12 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/pretrain/pub.rst: -------------------------------------------------------------------------------- 1 | The overview of our public model 2 | ------------------------------------ 3 | 4 | 5 | Version Description 6 | ######################### 7 | 8 | First level version: 9 | 10 | * Public version 1 (luna_pub): college entrance examination 11 | * Public version 2 (luna_pub_large): college entrance examination + regional examination 12 | 13 | Second level version: 14 | 15 | * Minor subjects(Chinese,Math,English,History,Geography,Politics,Biology,Physics,Chemistry) 16 | * Major subjects(science, arts and all subject) 17 | 18 | Third level version【to be finished】: 19 | 20 | * Don't use third-party initializers 21 | * Use third-party initializers 22 | 23 | Description of train data in models 24 | ####################################### 25 | 26 | * Currently, the data used in w2v and d2v models are the subjects of senior high school. 27 | * test data:`[OpenLUNA.json] `_ 28 | 29 | At present, the following models are provided. More models of different subjects and question types are being trained. Please look forward to it. 30 | "d2v_all_300" (all subject), "d2v_science_300" (Science), "d2v_english_300" (English),"d2v_literal_300" (Arts) 31 | 32 | Examples of model training 33 | ---------------------------- 34 | 35 | Get the dataset 36 | #################### 37 | 38 | .. toctree:: 39 | :maxdepth: 1 40 | :titlesonly: 41 | 42 | prepare_dataset <../../../build/blitz/pretrain/prepare_dataset.ipynb> 43 | 44 | An example of d2v in gensim model 45 | #################################### 46 | 47 | .. toctree:: 48 | :maxdepth: 1 49 | :titlesonly: 50 | 51 | d2v_bow_tfidf <../../../build/blitz/pretrain/gensim/d2v_bow_tfidf.ipynb> 52 | d2v_general <../../../build/blitz/pretrain/gensim/d2v_general.ipynb> 53 | d2v_stem_tf <../../../build/blitz/pretrain/gensim/d2v_stem_tf.ipynb> 54 | 55 | An example of w2v in gensim model 56 | #################################### 57 | 58 | .. toctree:: 59 | :maxdepth: 1 60 | :titlesonly: 61 | 62 | w2v_stem_text <../../../build/blitz/pretrain/gensim/w2v_stem_text.ipynb> 63 | w2v_stem_tf <../../../build/blitz/pretrain/gensim/w2v_stem_tf.ipynb> 64 | 65 | An example of seg_token 66 | ############################ 67 | 68 | .. toctree:: 69 | :maxdepth: 1 70 | :titlesonly: 71 | 72 | d2v.ipynb <../../../build/blitz/pretrain/seg_token/d2v.ipynb> 73 | d2v_d1 <../../../build/blitz/pretrain/seg_token/d2v_d1.ipynb> 74 | d2v_d2 <../../../build/blitz/pretrain/seg_token/d2v_d2.ipynb> 75 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/pretrain/start.rst: -------------------------------------------------------------------------------- 1 | Train the model 2 | ------------------ 3 | 4 | Call train_Vector function interface directly to make the training model easier. This section calls the relevant training models in the gensim library. At present, the training methods of "sg"、 "cbow"、 "fastext"、 "d2v"、 "bow"、 "tfidf" are provided. Parameter embedding_dim is also provided for users to determine vector dimension according to their needs. 
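For instance, a minimal sketch of a training call (the corpus variable token_items, an iterable of token lists produced by one of the tokenizers above, and the output path prefix are placeholders; the exact keyword signature may differ slightly across versions): :: >>> from EduNLP.Pretrain import train_vector >>> # token_items: a pre-tokenized corpus (hypothetical variable) >>> train_vector(token_items, "../../data/w2v/gensim_test_", embedding_dim=100, method="sg")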
5 | 6 | Basic Steps 7 | ################## 8 | 9 | 1. Determine the type of model and select the appropriate tokenizer (GensimWordTokenizer, GensimSegTokenizer) to finish tokenization. 10 | 11 | 2. Call the train_vector function to get the required pre-trained model. 12 | 13 | Examples: 14 | 15 | :: 16 | 17 | >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True) 18 | >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ 19 | ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") 20 | >>> print(token_item.tokens[:10]) 21 | ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]'] 22 | 23 | # train 10-dimensional vectors with the d2v method 24 | train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="d2v") 25 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/seg/SemanticComponentSegmentation.rst: -------------------------------------------------------------------------------- 1 | Semantic Component Segmentation 2 | ------------------------------------ 3 | 4 | Because multiple-choice questions are given as dicts, they need to be converted into text format while retaining the relationship between their fields. This can be done with the dict2str4sif function, which converts a multiple-choice question item into string format and marks the question stem and the options. 5 | 6 | 7 | Basic Usage 8 | ++++++++++++++++++ 9 | 10 | :: 11 | 12 | >>> item = { 13 | ... "stem": r"若复数$z=1+2 i+i^{3}$,则$|z|=$", 14 | ... "options": ['0', '1', r'$\sqrt{2}$', '2'], 15 | ... } 16 | >>> dict2str4sif(item) # doctest: +ELLIPSIS 17 | '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' 18 | 19 | Optional additional parameters / interfaces 20 | ++++++++++++++++++++++++++++++++++++++++++++++++ 21 | 22 | 1. add_list_no_tag: if True, each option is prefixed with a numbered tag ($\SIFTag{list_0}$, $\SIFTag{list_1}$, ...); if False, the options are joined with $\SIFSep$ instead. 23 | 24 | :: 25 | 26 | >>> dict2str4sif(item, add_list_no_tag=True) # doctest: +ELLIPSIS 27 | '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options_end}$' 28 | 29 | >>> dict2str4sif(item, add_list_no_tag=False) # doctest: +ELLIPSIS 30 | '$\\SIFTag{stem_begin}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2$\\SIFTag{options_end}$' 31 | 32 | 2. tag_mode: selects where the labels are placed: 'delimiter' labels both the beginning and the end, 'head' labels only the head, and 'tail' labels only the tail. 33 | 34 | :: 35 | 36 | >>> dict2str4sif(item, tag_mode="head") # doctest: +ELLIPSIS 37 | '$\\SIFTag{stem}$若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{options}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2' 38 | 39 | >>> dict2str4sif(item, tag_mode="tail") # doctest: +ELLIPSIS 40 | '若复数$z=1+2 i+i^{3}$,则$|z|=$$\\SIFTag{stem}$$\\SIFTag{list_0}$0$\\SIFTag{list_1}$1$\\SIFTag{list_2}$$\\sqrt{2}$$\\SIFTag{list_3}$2$\\SIFTag{options}$' 41 | 42 | 3. key_as_tag: if False, the process only adds $\SIFSep$ between the options, without distinguishing segment types by tags.
43 | 44 | :: 45 | 46 | >>> dict2str4sif(item, key_as_tag=False) 47 | '若复数$z=1+2 i+i^{3}$,则$|z|=$0$\\SIFSep$1$\\SIFSep$$\\sqrt{2}$$\\SIFSep$2' -------------------------------------------------------------------------------- /docs/source/tutorial/en/seg/StructuralComponentSegmentation.rst: -------------------------------------------------------------------------------- 1 | Structural Component Segmentation 2 | ------------------------------------ 3 | 4 | This step segments the sliced item. A depth option is available: you can segment at all positions or only at certain labels (such as \SIFSep and \SIFTag), and you can also choose where labels are added: at both the head and the tail, or only at one of them. 5 | 6 | 7 | There are two modes: 8 | 9 | * linear mode: used for text processing (word segmentation with the jieba library); 10 | 11 | * ast mode: used to parse formulas. 12 | 13 | Basic Usage 14 | ++++++++++++++++++ 15 | 16 | :: 17 | 18 | >>> test_item = r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$" 19 | >>> s = seg(test_item) 20 | >>> s 21 | ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}] 22 | 23 | Optional additional parameters/interfaces 24 | +++++++++++++++++++++++++++++++++++++++++++++ 25 | 26 | 1. describe: count the number of elements of each type 27 | 28 | :: 29 | 30 | >>> s.describe() 31 | {'t': 3, 'f': 1, 'g': 1, 'm': 1} 32 | 33 | 2. filter: this interface can filter out one or more types of elements, or keep only the given types. 34 | 35 | Pass one or more type characters directly to remove those elements, or use the "keep" parameter to specify the types to retain. 36 | 37 | Element types represented by the symbols: 38 | "t": text 39 | "f": formula 40 | "g": figure 41 | "m": question mark 42 | "a": tag 43 | "s": sep tag 44 | 45 | :: 46 | 47 | >>> with s.filter("f"): 48 | ... s 49 | ['如图所示,则', '的面积是', '\\SIFBlank', '。', \FigureID{1}] 50 | >>> with s.filter(keep="t"): 51 | ... s 52 | ['如图所示,则', '的面积是', '。'] 53 | 54 | 3. symbol: this interface can convert some types of elements into special symbols 55 | 56 | Element types represented by the symbols: 57 | 58 | - "t": text 59 | - "f": formula 60 | - "g": figure 61 | - "m": question mark 62 | 63 | :: 64 | 65 | >>> seg(test_item, symbol="fgm") 66 | ['如图所示,则', '[FORMULA]', '的面积是', '[MARK]', '。', '[FIGURE]'] 67 | >>> seg(test_item, symbol="tfgm") 68 | ['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]'] 69 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/tokenization/GensimSegTokenizer.rst: -------------------------------------------------------------------------------- 1 | GensimSegTokenizer 2 | ===================== 3 | 4 | By default, the pictures, separators and blanks in the question text, as well as other parts of the incoming item, are converted into special characters for data security and for the tokenization of text, formulas and labels. The tokenizer uses the linear analysis method for text and the abstract syntax tree method for formulas. 5 | 6 | Compared to GensimWordTokenizer, the main differences are: 7 | 8 | * It provides the depth option for the segmentation position, such as \SIFSep and \SIFTag. 9 | * By default, labels are inserted at the head of item components (such as text and formula).
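10 | 11 | Examples 12 | ---------- 13 | 14 | A minimal usage sketch (the constructor arguments and the printed shape below are assumptions that mirror the ``GensimWordTokenizer`` example on the next page; the exact segments you get depend on the ``depth`` setting and the EduNLP version): 15 | 16 | :: 17 | 18 | >>> tokenizer = GensimSegTokenizer(symbol="gmas", depth=None) 19 | >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ 20 | ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") 21 | >>> print(len(token_item)) # number of segments produced by label-based splitting 22 |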
-------------------------------------------------------------------------------- /docs/source/tutorial/en/tokenization/GensimWordTokenizer.rst: -------------------------------------------------------------------------------- 1 | GensimWordTokenizer 2 | ===================== 3 | 4 | By default, the pictures and blanks in the question text, as well as other parts of the incoming item, are converted into special characters for data security and for the tokenization of text, formulas, labels and separators. The tokenizer uses the linear analysis method for text and the abstract syntax tree method for formulas; you can choose between them for formulas with the ``general`` parameter: 5 | 6 | - True: the incoming item conforms to SIF, and the linear analysis method should be used. 7 | - False: the incoming item does not conform to SIF, and the abstract syntax tree method should be used. 8 | 9 | Examples 10 | ---------- 11 | 12 | :: 13 | 14 | >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True) 15 | >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ 16 | ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") 17 | >>> print(token_item.tokens[:10]) 18 | ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]'] 19 | >>> tokenizer = GensimWordTokenizer(symbol="fgmas", general=False) 20 | >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ 21 | ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") 22 | >>> print(token_item.tokens[:10]) 23 | ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]'] 24 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/tokenization/PureTextTokenizer.rst: -------------------------------------------------------------------------------- 1 | PureTextTokenizer 2 | =================== 3 | 4 | By default, the pictures, labels, separators and blanks in the question text, as well as other parts of the incoming item, are converted into special characters for data security. At the same time, special formulas such as $\\FormFigureID{...}$ and $\\FormFigureBase64{...}$ are filtered out to facilitate the tokenization of text and plain-text formulas. The tokenizer uses the linear analysis method for text and formulas; the ``key`` parameter can be used to preprocess the incoming item (e.g. to select a field from a dict), and will be improved based on users' requirements in the future. 5 | 6 | Examples 7 | ---------- 8 | 9 | :: 10 | 11 | >>> tokenizer = PureTextTokenizer() 12 | >>> items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ 13 | ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"] 14 | >>> tokens = tokenizer(items) 15 | >>> next(tokens)[:10] 16 | ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z'] 17 | >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] 18 | >>> tokens = tokenizer(items) 19 | >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE 20 | ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', 21 | '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', 22 | '\\quad', 'A', '\\cap', 'B', '='] 23 | >>> items = [{ 24 | ... 
"stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", 25 | ... "options": ["1", "2"] 26 | ... }] 27 | >>> tokens = tokenizer(items, key=lambda x: x["stem"]) 28 | >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE 29 | ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', 30 | '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', 31 | '\\quad', 'A', '\\cap', 'B', '='] 32 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/tokenization/TextTokenizer.rst: -------------------------------------------------------------------------------- 1 | TextTokenizer 2 | ================ 3 | 4 | By default, the pictures, labels, separators, blanks in the question text and other parts of the incoming item are converted into special characters for data security and tokenization of text and formulas. Also, the tokenizer uses linear analysis method for text and formulas, and the ``key`` parameter provided is used to preprocess the incoming item, which will be improved based on users' requirements in the future. 5 | 6 | 7 | Examples 8 | ---------- 9 | 10 | :: 11 | 12 | >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] 13 | >>> tokenizer = TextTokenizer() 14 | >>> tokens = tokenizer(items) 15 | >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE 16 | ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', 17 | '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', 18 | '\\quad', 'A', '\\cap', 'B', '='] 19 | >>> items = [{ 20 | ... "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", 21 | ... "options": ["1", "2"] 22 | ... }] 23 | >>> tokens = tokenizer(items, key=lambda x: x["stem"]) 24 | >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE 25 | ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', 26 | '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', 27 | '\\quad', 'A', '\\cap', 'B', '='] 28 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/tokenize/Sentence Segmentation.rst: -------------------------------------------------------------------------------- 1 | Sentence Segmentation 2 | ------------------------- 3 | During the process of sentence segmentation, a long document is divided into several sentences. Each sentence is a "token" (to be realized). 4 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/tokenize/Tokenization.rst: -------------------------------------------------------------------------------- 1 | Tokenization 2 | -------------- 3 | Tokenization is comprehensive analysis. In this process, sentences with formulas are segmented into several markers. Each marker is a "token". 4 | We provide some encapsulated tokenizers for users to call them conveniently. The following is a complete list of tokenizers. 
5 | 6 | Examples 7 | 8 | :: 9 | 10 | >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] 11 | >>> tokenizer = TextTokenizer() 12 | >>> tokens = tokenizer(items) 13 | >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE 14 | ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', 15 | '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', 16 | '\\quad', 'A', '\\cap', 'B', '='] 17 | 18 | 19 | 20 | You can view ``./EduNLP/Tokenizer/tokenizer.py`` and ``./EduNLP/Pretrain/gensim_vec.py`` for more tokenizers. The following is a complete list of tokenizers: 21 | 22 | .. toctree:: 23 | :maxdepth: 1 24 | :titlesonly: 25 | 26 | ../tokenization/TextTokenizer 27 | ../tokenization/PureTextTokenizer 28 | ../tokenization/GensimSegTokenizer 29 | ../tokenization/GensimWordTokenizer 30 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/tokenize/WordSegmentation.rst: -------------------------------------------------------------------------------- 1 | Word segmentation 2 | --------------------- 3 | 4 | Text-tokenization: A sentence (without formulas) consists of several "words" in order. The process of dividing a sentence into several words is called "Text-tokenization". According to the granularity of "words", it can be subdivided into "Word-tokenization" and "Char-tokenization". 5 | 6 | :: 7 | 8 | - Word-tokenization: each phrase is a token. 9 | 10 | - Char-tokenization: each character is a token. 11 | 12 | 13 | Text-tokenization is divided into two main steps: 14 | 15 | 1. Text-tokenization: 16 | 17 | - Word-tokenization: use the word segmentation tool to segment and extract words from the question text. Our project supports `jieba`. 18 | 19 | - Char-tokenization: process the text character by character. 20 | 21 | 2. Filter: filter out the specified stopwords. 22 | 23 | The default stopwords used in this project: `[stopwords] `_ 24 | You can also use your own stopwords. The following examples demonstrate how. 25 | 26 | Examples: 27 | 28 | :: 29 | 30 | >>> text = "三角函数是基本初等函数之一" 31 | >>> tokenize(text, granularity="word") 32 | ['三角函数', '初等', '函数'] 33 | 34 | >>> tokenize(text, granularity="char") 35 | ['三', '角', '函', '数', '基', '初', '函', '数'] 36 | 37 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/vectorization/WithPre-trainedModel.rst: -------------------------------------------------------------------------------- 1 | Use the pre-trained models: call get_pretrained_i2v directly 2 | -------------------------------------------------------------------- 3 | 4 | Use a pre-trained model provided by EduNLP to convert the given question text into vectors. 5 | 6 | * Advantages: Simple and convenient. 7 | 8 | * Disadvantages: Only the models published with the project can be used, which is a significant limitation. 9 | 10 | * Call this function to obtain the corresponding pre-trained model. At present, the following pre-trained models are provided: d2v_all_300, d2v_science_300, d2v_english_300 and d2v_literal_300 (see the sketch below).
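A minimal end-to-end sketch (the model name must be one of the published names above, the save directory is a placeholder, and since D2V models produce item vectors only, the second return value is ignored): :: >>> from EduNLP.I2V import get_pretrained_i2v >>> i2v = get_pretrained_i2v("d2v_all_300", model_dir="./models") # downloads the model on first use >>> item_vector, _ = i2v(["若$x,y$满足约束条件,则$z=x+7 y$的最大值为$\\SIFBlank$"])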
11 | 12 | Selection and use of models 13 | #################################### 14 | 15 | Select the pre-trained model according to the subject: 16 | 17 | +---------------------------+----------------------------------+ 18 | | Pre-training model name   | Subject of model training data   | 19 | +===========================+==================================+ 20 | | d2v_all_300               | all subjects                     | 21 | +---------------------------+----------------------------------+ 22 | | d2v_science_300           | Science                          | 23 | +---------------------------+----------------------------------+ 24 | | d2v_literal_300           | Arts                             | 25 | +---------------------------+----------------------------------+ 26 | | d2v_english_300           | English                          | 27 | +---------------------------+----------------------------------+ 28 | 29 | Processing steps 30 | #################################### 31 | 32 | 1. Download the corresponding pre-trained model; 33 | 34 | 2. Wrap the obtained model in a D2V instance and use it for vectorization. 35 | 36 | 37 | Examples: 38 | 39 | :: 40 | 41 | >>> i2v = get_pretrained_i2v("d2v_science_300") 42 | >>> i2v(item) # item: a question text in SIF format, or a list of them 43 | -------------------------------------------------------------------------------- /docs/source/tutorial/en/vectorization/WithoutPre-trainedModel.rst: -------------------------------------------------------------------------------- 1 | Without the public pre-trained models: load your own model directly 2 | --------------------------------------------------------------------- 3 | 4 | You can use any pre-trained model provided by yourself (just give the storage path of the model) to convert the given question text into vectors. 5 | 6 | * Advantages: flexible; you can use your own model and adjust its parameters freely. 7 | 8 | Processing steps 9 | +++++++++++++++++++++++++++++++++++ 10 | 11 | 1. Call the get_tokenizer function to get the tokenized result; 12 | 13 | 2. Select the vectorization class (e.g. D2V) matching the model type. 14 | 15 | Examples: 16 | 17 | :: 18 | 19 | >>> model_path = "../test_model/d2v/test_gensim_luna_stem_tf_d2v_256.bin" 20 | >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) 21 | >>> i2v(item) # item: a question text in SIF format, or a list of them 22 | -------------------------------------------------------------------------------- /docs/source/tutorial/zh/index.rst: -------------------------------------------------------------------------------- 1 | 入门 2 | ===== 3 | 4 | * `SIF标准格式 `_ 5 | 6 | * `成分分解 `_ 7 | 8 | * `语法解析 `_ 9 | 10 | * `令牌化 `_ 11 | 12 | * `预训练 `_ 13 | 14 | * `向量化 `_ 15 | 16 | * `流水线 `_ 17 | 18 | 主要流程 19 | ---------- 20 | 21 | .. figure:: ../../_static/流程图.png 22 | 23 | * `成分分解 `_ :对符合SIF标准的试题进行分解,识别出题目中不同的成分(如文本、公式、图片等)。 24 | 25 | * `语法解析 `_ :对不同的成分进行个性化解析,包括公式解析、文本解析等,从而服务于后面的令牌化环节。 26 | 27 | * `令牌化 `_:根据成分分解和语法解析的结果,获取试题不同成分的令牌化序列,最终得到试题的多模态令牌序列。 28 | 29 | * `向量化 `_:将令牌序列送入预训练模型,得到试题相应的表征向量。 30 | 31 | * **下游模型**:将预训练模型得到的试题表征应用于各种下游任务(如难度预测、知识点预测、相似题检索等)。 32 | 33 | 示例 34 | -------- 35 | 36 | 为使您快速了解此项目的功能,此部分仅展示常用的函数接口使用方法(如得到令牌化序列、获取向量化表征等),对于其中间函数模块(如parse、segment、tokenize、formula等)以及更细分的接口方法不做展示,如需深入学习,请查看相关部分的文档。 37 | 38 | ..
nbgallery:: 39 | :caption: This is a thumbnail gallery: 40 | :name: start_galler 41 | :glob: 42 | 43 | 令牌化 <../../build/blitz/sif/sif4sci.ipynb> 44 | 45 | 向量化 <../../build/blitz/i2v/get_pretrained_i2v.ipynb> 46 | 47 | 流水线 <../../build/blitz/pipeline/pipeline.ipynb> 48 | -------------------------------------------------------------------------------- /docs/source/tutorial/zh/pipeline.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | 流水线 3 | ======= 4 | 5 | .. nbinfo:: 6 | notebook: 7 | 8 | `流水线 <../../build/blitz/pipeline/pipeline.ipynb>`_ 9 | -------------------------------------------------------------------------------- /docs/source/tutorial/zh/sif.rst: -------------------------------------------------------------------------------- 1 | SIF标准格式 2 | ============================== 3 | 4 | 标准规范 5 | ---------------------------------------- 6 | 7 | version: 0.2 8 | 9 | 为了后续研究和使用的方便,我们需要一个统一的试题语法标准。 10 | 11 | 语法规则 12 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 13 | 14 | 1. 题目文本中只允许出现中文字符、中英文标点和换行符。 15 | 16 | 2. 使用 \$\SIFBlank\$ 替换横线,对于选择题中的括号使用 \$\SIFChoice\$ 替换。 17 | 18 | 3. 图片 ID 以公式的形式嵌入文本中:``$\FigureID{ uuid }$`` 或用 base64 编码表示,特别的,内容为公式的图片用 ``$\FormFigureID{ uuid }$`` 表示。 19 | 20 | 4. 文本标注格式:统一用 ``$\textf{item,CHAR_EN}$`` 表示,目前定义的有:b-加粗,i-斜体,u-下划线,w-下划波浪线,d-加点,t-标题。标注可以混用,按字母顺序排序,例如:$\textf{EduNLP, b}$ 表示 **EduNLP** 21 | 22 | 5. 其余诸如,英文字母、罗马字符、数字等数学符号一律需要使用 latex 格式表示,即嵌在 ``$$`` 之中。 23 | 24 | 6. 分子式的录入标准暂且参考 `INCHI `_ 25 | 26 | 7. 目前对 latex 内部语法没有要求。 27 | 28 | :: 29 | 30 | 1. Item -> CHARACTER|EN_PUN_LIST|CH_PUN_LIST|FORMULA|QUES_MARK 31 | 2. EN_PUN_LIST -> [',', '.', '?', '!', ':', ';', '\'', '\"', '(', ')', ' ','_','/','|','\\','<','>','[',']','-'] 32 | 3. CH_PUN_LIST -> [',', '。', '!', '?', ':',';', '‘', '’', '“', '”', '(', ')', ' ', '、','《','》','—','.'] 33 | 4. FORMULA -> $latex formula$ | $\FormFigureID{UUID}$ | $\FormFigureBase64{BASE64}$ 34 | 5. FIGURE -> $\FigureID{UUID}$ | $\FigureBase64{BASE64}$ 35 | 6. UUID -> [a-zA-Z\-0-9]+ 36 | 7. CHARACTER -> CHAR_EN | CHAR_CH 37 | 8. CHAR_EN -> [a-zA-Z]+ 38 | 9. CHAR_CH -> [\u4e00-\u9fa5]+ 39 | 10. DIGITAL -> [0-9]+ 40 | 11. QUES_MARK -> $\SIFBlank$ | $\SIFChoice$ 41 | 42 | 43 | 注意事项 44 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 45 | 46 | 1. 保留字符与转义 47 | 48 | 2. 数字 49 | 50 | 3. 选空与填空 51 | 52 | 4. 对于单个的数字或字符也需要添加 ``$$`` (目前能实现自动校验) 53 | 54 | 5. latex 公式中尽量不出现中文:(``\text{这里出现中文}``) 55 | 56 | 6. MySql 数据库导入数据时会自动忽略一个 ``\``,所以录入的公式需要进一步处理为 ``\\`` 57 | 58 | 示例 59 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 60 | 61 | 标准形式: 62 | 63 | :: 64 | 65 | 1. 若$x,y$满足约束条件$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,则$z=x+7 y$的最大值$\\SIFUnderline$' 66 | 67 | 2. 已知函数$f(x)=|3 x+1|-2|x|$画出$y=f(x)$的图像求不等式$f(x)>f(x+1)$的解集$\\PictureID{3bf2ddf4-8af1-11eb-b750-b46bfc50aa29}$$\\PictureID{59b8bd14-8af1-11eb-93a5-b46bfc50aa29}$$\\PictureID{63118b3a-8b75-11eb-a5c0-b46bfc50aa29}$$\\PictureID{6a006179-8b76-11eb-b386-b46bfc50aa29}$$\\PictureID{088f15eb-8b7c-11eb-a86f-b46bfc50aa29}$ 68 | 69 | 非标准形式: 70 | 71 | 1. 字母、数字和数学符号连续混合出现: 72 | 73 | 例如: 74 | 75 | ``完成下面的2x2列联表,`` 76 | 77 | ``(单位:m3)`` 78 | 79 | ``则输出的n=`` 80 | 81 | 2. 特殊的数学符号没有用 latex 公式表示: 82 | 83 | 例如: 84 | 85 | ``命题中真命题的序号是 ①`` 86 | 87 | ``AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.若D为AC的中点`` 88 | 89 | 3. 
出现以 unicode 编码写成的字符 90 | 91 | 例如:``则$a$的取值范围是(\u3000\u3000)`` 92 | 93 | 94 | 标准化检验 95 | --------------------- 96 | 97 | 调用库 98 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 99 | :: 100 | 101 | from EduNLP.SIF import is_sif, to_sif 102 | 103 | is_sif 104 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 105 | 106 | 判断题目是否为SIF标准格式 107 | 108 | :: 109 | 110 | >>> text1 = '若$x,y$满足约束条件' 111 | >>> text2 = '$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,' 112 | >>> text3 = '则$z=x+7 y$的最大值$\\SIFUnderline$' 113 | >>> text4 = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' 114 | >>> is_sif(text1) 115 | True 116 | >>> is_sif(text2) 117 | True 118 | >>> is_sif(text3) 119 | True 120 | >>> is_sif(text4) 121 | False 122 | 123 | to_sif 124 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 125 | 126 | 将题目转换为SIF标准格式 127 | 128 | :: 129 | 130 | >>> text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' 131 | >>> to_sif(text) 132 | '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...' 133 | -------------------------------------------------------------------------------- /docs/source/tutorial/zh/tokenize.rst: -------------------------------------------------------------------------------- 1 | 语法解析 2 | ========= 3 | 4 | 在教育资源中,文本、公式都具有内在的隐式或显式的语法结构,提取这种结构对表征学习是大有裨益的: 5 | 6 | * 文本语法结构解析 7 | 8 | * 公式语法结构解析 9 | 10 | 文本语法结构解析 11 | -------------------- 12 | 13 | 根据题目文本切分粒度的大小,文本解析又分为 **“句解析”** 和 **“词解析”**。 14 | 15 | 16 | 句解析(sentence-tokenization) 17 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 18 | 19 | 将较长的文档切分成若干句子的过程称为“分句”。每个句子为一个“令牌”(token)。(待实现) 20 | 21 | 22 | 词解析(text-tokenization) 23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 24 | 25 | 一个句子(不含公式)是由若干“词”按顺序构成的,将一个句子切分为若干词的过程称为“词解析”。根据词的粒度大小,又可细分为“词组解析”和"单字解析"。 26 | 27 | 主要步骤 28 | """"""""""""""""""""""""" 29 | 30 | (1)分词 31 | 32 | - 词组解析:使用分词工具切分并提取题目文本中的词。 33 | 本项目目前支持的分词工具有:`jieba` 34 | - 单字解析:按字符划分。 35 | 36 | (2) 过滤停用词 37 | 38 | - 本项目默认使用的停用词表:`stopwords `_ 39 | - 你也可以使用自己的停用词表,具体使用方法见下面的示例。 40 | 41 | 42 | 示例 43 | """"""""""""""""""""""""" 44 | 45 | 导入模块 46 | 47 | :: 48 | 49 | from EduNLP.SIF.tokenization.text import tokenize 50 | 51 | 52 | 输入 53 | 54 | :: 55 | 56 | text = "三角函数是基本初等函数之一" 57 | 58 | 59 | 词组解析 60 | 61 | :: 62 | 63 | # 输出:默认使用 EduNLP 项目提供的停用词表 64 | >>> tokenize(text, granularity="word") 65 | ['三角函数', '初等', '函数'] 66 | 67 | 68 | 单字解析 69 | 70 | :: 71 | 72 | # 输出:默认使用 EduNLP 项目提供的停用词表 73 | >>> tokenize(text, granularity="char") 74 | ['三', '角', '函', '数', '基', '初', '函', '数'] 75 | 76 | 77 | 使用自己的停用词表 78 | 79 | :: 80 | 81 | >>> spath = "test_stopwords.txt" 82 | >>> from EduNLP.SIF.tokenization.text.stopwords import get_stopwords 83 | >>> stopwords = get_stopwords(spath) 84 | >>> stopwords 85 | {'一旦', '一时', '一来', '一样', '一次', '一片', '一番', '一直', '一致'} 86 | >>> tokenize(text, granularity="word", stopwords=stopwords) 87 | ['三角函数', '是', '基本', '初等', '函数', '之一'] 88 | 89 | 90 | 公式语法结构解析 91 | -------------------- 92 | 93 | 公式解析(formula-tokenization):理科类文本中常常含有公式。将一个符合 latex 语法的公式解析为标记字符列表或抽象语法树的过程称为“公式解析”。 94 | 95 | 包括两种方案 96 | 97 | - 公式线性解析 98 | - 公式AST解析 99 | 100 | .. 
note:: 101 | 102 | 本小节主要介绍如何获取不同格式的公式解析结果。公式解析的底层实现请参考:`EduNLP.Formula` 部分。 103 | 104 | 105 | (1)公式线性解析 106 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 107 | 108 | 如果您想按 latex 语法标记拆分公式的各个部分,并得到顺序序列结果,输出方法可以选择:`linear` 109 | 110 | :: 111 | >>> tokenize(formula, method="linear") 112 | ['\\frac', '{', '\\pi', '}', '{', 'x', '+', 'y', '}', '+', '1', '=', 'x'] 113 | 114 | 115 | (2) 公式AST解析 116 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 117 | 118 | 如果您想得到公式解析出的语法分析树序列,输出方法可以选择:`ast` 119 | 120 | > 抽象语法分析树,简称语法树(Syntax tree),是源代码语法结构的一种抽象表示。它以树状的形式表现编程语言的语法结构,树上的每个节点都表示源代码中的一种结构。 121 | > 因此,ast 可以看做是公式的语法结构表征。 122 | 123 | :: 124 | >>> tokenize(formula, method="ast", return_type="list", ord2token=False) 125 | ['\\pi', '{ }', 'x', '+', 'y', '{ }', '\\frac', '+', '1', '=', 'x'] 126 | 127 | 128 | (3)公式AST解析+变量符号化 129 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 130 | 131 | 如果您只是关心公式的结构和类型,并不关心变量具体是什么,比如二元二次方程 `x^2 + y = 1` ,它从公式结构和类型上来说,和 `w^2 + z = 1` 没有区别。 132 | 此时,您可以设置如下参数:`ord2token = True`,将公式变量名转换成 token 133 | 134 | :: 135 | >>> tokenize(formula, method="ast", return_type="list", ord2token=True) 136 | ['mathord', '{ }', 'mathord', '+', 'mathord', '{ }', '\\frac', '+', 'textord', '=', 'mathord'] 137 | 138 | 139 | (4) 公式AST解析+变量标准化 140 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 141 | 142 | 如果您除了 (3) 中提供的功能之外,还需要区分不同的变量。此时可以另外设置参数:`var_numbering=True` 143 | 144 | :: 145 | >>> tokenize(formula, method="ast", return_type="list", ord2token=True, var_numbering=True) 146 | ['mathord_con', '{ }', 'mathord_0', '+', 'mathord_1', '{ }', '\\frac', '+', 'textord', '=', 'mathord_0'] 147 | 148 | -------------------------------------------------------------------------------- /examples/downstream/difficulty_prediction/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | 4 | 5 | def load_json(open_path): 6 | print("[load_json] start : {}".format(open_path)) 7 | with open(open_path, "r", encoding='utf-8') as f: 8 | load_q = json.load(f) 9 | print("[load_json] num = {}, open_path = {}".format(len(load_q), open_path)) 10 | return load_q 11 | 12 | 13 | def get_train(train): 14 | train_data = [] 15 | for item in train: 16 | dic = {} 17 | dic["content"] = item["content"] 18 | dic["labels"] = float(item["difficulty"]) 19 | train_data.append(dic) 20 | return train_data 21 | 22 | 23 | def get_val(val): 24 | test_data, test_gap = [], [] 25 | start, end = 0, 0 26 | for batch in val: 27 | end += len(batch['questions']) 28 | for item in batch['questions']: 29 | dic = {} 30 | dic['content'] = item["stem"] 31 | dic['labels'] = item['diff'] 32 | # dic["labels"] = dic.pop("difficulty") 33 | test_data.append(dic) 34 | test_gap.append([start, end]) 35 | start = end 36 | return test_data, test_gap 37 | -------------------------------------------------------------------------------- /examples/downstream/discrimination_prediction/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | 4 | 5 | def pre_disc(csv_path): 6 | items = pd.read_csv(csv_path) 7 | stem = items["stem"].tolist() 8 | disc = items["disc"].tolist() 9 | data = [] 10 | for i in range(len(stem)): 11 | dic = {} 12 | dic["content"] = stem[i] 13 | dic["labels"] = disc[i] 14 | data.append(dic) 15 | return data 16 | -------------------------------------------------------------------------------- /examples/downstream/paper_segmentation/samples/train/math/paper_1.txt: 
-------------------------------------------------------------------------------- 1 | ================= 2 | 2017年云南省临沧市临翔区民族中学高考数学三模试卷(文科) 3 | 选择题 4 | ================= 5 | 1. 已知集合,,则 \ ( ) $ 6 | A. B. C. D. 7 | ================= 8 | 2. 已知复数,则复数的模为 \ ( ) $ 9 | A. B. C. D. 10 | ================= 11 | 3. 已知点,,向量,若,则为 \ ( ) $ 12 | A. B. C. D. 13 | ================= 14 | 4. 已知函数满足,且当时,成立,若,,,则,,的大小关系是 \ ( ) $ 15 | A. B. C. D. 16 | ================= 17 | 5.如图的程序框图的算法思路源于数学名著几何原本中的“辗转相除法”,执行该程序框图图中“”表示除以的余数\ ( ) ab485270b=( 18 | A. B. C. D. 19 | ================= 20 | 6. 某三棱锥的三视图如图所示,则该三棱锥的表面积为 \ ( ) $ 21 | A. B. C. D. 22 | ================= 23 | 7. 曲线在点处的切线与轴、轴围成的封闭图形的面积为 \ ( ) $ 24 | A. B. C. D. 25 | ================= 26 | 8. 已知,则 \ ( ) $ 27 | A. B. C. D. 28 | ================= 29 | 9. 下列说法正确的个数是 \ ( ) $ 30 | 若为奇函数,则; 31 | “在中,若,则”的逆命题是假命题; 32 | “三个数,,成等比数列”是“”的既不充分也不必要条件; 33 | 命题“,”的否定是“,”. 34 | A. B. C. D. 35 | ================= 36 | 10. 将函数的图象向右平移个单位后得到的图象的一个对称轴是 \ ( ) $ 37 | A. B. C. D. 38 | ================= 39 | 11. 已知等差数列的公差,且,,成等比数列,若,是数列的前项和,则的最小值为 \ ( ) $ 40 | A. B. C. D. 41 | ================= 42 | 12. 已知焦点为的抛物线上有一点,以为圆心,为半径的圆被轴截得的弦长为,则 \ ( ) $ 43 | A. B. C. D. 44 | ================= 45 | 填空题 46 | ================= 47 | 13. 点是不等式组表示的平面区域内的一动点,且不等式恒成立,则的取值范围是 _____ . 48 | ================= 49 | 14. 已知的内角,,所对的边分别为,,,且,,,则的值为 _____ . 50 | ================= 51 | 15. 已知正四面体的棱长为,为棱的中点,过作其外接球的截面,则截面面积的最小值为 _____ . 52 | ================= 53 | 16. 设函数的图象与的图象关于直线对称,且,则 _____ . 54 | ================= 55 | 简答题 56 | ================= 57 | 17. 已知数列的前项和 58 | Ⅰ求数列的通项公式; 59 | Ⅱ若,求数列的前项和. 60 | ================= 61 | 18. 62 | ================= 63 | 19. 如图,在直角梯形中,,,,是中点,将沿折起,使得面; 64 | Ⅰ求证:平面平面; 65 | Ⅱ若是的中点求三棱锥的体积. 66 | ================= 67 | 20. 已知椭圆:的离心率为,过的左焦点的直线:,直线被圆:截得的弦长为. 68 | Ⅰ求椭圆的方程; 69 | Ⅱ设的右焦点为,在圆上是否存在点,满足,若存在,指出有几个这样的点不必求出点的坐标;若不存在,说明理由. 70 | ================= 71 | 21. 已知函数为常数 72 | 当时,求函数的单调区间; 73 | 求时,不等式恒成立,求实数的取值范围. 74 | ================= 75 | 22. 在直角坐标系中,曲线为参数,,其中,在以为极点,轴正半轴为极轴的极坐标系中,曲线:,曲线. 76 | Ⅰ求与交点的直角坐标系; 77 | Ⅱ若与相交于点,与相交于点,求的最大值. 78 | ================= 79 | 23. 设函数 80 | Ⅰ解不等式; 81 | Ⅱ当时,,求实数的取值范围. 
82 | ================= 83 | -------------------------------------------------------------------------------- /examples/downstream/paper_segmentation/utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import os 4 | import logging 5 | from datetime import datetime 6 | 7 | ROOT_DIR = os.path.dirname(os.path.dirname(__file__)) 8 | 9 | def get_logger(logfile): 10 | os.makedirs(os.path.dirname(logfile), exist_ok=True) 11 | 12 | logger = logging.getLogger(name="QuesQuality") 13 | logger.setLevel(logging.INFO) 14 | 15 | handler = logging.FileHandler(filename=logfile, encoding="utf-8", mode="w") 16 | handler.setLevel(logging.INFO) 17 | formatter = logging.Formatter(fmt="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") 18 | handler.setFormatter(formatter) 19 | 20 | consolehandler = logging.StreamHandler() 21 | consolehandler.setFormatter(formatter) 22 | 23 | logger.addHandler(handler) 24 | logger.addHandler(consolehandler) # log to file and print to console 25 | return logger 26 | 27 | 28 | def get_pk(y_pred, y, k): 29 | tag_num = len(y) 30 | count = 0 31 | for i in range(0, tag_num-k): 32 | seg_count_y_pred = 0 33 | seg_count_y = 0 34 | for j in range(i, i+k): 35 | seg_count_y_pred += y_pred[j] 36 | seg_count_y += y[j] 37 | if seg_count_y_pred != seg_count_y: 38 | count += 1 39 | return count 40 | -------------------------------------------------------------------------------- /examples/formula/formula.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Formula\r\n", 7 | "\r\n", 8 | "## 概述\r\n", 9 | "\r\n", 10 | "Formula 首先在分词功能中对原始文本的公式做切分处理,另外提供多种功能使之能够适应多种用户需求,例如 [公式解析树] 功能,可以将数学公式的抽象语法分析树用文本或图片的形式表示出来;又如[公式变量标准化]的功能,能判断几个子公式内的‘x’为同一变量。\r\n", 11 | "\r\n", 12 | "由于本部分常作为中间模块,故仅展示基本调用方法,如需更进一步学习模块相关参数请参见对应文档。" 13 | ], 14 | "metadata": {} 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "source": [ 20 | "import matplotlib.pyplot as plt\r\n", 21 | "from EduNLP.Formula import Formula\r\n", 22 | "from EduNLP.Formula import FormulaGroup\r\n", 23 | "from EduNLP.Formula.viz import ForestPlotter" 24 | ], 25 | "outputs": [], 26 | "metadata": {} 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "source": [ 31 | "## 公式语法结构分析\n", 32 | "\n", 33 | "### 初始化实例\n", 34 | "\n", 35 | "- item 类型:`str or List[Dict]` \n", 36 | "- item 内容:latex 公式 或 公式经解析后产生的抽象语法分析树(abstracted syntax tree)" 37 | ], 38 | "metadata": {} 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "source": [ 44 | "f = Formula(\"x^2 + x+1 = y\")\r\n", 45 | "f " 46 | ], 47 | "outputs": [ 48 | { 49 | "output_type": "execute_result", 50 | "data": { 51 | "text/plain": [ 52 | "" 53 | ] 54 | }, 55 | "metadata": {}, 56 | "execution_count": 2 57 | } 58 | ], 59 | "metadata": { 60 | "collapsed": true 61 | } 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "source": [ 66 | "## 方程组结构解析\n", 67 | "\n", 68 | "调用 `FormulaGroup` 类解析公式方程组,相关的属性和函数方法同上。" 69 | ], 70 | "metadata": {} 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 21, 75 | "source": [ 76 | "fs = FormulaGroup([\r\n", 77 | " \"x^2 = y\",\r\n", 78 | " \"x^3 = y^2\",\r\n", 79 | " \"x + y = \\pi\"\r\n", 80 | "])\r\n", 81 | "fs" 82 | ], 83 | "outputs": [ 84 | { 85 | "output_type": "execute_result", 86 | "data": { 87 | "text/plain": [ 88 | ";;>" 89 | ] 90 | }, 91 | "metadata": {}, 92 | "execution_count": 21 93 | } 94 | ], 95 | "metadata": { 96 | "collapsed": 
false, 97 | "pycharm": { 98 | "name": "#%%\n" 99 | } 100 | } 101 | } 102 | ], 103 | "metadata": { 104 | "kernelspec": { 105 | "name": "python3", 106 | "display_name": "Python 3.8.5 64-bit" 107 | }, 108 | "language_info": { 109 | "name": "python", 110 | "version": "3.8.5", 111 | "mimetype": "text/x-python", 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "pygments_lexer": "ipython3", 117 | "nbconvert_exporter": "python", 118 | "file_extension": ".py" 119 | }, 120 | "interpreter": { 121 | "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 2 126 | } -------------------------------------------------------------------------------- /examples/formula/formula.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/3/8 @ tongshiwei 3 | # 4 | from EduNLP.Formula import Formula, FormulaGroup, link_formulas 5 | # 6 | # f1 = Formula(r"x + y", variable_standardization=True) 7 | # f2 = Formula(r"y + x", variable_standardization=True) 8 | # f3 = Formula(r"z + y", variable_standardization=True) 9 | # 10 | # print(f1.element) 11 | # print(f2.element) 12 | # print(f3.element) 13 | # 14 | # print("-----------------------") 15 | # 16 | # link_formulas(f1, f2, f3) 17 | # 18 | # print("------------------------") 19 | # 20 | # print(f1.element) 21 | # print(f2.element) 22 | # print(f3.element) 23 | # 24 | # print("---------------------") 25 | # 26 | # fg = FormulaGroup( 27 | # [r"x + y", r"y + x", r"y + z"] 28 | # ) 29 | # for f in fg: 30 | # print(f.element) 31 | 32 | # fg = FormulaGroup(["x", "y", "x"]) 33 | # print(fg.elements) 34 | 35 | fg = FormulaGroup(["x", Formula("y"), "x"]) 36 | print(fg.elements) -------------------------------------------------------------------------------- /examples/formula/tree.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "source": [ 7 | "import networkx as nx\n", 8 | "\n", 9 | "g = nx.DiGraph()\n", 10 | "g.add_node(0, value=1, id=0)\n", 11 | "g.add_node(1, value=2, id=1)\n", 12 | "g.add_node(2, id=2)\n", 13 | "g.add_edge(0, 1)\n", 14 | "g.add_edge(0, 2)\n", 15 | "g.nodes[0]" 16 | ], 17 | "outputs": [ 18 | { 19 | "output_type": "execute_result", 20 | "data": { 21 | "text/plain": [ 22 | "{'value': 1, 'id': 0}" 23 | ] 24 | }, 25 | "metadata": {}, 26 | "execution_count": 1 27 | } 28 | ], 29 | "metadata": { 30 | "collapsed": true 31 | } 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "name": "python3", 37 | "display_name": "Python 3.8.5 64-bit" 38 | }, 39 | "language_info": { 40 | "name": "python", 41 | "version": "3.8.5", 42 | "mimetype": "text/x-python", 43 | "codemirror_mode": { 44 | "name": "ipython", 45 | "version": 3 46 | }, 47 | "pygments_lexer": "ipython3", 48 | "nbconvert_exporter": "python", 49 | "file_extension": ".py" 50 | }, 51 | "interpreter": { 52 | "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" 53 | } 54 | }, 55 | "nbformat": 4, 56 | "nbformat_minor": 2 57 | } -------------------------------------------------------------------------------- /examples/i2v/i2v.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# I2V 向量化容器\n", 8 | "\n", 9 | "向量化过程是将原始题目(item)转成向量(vector)的过程,它包括两个步骤:\n", 10 | "- 使用 
`Tokenizer` 将原始题目(item)转化为令牌化序列(tokens);\n", 11 | "- 使用 `T2V` 向量化容器 将令牌化序列(tokens)转成向量(vector)。\n", 12 | "\n", 13 | "为了使用户能直接使用本地的(或公开的)预训练模型,我们提供了`I2V向量化容器`, 将令牌化、向量化操作同时封装起来。" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## 概述\n", 21 | "\n", 22 | "使用EduNLP的开源预训练模型将给定的题目转成向量。\n", 23 | "\n", 24 | "- 优点:用户不需要研究令牌化和模型加载的细节。令牌化和向量化的参数已由预训练模型的参数文件定义好。\n", 25 | "- 缺点:不适合修改预训练的模型参数或令牌化容器参数" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import os\n", 35 | "\n", 36 | "items = [\n", 37 | " r\"题目一:如图几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\n", 38 | " r\"题目二: 如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\n", 39 | "]\n", 40 | "\n", 41 | "model_dir = \"../test_model\"" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "# 示例:使用 W2V 加载本地模型\n", 49 | "## W2V" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "name": "stderr", 59 | "output_type": "stream", 60 | "text": [ 61 | "E:\\dev_env\\anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", 62 | " warnings.warn(msg)\n" 63 | ] 64 | }, 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "2 256\n", 70 | "2 56 256\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "from EduNLP.I2V import W2V\n", 76 | "\n", 77 | "pretrained_path = os.path.join(model_dir, \"w2v/w2v_test_256/w2v_test_256.kv\")\n", 78 | "i2v = W2V(\"pure_text\", \"w2v\", pretrained_path)\n", 79 | "\n", 80 | "item_vector, token_vector = i2v(items)\n", 81 | "# or\n", 82 | "item_vector, token_vector = i2v.infer_vector(items)\n", 83 | "# or\n", 84 | "item_vector = i2v.infer_item_vector(items)\n", 85 | "token_vector = i2v.infer_token_vector(items)\n", 86 | "\n", 87 | "print(len(item_vector), len(item_vector[0])) \n", 88 | "print(len(token_vector), len(token_vector[0]), len(token_vector[0][0]))" 89 | ] 90 | } 91 | ], 92 | "metadata": { 93 | "interpreter": { 94 | "hash": "776957673adb719a00031a24ed5efd2fa5ce8a13405e5193f8d278edd3805d55" 95 | }, 96 | "kernelspec": { 97 | "display_name": "Python 3.6.13 64-bit ('data': conda)", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.6.2" 111 | }, 112 | "orig_nbformat": 4 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 2 116 | } 117 | -------------------------------------------------------------------------------- /examples/i2v/i2v_d2v.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 使用 D2V 向量化容器\n", 8 | "## 
导入功能块" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [ 16 | { 17 | "name": "stderr", 18 | "output_type": "stream", 19 | "text": [ 20 | "d:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", 21 | " warnings.warn(msg)\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "from EduNLP.I2V import I2V, D2V, get_pretrained_i2v" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "items = [\n", 36 | " r\"题目一:如图几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\n", 37 | " r\"题目二: 如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\"\n", 38 | "]" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## 向量化\n", 46 | "### 使用EduNLP中公开的预训练模型\n", 47 | "> - D2V没有实现token向量化,只能获得 item(题目)的表征" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stderr", 57 | "output_type": "stream", 58 | "text": [ 59 | "EduNLP, INFO Use pretrained t2v model d2v_test_256\n", 60 | "downloader, INFO http://base.ustc.edu.cn/data/model_zoo/modelhub/doc2vec_pub/1/d2v_test_256.zip is saved as ..\\test_model\\d2v\\d2v_test_256.zip\n", 61 | "downloader, INFO file existed, skipped\n" 62 | ] 63 | }, 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "2 256\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "save_dir = \"../test_model/d2v\"\n", 74 | "i2v = get_pretrained_i2v(\"d2v_test_256\", model_dir=save_dir)\n", 75 | "\n", 76 | "item_vector, _ = i2v.infer_vector(items)\n", 77 | "# or\n", 78 | "item_vector = i2v.infer_item_vector(items)\n", 79 | "\n", 80 | "print(len(item_vector), len(item_vector[0])) " 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "### 使用本地模型" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "2 256\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "pretrained_path = \"../test_model/d2v/d2v_test_256/d2v_test_256.bin\"\n", 105 | "i2v = D2V(\"pure_text\", \"d2v\", pretrained_path)\n", 106 | "\n", 107 | "item_vector, _ = i2v(items)\n", 108 | "# or\n", 109 | "item_vector, _ = i2v.infer_vector(items)\n", 110 | "# or\n", 111 | "item_vector = i2v.infer_item_vector(items)\n", 112 | "\n", 113 | "print(len(item_vector), len(item_vector[0])) " 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "interpreter": { 119 | "hash": "2a09bcfc86f5d80d5adfb774779878f28f4d48d5a6d6c0020bcfd8afaf909ec6" 120 | }, 121 | "kernelspec": { 122 | "display_name": "Python 3.6.13 ('data')", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": 
"text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.6.13" 137 | }, 138 | "orig_nbformat": 4 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 2 142 | } 143 | -------------------------------------------------------------------------------- /examples/pretrain/prepare_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Download Data by EduData" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "downloader, INFO http://base.ustc.edu.cn/data/OpenLUNA/OpenLUNA.json is saved as ..\\..\\data\\OpenLUNA.json\n" 20 | ] 21 | }, 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "Downloading ..\\..\\data\\OpenLUNA.json 100.00%: 269KB | 269KB\n" 27 | ] 28 | }, 29 | { 30 | "data": { 31 | "text/plain": [ 32 | "'..\\\\..\\\\data\\\\OpenLUNA.json'" 33 | ] 34 | }, 35 | "execution_count": 1, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "from EduData import get_data\n", 42 | "\n", 43 | "get_data(\"open-luna\", \"../../data/\")" 44 | ] 45 | } 46 | ], 47 | "metadata": { 48 | "kernelspec": { 49 | "display_name": "Python 3.10.4 64-bit", 50 | "language": "python", 51 | "name": "python3" 52 | }, 53 | "language_info": { 54 | "codemirror_mode": { 55 | "name": "ipython", 56 | "version": 3 57 | }, 58 | "file_extension": ".py", 59 | "mimetype": "text/x-python", 60 | "name": "python", 61 | "nbconvert_exporter": "python", 62 | "pygments_lexer": "ipython3", 63 | "version": "3.10.4" 64 | }, 65 | "vscode": { 66 | "interpreter": { 67 | "hash": "2469a70536e4d2335a2ea8907942d0699c37342a371ac185bdb5b0aa6f073890" 68 | } 69 | } 70 | }, 71 | "nbformat": 4, 72 | "nbformat_minor": 2 73 | } 74 | -------------------------------------------------------------------------------- /examples/pretrain/rnn/rnn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/3 @ tongshiwei 3 | 4 | from longling import load_jsonl 5 | from EduNLP.Tokenizer import get_tokenizer 6 | from EduNLP.Pretrain import train_vector 7 | from EduNLP.Vector import W2V, RNNModel 8 | 9 | 10 | def etl(): 11 | tokenizer = get_tokenizer("pure_text") 12 | return tokenizer([item["stem"] for item in load_jsonl("../../../data/OpenLUNA.json")]) 13 | 14 | 15 | items = list(etl()) 16 | model_path = train_vector(items, "./w2v", 10, "sg") 17 | 18 | w2v = W2V(model_path, "sg") 19 | rnn = RNNModel("lstm", w2v, 5, device="cpu") 20 | saved_params = rnn.save("./lstm.params", save_embedding=True) 21 | 22 | rnn1 = RNNModel("lstm", w2v, 5, model_params=saved_params) 23 | -------------------------------------------------------------------------------- /examples/sif/item.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/examples/sif/item.json -------------------------------------------------------------------------------- /examples/sif/parse/parse.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# parse\n", 8 | "\n", 9 | 
"主要功能为将文本中的字母、数字等进行提取,将其转换为标准格式。\n", 10 | "\n", 11 | "\n", 12 | "## 概述\n", 13 | "\n", 14 | "1、将选择题中的括号,填空题中的下划线用特殊标识替换掉,并将字符、公式用$$包裹起来,使item能通过$符号准确的按照类型切割开;\n", 15 | "\n", 16 | "2、判断当前item是否合法,并报出错误类型。\n", 17 | "\n", 18 | "## 具体处理内容\n", 19 | "\n", 20 | "1.匹配公式之外的英文字母、数字,只对两个汉字之间的字母、数字做修正,其余匹配到的情况视为不合 latex 语法录入的公式\n", 21 | "\n", 22 | "2.匹配“( )”型括号(包含英文格式和中文格式),即括号内无内容或为空格的括号,将括号替换 ``$\\\\SIFChoice$`` \n", 23 | "\n", 24 | "3.匹配下划线,替换连续的下划线或下划线中夹杂空格的情况,将其替换为 ``$\\\\SIFBlank$`` \n", 25 | "\n", 26 | "4.匹配latex公式,主要检查latex公式的完整性和可解析性,对latex 中出现中文字符发出警告" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## 导入类" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from EduNLP.Formula.ast import str2ast, katex_parse\n", 43 | "from EduNLP.SIF.parser import Parser" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## 输入\n", 51 | "\n", 52 | "类型:str \n", 53 | "内容:题目文本 (text)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "text1 = '生产某种零件的A工厂25名工人的日加工零件数_ _'\n", 63 | "text2 = 'X的分布列为( )'\n", 64 | "text3 = '① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D'\n", 65 | "text4 = '支持公式如$\\\\frac{y}{x}$,$\\\\SIFBlank$,$\\\\FigureID{1}$,不支持公式如$\\\\frac{ \\\\dddot y}{x}$'" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## 输出" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### 尝试转换为标准形式" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 6, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "text_parser1.text: 生产某种零件的$A$工厂$25$名工人的日加工零件数$\\SIFBlank$\n", 92 | "text_parser2.text: $X$的分布列为$\\SIFChoice$\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "text_parser1 = Parser(text1)\n", 98 | "text_parser1.description_list()\n", 99 | "print('text_parser1.text:',text_parser1.text)\n", 100 | "\n", 101 | "\n", 102 | "text_parser2 = Parser(text2)\n", 103 | "text_parser2.description_list()\n", 104 | "print('text_parser2.text:',text_parser2.text)\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### 判断是否有语法问题" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 7, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "text_parser3.error_flag: 1\n", 124 | "text_parser4.fomula_illegal_flag: 1\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "text_parser3 = Parser(text3)\n", 130 | "text_parser3.description_list()\n", 131 | "print('text_parser3.error_flag: ',text_parser3.error_flag)\n", 132 | "\n", 133 | "\n", 134 | "text_parser4 = Parser(text4)\n", 135 | "text_parser4.description_list()\n", 136 | "print('text_parser4.fomula_illegal_flag: ',text_parser4.fomula_illegal_flag)\n" 137 | ] 138 | } 139 | ], 140 | "metadata": { 141 | "interpreter": { 142 | "hash": "6f23ddf1f0697a8f0c43dd2435bdb82528077c79e9967f824fba6a3b52b05faf" 143 | }, 144 | "kernelspec": { 145 | "display_name": "Python 3.6.3 64-bit", 146 | "name": "python3" 147 | }, 148 | "language_info": { 149 | "codemirror_mode": { 150 | "name": "ipython", 151 | "version": 3 152 | }, 153 | "file_extension": ".py", 154 | "mimetype": "text/x-python", 155 | "name": "python", 156 | 
"nbconvert_exporter": "python", 157 | "pygments_lexer": "ipython3", 158 | "version": "3.6.3" 159 | }, 160 | "orig_nbformat": 4 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 2 164 | } 165 | -------------------------------------------------------------------------------- /examples/sif/sci4sif.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/18 @ tongshiwei 3 | 4 | from EduNLP.SIF import sif4sci, link_formulas 5 | 6 | # item = r"若集合$A=\{x \in R | |x - 2| \leq 5\}$,则$A$中最小整数位是$\SIFChoice$" 7 | # print(item) 8 | # print(sif4sci(item, symbol="fgm", tokenization=False)) 9 | # print(sif4sci(item, symbol="fgm", tokenization=True)) 10 | # print(sif4sci(item, symbol="t")) 11 | # print(sif4sci(item, symbol="fgm", tokenization=False)) 12 | # print(sif4sci(item, symbol="fgm")) 13 | # print(sif4sci(item, symbol="gm", tokenization_params={"formula_params": {"method": "ast"}})) 14 | # print(sif4sci(item, symbol="gm", tokenization_params={"formula_params": {"method": "linear"}})) 15 | # print(sif4sci(item, tokenization_params={"formula_params": {"method": "ast", "ord2token": True}})) 16 | # print( 17 | # sif4sci(item, tokenization_params={"formula_params": {"method": "ast", "ord2token": True, "var_numbering": True}})) 18 | # print(sif4sci(item, tokenization_params={"formula_params": {"method": "ast", "return_type": "list"}})) 19 | # print( 20 | # sif4sci(item, tokenization_params={"formula_params": {"method": "ast", "ord2token": True, "return_type": "list"}}).formula_tokens 21 | # ) 22 | # print( 23 | # sif4sci(item, tokenization_params={ 24 | # "formula_params": {"method": "ast", "ord2token": True, "var_numbering": True, "return_type": "list"}}) 25 | # ) 26 | # print(sif4sci(item, tokenization_params={"formula_params": {"method": "ast", "return_type": "ast"}})) 27 | # print(sif4sci(item, tokenization_params={"formula_params": {"method": "ast", "return_type": "formula"}})) 28 | 29 | # e = r"$x$ 是 $y$ 那么 $y$ 和 $z$ 是什么 $x,y,z$" 30 | # print(sif4sci(e, symbol="gm", 31 | # tokenization_params={ 32 | # "formula_params": { 33 | # "method": "ast", "return_type": "list", "ord2token": True, "var_numbering": True, 34 | # } 35 | # })) 36 | # 37 | # test_item_1 = [r"$x < y$", r"$y = x$", r"$y < x$"] 38 | # tls = [ 39 | # sif4sci(e, symbol="gm", 40 | # tokenization_params={ 41 | # "formula_params": { 42 | # "method": "ast", "return_type": "list", "ord2token": True, "var_numbering": True, 43 | # } 44 | # }) 45 | # for e in test_item_1 46 | # ] 47 | # link_formulas(*tls) 48 | # print(tls) 49 | # seg = sif4sci(e, tokenization=False) 50 | # with seg.filter(keep="t"): 51 | # print(seg) 52 | # e = r'某校一个课外学习小组为研究某作物的发芽率y和温度x(单位:$^{\circ} \mathrm{C}$)的关系,在20个不同温度条件下进行种子发芽实验,由实验数据$\left(x_{i}, y_{i}\right)(i=1,2, \cdots, 20)$得到下面的散点图:由此散点图,在10$^{\circ} \mathrm{C}$至40$^{\circ} \mathrm{C}$之间,下面四个回归方程类型中最适宜作为发芽率y和温度x的回归方程类型的是$\FigureID{3bf20b91-8af1-11eb-86ff-b46bfc50aa29}$$\FigureID{59b851d3-8af1-11eb-bd45-b46bfc50aa29}$$\FigureID{6310d375-8b75-11eb-bf70-b46bfc50aa29}$$\FigureID{6a006175-8b76-11eb-aa57-b46bfc50aa29}$$\FigureID{088f15e7-8b7c-11eb-a8aa-b46bfc50aa29}$' 53 | # # e = r"$x$ 是 $y$ 那么 $y$ 和 $z$ 是什么 $x,y,z$" 54 | 55 | # e = r'已知集合$A=\left\{x \mid x^{2}-3 x-4<0\right\}, \quad B=\{-4,1,3,5\}, \quad$ 则 $A \cap B=$' 56 | 57 | from EduNLP.utils import dict2str4sif 58 | 59 | test_item_1 = { 60 | "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", 61 | "options": ['\\{-4,1\\}', '\\{1,5\\}', 
'\\{3,5\\}', '\\{1,3\\}'], 62 | } 63 | e = dict2str4sif(test_item_1, tag_mode="head", add_list_no_tag=False) 64 | seg = sif4sci( 65 | e, 66 | symbol="tfgmas", 67 | tokenization_params={ 68 | "formula_params": { 69 | "method": "ast", "return_type": "list", "ord2token": True 70 | } 71 | }, 72 | errors="raise" 73 | ) 74 | print(seg.tokens) 75 | # print(seg.get_segments()) 76 | # 77 | # import json 78 | # from tqdm import tqdm 79 | # 80 | # 81 | # def load_items(): 82 | # with open("../../data/OpenLUNA.json", encoding="utf-8") as f: 83 | # for line in f: 84 | # yield json.loads(line) 85 | # 86 | # 87 | # from EduNLP.SIF import sif4sci 88 | # 89 | # sif_items = [] 90 | # for i, item in tqdm(enumerate(load_items()), "sifing"): 91 | # if i > 100: 92 | # break 93 | # sif_item = sif4sci( 94 | # item["stem"], 95 | # symbol="gm", 96 | # tokenization_params={"formula_params": { 97 | # "method": "ast", 98 | # "return_type": "list", 99 | # "ord2token": True, 100 | # }} 101 | # ) 102 | # if sif_item: 103 | # sif_items.append(sif_item.tokens) 104 | -------------------------------------------------------------------------------- /examples/test_model/w2v/gensim_luna_stem_t_sg_100.kv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/examples/test_model/w2v/gensim_luna_stem_t_sg_100.kv -------------------------------------------------------------------------------- /examples/tokenizer/test_stopwords.txt: -------------------------------------------------------------------------------- 1 | 一旦 2 | 一时 3 | 一来 4 | 一样 5 | 一次 6 | 一片 7 | 一番 8 | 一直 9 | 一致 -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | # For pytest usage, refer to https://hb4dsai.readthedocs.io/zh/latest/Architecture/Test.html 3 | norecursedirs = docs *build* trash dev examples EduNLP/Formula/viz EduNLP/Formula/ast scripts data 4 | 5 | # Deal with marker warnings 6 | markers = 7 | flake8: flake8 8 | 9 | # Enable line length testing with maximum line length of 120 10 | flake8-max-line-length = 120 11 | 12 | # Ignore module level import not at top of file (E402) 13 | # Others can be found in https://flake8.pycqa.org/en/latest/user/error-codes.html 14 | flake8-ignore = E402 F401 F403 15 | 16 | # --doctest-modules is used for unittest 17 | addopts = --doctest-modules --cov --cov-report=term-missing --flake8 18 | -------------------------------------------------------------------------------- /scripts/extlib/katex2python.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/30 @ tongshiwei 3 | from pathlib import PurePath 4 | from fire import Fire 5 | import requests 6 | import js2py 7 | import tempfile 8 | 9 | 10 | def get_katex_from_url(version, tar): 11 | katex_version = version 12 | url = "https://cdn.jsdelivr.net/npm/katex@{}/dist/katex.js".format(katex_version) 13 | ret = requests.get(url, allow_redirects=True) 14 | assert ret.status_code == 200, ret.status_code 15 | content = ret.content # reuse the response body instead of downloading the file twice 16 | tar.write(content) 17 | return url 18 | 19 | 20 | def update_katex_py(src=None, tar="katex.py"): 21 | ''' 22 | Notes 23 | ---------- 24 | Because some formulas cannot be parsed by katex.py due to js2py errors, 25 | a few lines have to be commented out manually after katex.py is built, 26 | e.g. 1. Array.fill() error : 27 | # var.get('res').put('cols', var.get('Array').create(var.get('numCols')).callprop('fill', Js({'type':Js('align'),'align':var.get('colAlign')}))) 28 | ''' 29 | src = "katex.js" if src is None else src 30 | if PurePath(src).suffix == ".js": 31 | print("%s -> %s" % (src, tar)) 32 | js2py.translate_file(src, tar) # translate the given .js file, not a hard-coded name 33 | else: 34 | with tempfile.NamedTemporaryFile() as tmp_tar: 35 | print("katex version: %s" % src) 36 | url = get_katex_from_url(src, tmp_tar) 37 | src = tmp_tar.name 38 | print("%s -> %s" % (url, tar)) 39 | js2py.translate_file(src, tar) 40 | 41 | 42 | if __name__ == '__main__': 43 | Fire(update_katex_py) 44 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [coverage:run] 2 | source=EduNLP 3 | omit=EduNLP/Formula/ast/*,EduNLP/Formula/viz/*,EduNLP/utils/path.py 4 | [coverage:report] 5 | exclude_lines = 6 | pragma: no cover 7 | pass 8 | raise NotImplementedError 9 | if __name__ == '__main__': 10 | if __name__ == "__main__": 11 | def __str__ 12 | def __repr__ 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from setuptools import setup, find_packages 3 | 4 | tutor_deps = [ 5 | "pillow", 6 | "tqdm", 7 | "ipython" 8 | ] 9 | test_deps = [ 10 | 'pytest>=4', 11 | 'pytest-cov>=2.6.0', 12 | 'pytest-flake8', 13 | 'flake8<5.0.0' 14 | ] 15 | docs_deps = [ 16 | 'sphinx', 17 | 'sphinx_rtd_theme', 18 | 'sphinx_toggleprompt', 19 | 'sphinx-gallery>=0.6', 20 | 'nbsphinx', 21 | 'm2r2' 22 | ] 23 | 24 | dev_deps = ["requests"] + docs_deps + test_deps 25 | 26 | try: 27 | import torch 28 | 29 | ml_pytorch_deps = [] 30 | except ModuleNotFoundError: 31 | import sys 32 | 33 | if 5 <= sys.version_info[1]: 34 | ml_pytorch_deps = ["torch<=1.12.1"] 35 | else: 36 | ml_pytorch_deps = [] 37 | logging.warning("Current python version %s is not supported by pytorch", str(sys.version_info[:2])) 38 | 39 | vec_deps = [ 40 | 'gensim', 41 | 'transformers<4.29.0', 42 | 'torchvision', 43 | 'datasets'] + ml_pytorch_deps 44 | 45 | setup( 46 | name='EduNLP', 47 | version='0.0.9', 48 | extras_require={ 49 | 'test': test_deps, 50 | 'doc': docs_deps, 51 | 'tutor': tutor_deps, 52 | 'dev': dev_deps, 53 | 'vec': vec_deps, 54 | 'full': vec_deps + tutor_deps 55 | }, 56 | packages=find_packages(), 57 | include_package_data=True, 58 | install_requires=[ 59 | 'networkx', 60 | 'numpy>=1.17.0', 61 | 'jieba', 62 | 'js2py', 63 | 'EduData>=0.0.16', 64 | 'PyBaize>=0.0.3' 65 | ], # and any other dependencies EduNLP needs 66 | entry_points={ 67 | "console_scripts": [ 68 | "edunlp = EduNLP.main:cli", 69 | ], 70 | }, 71 | classifiers=[ 72 | 'Programming Language :: Python :: 3.6', 73 | 'Programming Language :: Python :: 3.7', 74 | 'Programming Language :: Python :: 3.8', 75 | 'Programming Language :: Python :: 3.9', 76 | "Environment :: Other Environment", 77 | "Intended Audience :: Developers", 78 | "License :: OSI Approved :: Apache Software License", 79 | "Operating System :: OS Independent", 80 | "Topic :: Software Development :: Libraries :: Python Modules", 81 | ], 82 | ) 83 | -------------------------------------------------------------------------------- /static/test_data/quesnet_img/000004d6-0479-11ec-829b-797d5eb43535.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/static/test_data/quesnet_img/000004d6-0479-11ec-829b-797d5eb43535.png -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/test_ast.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from EduNLP.Formula.ast import str2ast 4 | 5 | 6 | def test_ast(): 7 | ast_str_list = [] 8 | # normal examples 9 | ast_str_list.append(r"{x + y}^\frac{\pi}{2} + 1 = x") 10 | ast_str_list.append(r"\color{#0FF} x = y") 11 | ast_str_list.append(r"x^2 + 1 = y") 12 | ast_str_list.append(r"\verb!x^2!") 13 | ast_str_list.append(r"\utilde{AB}") 14 | ast_str_list.append(r"\mathrm{Ab0}") 15 | ast_str_list.append(r"{1,2,3}") 16 | ast_str_list.append(r"\huge AB") 17 | ast_str_list.append(r"\underline{AB}") 18 | ast_str_list.append(r"\sqrt{\smash[b]{y}}") 19 | ast_str_list.append(r"\hbox{AA BB}") 20 | ast_str_list.append(r"abc\llap{abcdefghi}") 21 | ast_str_list.append(r"\raisebox{3em}{hi}") 22 | ast_str_list.append(r"\textcolor{#228B22}{F=ma}") 23 | ast_str_list.append(r"\displaystyle\sum_{i=1}^n") 24 | ast_str_list.append(r"\def\foo{x^2} \foo + \foo") 25 | ast_str_list.append(r"thank \hphantom{xyz} you") 26 | ast_str_list.append(r"\mathchoice{D}{T}{S}{SS}") 27 | ast_str_list.append(r"\bigotimes") 28 | ast_str_list.append(r"{AB}_b^c") 29 | ast_str_list.append(r"\left\{\begin{array}{c}2 x+y-2 \leq 0 \\ x-y-1 \geq 0 \\ y+1 \geq 0\end{array}\right.") 30 | ast_str_list.append(r"\cancel{5}") 31 | 32 | # work only when katex is in 'display' mode : 33 | ast_str_list.append(r"\begin{matrix} a & b \\ c & d \end{matrix}") 34 | ast_str_list.append(r"\begin{pmatrix} a&b\\c&d \end{pmatrix}") 35 | ast_str_list.append(r"\begin{matrix}k个\\ \overbrace{(-1)^{k-1}k,\cdots,(-1)^{k-1}k}\end{matrix}") 36 | 37 | # work only when 'trust' katex html func: 38 | ast_str_list.append(r"\href{https://katex.org}{katex}") 39 | ast_str_list.append(r"\htmlStyle{color: red;}{x}") 40 | ast_str_list.append(r"\url{www.baidu.com}") 41 | ast_str_list.append(r"\htmlId{bar}{x}") 42 | ast_str_list.append(r"\htmlClass{foo}{x}") 43 | ast_str_list.append("\\includegraphics[height=0.8em, totalheight=0.9em, \ 44 | width=0.9em, alt=KA logo]{https://katex.org/img/khan-academy.png}") 45 | 46 | for ast_str in ast_str_list: 47 | str2ast(ast_str) 48 | -------------------------------------------------------------------------------- /tests/test_formula.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | import pytest 4 | from EduNLP.Formula import Formula, FormulaGroup 5 | 6 | 7 | def test_formula(): 8 | formula = r"x + x" 9 | f = Formula(formula) 10 | f.variable_standardization(inplace=False) 11 | f.variable_standardization(inplace=True) 12 | assert len(f.ast_graph.nodes) == len(f.ast) 13 | f.to_str() 14 | 15 | formula = r"\frac{\pi}{2}" 16 | f = Formula(formula, variable_standardization=True) 17 | assert repr(f) == r"<Formula: \frac{\pi}{2}>" 18 | 19 | f = Formula(f.ast) 20 | assert f.resetable is False 21 | with pytest.raises(TypeError): 22 | f.reset_ast() 23 | 24 | fg = FormulaGroup([r"x + x", r"x + \frac{\pi}{2}"], variable_standardization=True) 25 | for f in fg: 26 | assert f in fg 27 | assert 
len(fg[0].ast) == 3 28 | fg.to_str() 29 | 30 | fg = FormulaGroup(["x", "y", "x"]) 31 | assert len(fg.ast) == 3 32 | 33 | with pytest.raises(TypeError): 34 | FormulaGroup([{}]) 35 | -------------------------------------------------------------------------------- /tests/test_i2v/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/1 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/test_i2v/test_pretrained.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/2 @ tongshiwei 3 | import pytest 4 | from EduNLP import get_pretrained_i2v 5 | # from EduNLP.I2V.i2v import MODELS 6 | from EduNLP.I2V import D2V, W2V 7 | from EduNLP.Vector import get_pretrained_model_info, get_all_pretrained_models 8 | 9 | 10 | def test_pretrained_i2v(tmp_path): 11 | 12 | d = tmp_path / "model" 13 | d.mkdir() 14 | 15 | url, t2v_name = get_pretrained_model_info("d2v_test_256") 16 | assert url != "" 17 | assert t2v_name == "d2v" 18 | model_names = get_all_pretrained_models() 19 | assert "d2v_test_256" in model_names 20 | 21 | get_pretrained_i2v("d2v_test_256", d) 22 | 23 | with pytest.raises(KeyError): 24 | get_pretrained_i2v("error") 25 | 26 | get_pretrained_i2v("w2v_test_256", d) 27 | 28 | # get_pretrained_i2v("quesnet_test_256", d) 29 | 30 | # get_pretrained_i2v("elmo_test", d) 31 | 32 | # # get_pretrained_i2v("tal_edu_bert", d) 33 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/2 @ tongshiwei 3 | 4 | from EduNLP.main import list_i2v 5 | 6 | 7 | def test_list_i2v(): 8 | list_i2v() 9 | -------------------------------------------------------------------------------- /tests/test_model_zoo/test_rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from EduNLP.ModelZoo.rnn import LM 3 | 4 | 5 | idxs = torch.tensor([ 6 | [1, 2, 3, 4, 0, 0], 7 | [1, 2, 0, 0, 0, 0], 8 | [1, 0, 0, 0, 0, 0], 9 | [1, 2, 0, 0, 0, 0] 10 | ]) 11 | 12 | lens = torch.tensor([4, 2, 1, 2]) 13 | 14 | rnn = LM(rnn_type="lstm", vocab_size=20, embedding_dim=5, hidden_size=10) 15 | output, hn = rnn(idxs, lens) 16 | 17 | print("[output]", output) 18 | print("[hn]", hn) 19 | -------------------------------------------------------------------------------- /tests/test_pipeline/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from PIL import Image 3 | from EduNLP.utils import abs_current_dir, path_append 4 | from EduNLP.Vector import get_pretrained_model_info 5 | from EduData import get_data 6 | 7 | 8 | @pytest.fixture(scope="module") 9 | def pretrained_elmo_for_property_prediction_dir(): 10 | model_dir = path_append(abs_current_dir(__file__), "../../examples/test_model/elmo", to_str=True) 11 | url, _ = get_pretrained_model_info('elmo_pp_test') 12 | path = get_data(url, model_dir) 13 | return path 14 | 15 | 16 | @pytest.fixture(scope="module") 17 | def pretrained_elmo_for_knowledge_prediction_dir(): 18 | model_dir = path_append(abs_current_dir(__file__), "../../examples/test_model/elmo", to_str=True) 19 | url, _ = get_pretrained_model_info('elmo_kp_test') 20 | path = get_data(url, model_dir) 21 | return path 22 | 
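23 | # Both fixtures above resolve to a local directory containing the downloaded checkpoint; 24 | # get_data skips the download when the archive already exists (cf. the "file existed, skipped" log in the notebooks above). 25 | # Hypothetical usage: pass the returned path to the corresponding I2V / Pipeline loader inside a test.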
-------------------------------------------------------------------------------- /tests/test_pretrain/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdata-ustc/EduNLP/95cd9fc71a1bdd6156c42af7434aee0a8fc0a82e/tests/test_pretrain/__init__.py -------------------------------------------------------------------------------- /tests/test_pretrain/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/30 @ tongshiwei 3 | import torch 4 | import pytest 5 | import os 6 | from EduNLP.utils import abs_current_dir, path_append 7 | from EduNLP.ModelZoo import load_items 8 | 9 | # TEST_GPU = torch.cuda.is_available() 10 | 11 | 12 | @pytest.fixture(scope="module") 13 | def standard_luna_data(): 14 | data_path = path_append(abs_current_dir(__file__), "../../static/test_data/standard_luna_data.json", to_str=True) 15 | _data = load_items(data_path)[:10] 16 | return _data 17 | 18 | 19 | @pytest.fixture(scope="module") 20 | def pretrained_tokenizer_dir(tmp_path_factory): 21 | return str(tmp_path_factory.mktemp("pretrained_tokenizer_dir")) 22 | 23 | 24 | @pytest.fixture(scope="module") 25 | def pretrained_model_dir(tmp_path_factory): 26 | return str(tmp_path_factory.mktemp("pretrained_model_dir")) 27 | 28 | 29 | @pytest.fixture(scope="module") 30 | def pretrained_pp_dir(tmp_path_factory): 31 | return str(tmp_path_factory.mktemp("pretrained_pp_dir")) 32 | 33 | 34 | @pytest.fixture(scope="module") 35 | def pretrained_kp_dir(tmp_path_factory): 36 | return str(tmp_path_factory.mktemp("pretrained_kp_dir")) 37 | -------------------------------------------------------------------------------- /tests/test_pretrain/test_hugginface_utils.py: -------------------------------------------------------------------------------- 1 | from EduNLP.Pretrain.hugginface_utils import TokenizerForHuggingface 2 | from transformers import AutoTokenizer 3 | import os 4 | os.environ["WANDB_DISABLED"] = "true" 5 | 6 | 7 | class TestPretrainUtils: 8 | def test_hf_tokenzier(self, pretrained_tokenizer_dir): 9 | tokenizer = TokenizerForHuggingface(tokenize_method=None) 10 | tokenizer = TokenizerForHuggingface(add_special_tokens=True) 11 | assert isinstance(tokenizer.vocab_size, int) 12 | item = 'This is a test.' 13 | res = tokenizer.decode(tokenizer.encode(item)) 14 | right_ans = '[CLS] [UNK] is a test. [SEP]' 15 | assert res == right_ans, res 16 | items = ['This is a test.', 'This is a test 2.'] 17 | res = tokenizer.decode(tokenizer.encode(items)) 18 | right_ans = ['[CLS] [UNK] is a test. [SEP]', '[CLS] [UNK] is a test 2. 
[SEP]'] 19 | assert res == right_ans, res 20 | 21 | tokenizer_hf = AutoTokenizer.from_pretrained("bert-base-chinese") 22 | tokenizer_hf.save_pretrained(pretrained_tokenizer_dir) 23 | 24 | tokenizer_hf = TokenizerForHuggingface.from_pretrained(pretrained_tokenizer_dir) 25 | -------------------------------------------------------------------------------- /tests/test_pretrain/test_pretrain_utils.py: -------------------------------------------------------------------------------- 1 | from EduNLP.Pretrain.pretrian_utils import EduVocab, PretrainedEduTokenizer, EduDataset 2 | import pytest 3 | import os 4 | 5 | 6 | class TestPretrainUtils: 7 | def test_eduvocab(self): 8 | test = EduVocab(specials=['token1']) 9 | assert len(test) == 5 10 | token_list = ['An', 'apple', 'a', 'day', 'keeps', 'doctors', 'away'] 11 | test.add_tokens(token_list) 12 | right_ans = ['[PAD]', '[UNK]', '[BOS]', '[EOS]', 'token1', 13 | 'An', 'apple', 'a', 'day', 'keeps', 'doctors', 'away'] 14 | assert test.tokens == right_ans 15 | assert test.vocab_size == len(right_ans) 16 | test_token_list = ['An', 'banana', 'is', 'a', 'kind', 'of', 'fruit'] 17 | res = test.convert_sequence_to_token(test.convert_sequence_to_idx(test_token_list, bos=True, eos=True)) 18 | right_ans = ['[BOS]', 'An', '[UNK]', '[UNK]', 'a', '[UNK]', '[UNK]', '[UNK]', '[EOS]'] 19 | assert res == right_ans 20 | test.add_specials(['token2', 'token3']) 21 | right_ans = ['[PAD]', '[UNK]', '[BOS]', '[EOS]', 'token1', 'token2', 'token3'] 22 | assert test.special_tokens == right_ans 23 | test = EduVocab(corpus_items=[token_list]) 24 | 25 | def test_edu_tokenizer(self, pretrained_tokenizer_dir): 26 | test = EduVocab() 27 | token_list = ['An', 'apple', 'a', 'day', 'keeps', 'doctors', 'away'] 28 | test.add_tokens(token_list) 29 | vocab_path = os.path.join(pretrained_tokenizer_dir, 'vocab.txt') 30 | test.save_vocab(vocab_path) 31 | test = EduVocab(vocab_path=vocab_path) 32 | 33 | text = 'An apple a day keeps doctors away' 34 | tokenizer = PretrainedEduTokenizer(vocab_path=vocab_path, max_length=100) 35 | res = tokenizer(text, padding='max_length') 36 | assert res['seq_idx'].shape[0] == 100 37 | res = tokenizer(text, padding='longest') 38 | assert res['seq_idx'].shape[0] == res['seq_len'] 39 | res = tokenizer(text, padding='do_not_pad') 40 | assert res['seq_idx'].shape[0] == res['seq_len'] 41 | with pytest.raises(ValueError): 42 | res = tokenizer(text, padding='wrong_pad') 43 | tokenizer.add_tokens("[token]") 44 | tokenizer.add_specials("[special]") 45 | res = tokenizer.decode(tokenizer.encode({'content': 'An banana'}, key=lambda x: x['content'])) 46 | right_ans = ['An', '[UNK]'] 47 | print(res) 48 | assert res == right_ans, res 49 | 50 | res = tokenizer.decode(tokenizer.encode(['An banana'])) 51 | assert res == [['An', '[UNK]']] 52 | tokenizer.save_pretrained(f"{pretrained_tokenizer_dir}/save_dir") 53 | 54 | def test_edu_dateset(self, standard_luna_data, pretrained_tokenizer_dir): 55 | tokenizer = PretrainedEduTokenizer() 56 | tokenizer.set_vocab(standard_luna_data, key=lambda x: x["ques_content"]) 57 | dataset = EduDataset(tokenizer, 58 | items=standard_luna_data, 59 | stem_key="ques_content") 60 | assert "seq_idx" in dataset[0].keys() and "seq_len" in dataset[0].keys() 61 | dataset.to_disk(f"{pretrained_tokenizer_dir}/dataset") 62 | 63 | local_dataset = EduDataset(tokenizer, f"{pretrained_tokenizer_dir}/dataset") 64 | assert local_dataset[0] == dataset[0] 65 | -------------------------------------------------------------------------------- /tests/test_sif/__init__.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/test_sif/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | import os 4 | import pytest 5 | from PIL import Image 6 | from EduNLP.utils import abs_current_dir, path_append, image2base64 7 | 8 | 9 | @pytest.fixture(scope="module") 10 | def img_dir(): 11 | return os.path.abspath(path_append(abs_current_dir(__file__), "..", "..", "asset", "_static")) 12 | 13 | 14 | @pytest.fixture(scope="module") 15 | def figure0(img_dir): 16 | return Image.open(path_append(img_dir, "item_formula.png", to_str=True)) 17 | 18 | 19 | @pytest.fixture(scope="module") 20 | def figure1(img_dir): 21 | return Image.open(path_append(img_dir, "item_figure.png", to_str=True)) 22 | 23 | 24 | @pytest.fixture(scope="module") 25 | def figure0_base64(figure0): 26 | return image2base64(figure0) 27 | 28 | 29 | @pytest.fixture(scope="module") 30 | def figure1_base64(figure1): 31 | return image2base64(figure1) 32 | -------------------------------------------------------------------------------- /tests/test_sif/test_parser.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from EduNLP.SIF.parser.parser import Parser 3 | 4 | 5 | def test_parser(): 6 | text = '' 7 | text_parser = Parser(text) 8 | text_parser.description_list() 9 | 10 | text = '随机$text{观测}$生产某种零件的A工厂25名工人的日加工零件数_ _' 11 | text_parser = Parser(text) 12 | text_parser.description_list() 13 | 14 | text = 'X的分布列为( )' 15 | text_parser = Parser(text) 16 | text_parser.description_list() 17 | 18 | text = '由题意得( )' 19 | text_parser = Parser(text) 20 | text_parser.description_list() 21 | assert text_parser.error_flag == 0 22 | 23 | text = '1.命题中真命题的序号是\n ① AB是⊙O的直径,AC是⊙O的切线,BC交⊙O于点E.AC的中点为D' 24 | text_parser = Parser(text) 25 | text_parser.description_list() 26 | assert text_parser.error_flag == 1 27 | 28 | text = r"公式两侧的匹配符号需要完整,如不允许$\frac{y}{x}" 29 | text_parser = Parser(text) 30 | text_parser.description_list() 31 | assert text_parser.error_flag == 1 32 | 33 | text = r"支持公式如$\frac{y}{x}$,$\SIFBlank$,$\FigureID{1}$,不支持公式如$\frac{ \dddot y}{x}$" 34 | text_parser = Parser(text) 35 | text_parser.description_list() 36 | assert text_parser.fomula_illegal_flag == 1 37 | -------------------------------------------------------------------------------- /tests/test_sif/test_segement.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | 4 | import pytest 5 | 6 | from EduNLP.SIF.segment import seg 7 | from EduNLP.utils import image2base64 8 | 9 | 10 | def test_segment(figure0, figure1, figure0_base64, figure1_base64): 11 | seg( 12 | r"如图所示,则$\FormFigureID{0}$的面积是$\SIFBlank$。$\FigureID{1}$", 13 | figures={ 14 | "0": figure0, 15 | "1": figure1 16 | } 17 | ) 18 | s = seg( 19 | r"如图所示,则$\FormFigureBase64{%s}$的面积是$\SIFBlank$。$\FigureBase64{%s}$" % (figure0_base64, figure1_base64), 20 | figures=True 21 | ) 22 | with pytest.raises(TypeError): 23 | s.append("123") 24 | seg_test_text = seg( 25 | r"如图所示,有三组$\textf{机器人,bu}$在踢$\textf{足球,b}$", 26 | figures=True 27 | ) 28 | assert seg_test_text.text_segments == ['如图所示,有三组机器人在踢足球'] 29 | -------------------------------------------------------------------------------- /tests/test_sif/test_sif.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | 4 | from EduNLP.SIF import is_sif 5 | from EduNLP.SIF import to_sif 6 | from EduNLP.SIF import sif4sci 7 | import pytest 8 | 9 | 10 | def test_is_sif(): 11 | text = '若$x,y$满足约束条件' \ 12 | '$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,' \ 13 | '则$z=x+7 y$的最大值$\\SIFUnderline$' 14 | assert is_sif(text) == 1 15 | 16 | text = '公式需要满足完整性,完整的公式如' \ 17 | '$\\begin{matrix} a & b \\\\ c & d \\end{matrix}$' \ 18 | ',不完整的公式如$\\begin{matrix} a & b \\\\ c & d$' 19 | with pytest.raises(ValueError): 20 | is_sif(text) 21 | 22 | text = '公式需要满足符合katex的支持性,可支持的公式如' \ 23 | '$\\begin{matrix} a & b \\\\ c & d \\end{matrix}$' \ 24 | ',不可支持的公式如$\\frac{ \\dddot y }{ x }$' 25 | with pytest.raises(ValueError): 26 | is_sif(text) 27 | 28 | 29 | def test_to_sif(): 30 | text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...' 31 | siftext = to_sif(text) 32 | print(siftext) 33 | 34 | ret = is_sif(text, return_parser=True) 35 | assert ret[0] == 0 36 | if ret[0] is not True: 37 | siftext = to_sif(text, parser=ret[1]) 38 | print(siftext) 39 | 40 | 41 | def test_sci4sif(figure0, figure1, figure0_base64, figure1_base64): 42 | repr(sif4sci( 43 | r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", 44 | tokenization_params={ 45 | "formula_params": { 46 | "method": "ast", 47 | "return_type": "ast" 48 | } 49 | } 50 | )) 51 | repr(sif4sci( 52 | r"如图所示,则$\FormFigureID{0}$的面积是$\SIFBlank$。$\FigureID{1}$", 53 | figures={ 54 | "0": figure0, 55 | "1": figure1 56 | }, 57 | )) 58 | repr(sif4sci( 59 | item=r"如图所示,则$\FormFigureBase64{%s}$的面积是$\SIFBlank$。$\FigureBase64{%s}$" % ( 60 | figure0_base64, figure1_base64 61 | ), 62 | tokenization_params={ 63 | "figure_params": {"figure_instance": True} 64 | } 65 | )) 66 | repr(sif4sci( 67 | r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=0 68 | )) 69 | repr(sif4sci( 70 | r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=1 71 | )) 72 | repr(sif4sci( 73 | r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=2 74 | )) 75 | 76 | with pytest.raises(KeyError): 77 | repr(sif4sci( 78 | r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$", mode=3 79 | )) 80 | -------------------------------------------------------------------------------- /tests/test_sif/test_tokenization.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/20 @ tongshiwei 3 | 4 | import pytest 5 | from EduNLP.SIF.constants import Symbol 6 | from EduNLP.SIF.segment.segment import SegmentList, LatexFormulaSegment 7 | from EduNLP.SIF.tokenization import text 8 | from EduNLP.SIF.tokenization import formula 9 | from EduNLP.SIF.tokenization.tokenization import TokenList 10 | 11 | 12 | def test_text_tokenization(): 13 | with pytest.raises(TypeError): 14 | text.tokenize("12345", "alpha") 15 | 16 | 17 | def test_formula_tokenization(): 18 | with pytest.raises(ValueError): 19 | formula.ast_token.ast_tokenize("1 + 1", return_type="graph") 20 | 21 | with pytest.raises(TypeError): 22 | formula.tokenize("1 + 1", method="plain") 23 | 24 | # with pytest.raises(TypeError): 25 | # formula.tokenize(r"\phantom{=}56+4", method="ast") 26 | 27 | 28 | def test_tokenization(): 29 | tl = TokenList(SegmentList("")) 30 | with pytest.raises(TypeError): 31 | tl.append(Symbol("[Unknown]")) 32 | 33 | with pytest.raises(TypeError): 34 | tl.append("[Unknown]") 35 | 36 | 
tl.append(LatexFormulaSegment('x+y'), False) 37 | -------------------------------------------------------------------------------- /tests/test_tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/1 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/test_tokenizer/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/1 @ tongshiwei 3 | 4 | import pytest 5 | from EduNLP.Tokenizer import get_tokenizer 6 | from EduNLP.Pretrain import DisenQTokenizer 7 | 8 | 9 | def test_tokenizer(): 10 | with pytest.raises(KeyError): 11 | get_tokenizer("error") 12 | 13 | 14 | def test_disenQTokenizer(): 15 | tokenizer = DisenQTokenizer(max_length=10, tokenize_method="space") 16 | # with pytest.raises(RuntimeError): 17 | # tokenizer("10 米 的 (2/5) = () 米 的 (1/2) .") 18 | 19 | test_items = [ 20 | "10 米 的 (2/5) = () 米 的 (1/2) . 多 余 的 字", 21 | "-1 - 1", 22 | "5 % 2 + 3.14", 23 | "3.x", 24 | ".", 25 | "", 26 | "-1/2", 27 | "/", 28 | "1.2%", 29 | ] 30 | tokenizer.set_vocab(test_items) 31 | print(tokenizer.vocab_size) 32 | for item in test_items: 33 | token_item = tokenizer(item) 34 | print(token_item) 35 | 36 | test_item = tokenizer(test_items[0], padding=True) 37 | assert test_item["seq_idx"].shape[-1] == 10 38 | 39 | 40 | def test_CharTokenizer(): 41 | items = [{ 42 | "stem": "文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?", 43 | "options": ["1", "2"] 44 | }] 45 | tokenizer = get_tokenizer("char", stop_words=set(",?")) 46 | tokens = tokenizer(items, key=lambda x: x['stem']) 47 | ret = next(tokens) 48 | ans = ['文', '具', '店', '有', '$', '600', '$', '本', '练', '习', '本', '卖', '出', '一', 49 | '些', '后', '还', '剩', '$', '4', '$', '包', '每', '包', '$', '25', '$', '本', '卖', '出', '多', '少', '本'] 50 | assert ret == ans 51 | 52 | 53 | def test_SpaceTokenizer(): 54 | items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?'] 55 | tokenizer = get_tokenizer("space", stop_words=[]) 56 | tokens = tokenizer(items) 57 | ret = next(tokens) 58 | ans = ['文具店有', '$600$', '本练习本,卖出一些后,还剩', '$4$', '包,每包', '$25$', '本,卖出多少本?'] 59 | assert ret == ans 60 | 61 | 62 | def test_AstformulaTokenizer(): 63 | items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?'] 64 | tokenizer = get_tokenizer("ast_formula") 65 | tokens = tokenizer(items) 66 | ret = next(tokens) 67 | ans = ['文具店', 'textord', 'textord', 'textord', '练习本', '卖出', '剩', 'textord', '包', '每包', 'textord', 'textord', '卖出'] 68 | assert ret == ans 69 | 70 | 71 | def test_PuretextTokenizer(): 72 | items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?'] 73 | tokenizer = get_tokenizer("pure_text", stop_words=[]) 74 | tokens = tokenizer(items) 75 | ret = next(tokens) 76 | ans = ['文具店', '600', '练习本', '卖出', '剩', '4', '包', '每包', '25', '卖出'] 77 | assert ret == ans 78 | tokenizer = get_tokenizer("pure_text", stop_words=[], handle_figure_formula=None) 79 | tokens = tokenizer(items) 80 | ret = next(tokens) 81 | assert ret == ans 82 | tokenizer = get_tokenizer("pure_text", stop_words=[], handle_figure_formula='symbolize') 83 | tokens = tokenizer(items) 84 | ret = next(tokens) 85 | assert ret == ans 86 | with pytest.raises(ValueError): 87 | tokenizer = get_tokenizer("pure_text", stop_words=[], handle_figure_formula='wrong') 88 | 89 | 90 | def test_CustomTokenizer(): 91 | items = [{ 92 | "stem": "文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?", 93 | "options": ["1", "2"] 94 | }] 95 | 
}] 95 | tokenizer = get_tokenizer("custom", symbol='f') 96 | tokens = tokenizer(items, key=lambda x: x['stem']) 97 | ret = next(tokens) 98 | ans = ['文具店', '[FORMULA]', '练习本', '卖出', '剩', '[FORMULA]', '包', '每包', '[FORMULA]', '卖出'] 99 | assert ret == ans 100 | items = [{ 101 | "stem": "有公式$\\FormFigureID{1}$,如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件公式$\\F\ 102 | ormFigureBase64{2}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$", 103 | "options": ["1", "2"] 104 | }] 105 | tokenizer = get_tokenizer("custom", symbol='f', handle_figure_formula="symbolize") 106 | tokens = tokenizer(items, key=lambda x: x['stem']) 107 | ret = next(tokens) 108 | ret.pop(3) 109 | ans = ['公式', '[FORMULA]', '如图', '\\FigureID{088f15ea-xxx}', '[FORMULA]', '约束条件', '公式', '[FORMULA]', 110 | '\\SIFSep', '[FORMULA]', '最大值', '\\SIFBlank'] 111 | ans.pop(3) 112 | assert ret == ans 113 | -------------------------------------------------------------------------------- /tests/test_utils/test_modules.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from EduNLP.ModelZoo.utils import MLP, TextCNN 4 | 5 | 6 | def test_modules(): 7 | encoder = TextCNN(256, 128) 8 | 9 | input_embeds1 = torch.rand(4, 16, 256) 10 | hidden_embeds1 = encoder(input_embeds1) 11 | assert hidden_embeds1.shape == torch.Size([4, 128]) 12 | input_embeds2 = torch.rand(4, 1, 256) 13 | hidden_embeds2 = encoder(input_embeds2) 14 | assert hidden_embeds2.shape == torch.Size([4, 128]) 15 | 16 | classifier = MLP(128, 10, 64, 0.5, n_layers=4) 17 | logits = classifier(hidden_embeds1) 18 | assert logits.shape == torch.Size([4, 10]) 19 | -------------------------------------------------------------------------------- /tests/test_vec/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/30 @ tongshiwei 3 | -------------------------------------------------------------------------------- /tests/test_vec/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/5/30 @ tongshiwei 3 | 4 | import codecs 5 | import json 6 | import pytest 7 | from EduNLP.utils import abs_current_dir, path_append 8 | 9 | 10 | @pytest.fixture(scope="module") 11 | def data(): 12 | _data = [] 13 | data_path = path_append(abs_current_dir(__file__), "../../static/test_data/standard_luna_data.json", to_str=True) 14 | with codecs.open(data_path, encoding="utf-8") as f: 15 | for line in f.readlines(): 16 | _data.append(json.loads(line)) 17 | return _data 18 | -------------------------------------------------------------------------------- /tests/test_vec/test_t2v.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # 2021/8/2 @ tongshiwei 3 | 4 | import pytest 5 | from EduNLP.Vector import get_pretrained_t2v 6 | 7 | 8 | def test_t2v(): 9 | with pytest.raises(KeyError): 10 | get_pretrained_t2v("error") 11 | --------------------------------------------------------------------------------
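
For reference, the i2v examples above reduce to two loading routes. The sketch below recombines only calls that appear verbatim in this repository's notebooks and tests (get_pretrained_i2v, the I2V W2V container, infer_item_vector / infer_token_vector); the model name, paths, and sample item are the same test fixtures used above and stand in for a real checkpoint and corpus.

# coding: utf-8
# A minimal sketch, assuming the examples' directory layout and the test checkpoints above.
from EduNLP.I2V import W2V, get_pretrained_i2v

items = ["文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?"]

# Route 1: fetch a public pretrained model by name; the download is skipped
# when the archive already exists under the given model directory.
i2v = get_pretrained_i2v("d2v_test_256", "../test_model/d2v")
item_vector = i2v.infer_item_vector(items)  # D2V exposes item-level vectors only

# Route 2: load local weights directly (tokenizer name, t2v name, weights path).
i2v = W2V("pure_text", "w2v", "../test_model/w2v/w2v_test_256/w2v_test_256.kv")
item_vector = i2v.infer_item_vector(items)    # one 256-d vector per item
token_vector = i2v.infer_token_vector(items)  # one 256-d vector per token per item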