├── .config
    └── config
├── .github
    ├── PULL_REQUEST_TEMPLATE.md
    ├── dependabot.yml
    └── workflows
    │   ├── ltp-core-publish.yml
    │   ├── ltp-extension-publish.yml
    │   ├── ltp-publish.yml
    │   ├── ruff.yaml
    │   └── test.yml
├── .gitignore
├── .ruff.toml
├── CITATION.cff
├── Cargo.toml
├── Makefile
├── README.md
├── appveyor.yml
├── data
    ├── .gitkeep
    └── examples
    │   ├── cws
    │   │   ├── raw.txt
    │   │   ├── test.txt
    │   │   ├── train.txt
    │   │   └── val.txt
    │   ├── ner
    │   │   ├── raw.txt
    │   │   ├── test.txt
    │   │   ├── train.txt
    │   │   ├── val.txt
    │   │   └── vocab.txt
    │   └── pos
    │   │   ├── raw.txt
    │   │   ├── test.txt
    │   │   ├── train.txt
    │   │   ├── val.txt
    │   │   └── vocab.txt
├── python
    ├── core
    │   ├── .env.example
    │   ├── LICENSE
    │   ├── MANIFEST.in
    │   ├── Makefile
    │   ├── README.md
    │   ├── bash
    │   │   ├── eval.sh
    │   │   └── train.sh
    │   ├── configs
    │   │   ├── callbacks
    │   │   │   ├── default.yaml
    │   │   │   ├── early_stopping.yaml
    │   │   │   ├── model_checkpoint.yaml
    │   │   │   ├── model_summary.yaml
    │   │   │   ├── none.yaml
    │   │   │   └── rich_progress_bar.yaml
    │   │   ├── datamodule
    │   │   │   ├── cls_datamodules.yaml
    │   │   │   ├── cws_datamodules.yaml
    │   │   │   ├── dep_datamodules.yaml
    │   │   │   ├── multi_datamodules.yaml
    │   │   │   ├── ner_datamodules.yaml
    │   │   │   ├── pos_datamodules.yaml
    │   │   │   ├── sdp_datamodules.yaml
    │   │   │   └── srl_datamodules.yaml
    │   │   ├── debug
    │   │   │   ├── default.yaml
    │   │   │   ├── fdr.yaml
    │   │   │   ├── limit.yaml
    │   │   │   ├── overfit.yaml
    │   │   │   └── profiler.yaml
    │   │   ├── eval.yaml
    │   │   ├── experiment
    │   │   │   ├── cls.yaml
    │   │   │   ├── cws.yaml
    │   │   │   ├── dep.yaml
    │   │   │   ├── example.yaml
    │   │   │   ├── multi.yaml
    │   │   │   ├── multi_bi.yaml
    │   │   │   ├── ner.yaml
    │   │   │   ├── pos.yaml
    │   │   │   ├── sdp.yaml
    │   │   │   └── srl.yaml
    │   │   ├── extras
    │   │   │   └── default.yaml
    │   │   ├── hparams_search
    │   │   │   └── ltp_optuna.yaml
    │   │   ├── hydra
    │   │   │   └── default.yaml
    │   │   ├── local
    │   │   │   └── .gitkeep
    │   │   ├── logger
    │   │   │   ├── comet.yaml
    │   │   │   ├── csv.yaml
    │   │   │   ├── many_loggers.yaml
    │   │   │   ├── mlflow.yaml
    │   │   │   ├── neptune.yaml
    │   │   │   ├── tensorboard.yaml
    │   │   │   └── wandb.yaml
    │   │   ├── model
    │   │   │   ├── cls_model.yaml
    │   │   │   ├── cws_model.yaml
    │   │   │   ├── dep_model.yaml
    │   │   │   ├── multi_model.yaml
    │   │   │   ├── ner_model.yaml
    │   │   │   ├── pos_model.yaml
    │   │   │   ├── sdp_model.yaml
    │   │   │   └── srl_model.yaml
    │   │   ├── paths
    │   │   │   └── default.yaml
    │   │   ├── train.yaml
    │   │   └── trainer
    │   │   │   ├── cpu.yaml
    │   │   │   ├── ddp.yaml
    │   │   │   ├── ddp_sim.yaml
    │   │   │   ├── default.yaml
    │   │   │   ├── gpu.yaml
    │   │   │   └── mps.yaml
    │   ├── data
    │   │   ├── .gitkeep
    │   │   ├── conllu
    │   │   │   ├── dev.conllu
    │   │   │   ├── test.conllu
    │   │   │   ├── train.conllu
    │   │   │   └── vocabs
    │   │   │   │   ├── deprel.txt
    │   │   │   │   ├── deps.txt
    │   │   │   │   ├── feats.txt
    │   │   │   │   ├── lemma.txt
    │   │   │   │   ├── upos.txt
    │   │   │   │   ├── word.txt
    │   │   │   │   ├── word_char.txt
    │   │   │   │   └── xpos.txt
    │   │   ├── ner
    │   │   │   ├── dev.bio
    │   │   │   ├── test.bio
    │   │   │   ├── train.bio
    │   │   │   └── vocabs
    │   │   │   │   └── bio.txt
    │   │   └── srl
    │   │   │   ├── dev.txt
    │   │   │   ├── test.txt
    │   │   │   ├── train.txt
    │   │   │   └── vocabs
    │   │   │   │   ├── arguments.txt
    │   │   │   │   └── predicate.txt
    │   ├── logs
    │   │   └── .gitkeep
    │   ├── ltp_core
    │   │   ├── __init__.py
    │   │   ├── algorithms
    │   │   │   ├── __init__.py
    │   │   │   ├── eisner.py
    │   │   │   └── get_entities.py
    │   │   ├── datamodules
    │   │   │   ├── __init__.py
    │   │   │   ├── adapters
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── dependency_parsing.py
    │   │   │   │   ├── named_entity_recognition.py
    │   │   │   │   ├── postagger.py
    │   │   │   │   ├── segmention.py
    │   │   │   │   ├── semantic_dependency_parsing.py
    │   │   │   │   ├── semantic_role_labeling.py
    │   │   │   │   └── sentence_classification.py
    │   │   │   ├── components
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── bio.py
    │   │   │   │   ├── conllu.py
    │   │   │   │   └── srl.py
    │   │   │   ├── multi_task_datamodule.py
    │   │   │   ├── task_datamodule.py
    │   │   │   └── utils
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── collate.py
    │   │   │   │   ├── datasets.py
    │   │   │   │   ├── iterator.py
    │   │   │   │   ├── multitask_dataloader.py
    │   │   │   │   └── vocab_helper.py
    │   │   ├── eval.py
    │   │   ├── models
    │   │   │   ├── __init__.py
    │   │   │   ├── components
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── graph.py
    │   │   │   │   ├── sent.py
    │   │   │   │   └── token.py
    │   │   │   ├── criterion
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── graph.py
    │   │   │   │   ├── sent.py
    │   │   │   │   └── token.py
    │   │   │   ├── functional
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── distill.py
    │   │   │   │   ├── eisner.py
    │   │   │   │   └── multilabel_categorical_crossentropy.py
    │   │   │   ├── lit_model.py
    │   │   │   ├── ltp_model.py
    │   │   │   ├── metrics
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── graph.py
    │   │   │   │   ├── sent.py
    │   │   │   │   └── token.py
    │   │   │   ├── nn
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── biaffine.py
    │   │   │   │   ├── crf.py
    │   │   │   │   ├── global_pointer.py
    │   │   │   │   ├── mlp.py
    │   │   │   │   └── relative_transformer.py
    │   │   │   ├── optimization
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── layer_lrs.py
    │   │   │   │   └── scheduler.py
    │   │   │   ├── processor
    │   │   │   │   └── __init__.py
    │   │   │   └── utils
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── instantiate.py
    │   │   │   │   └── transformer.py
    │   │   ├── train.py
    │   │   └── utils
    │   │   │   ├── __init__.py
    │   │   │   ├── pylogger.py
    │   │   │   ├── rich_utils.py
    │   │   │   └── utils.py
    │   ├── pyproject.toml
    │   ├── requirements.txt
    │   ├── setup.cfg
    │   ├── setup.py
    │   └── tests
    │   │   ├── __init__.py
    │   │   └── test_crf.py
    ├── extension
    │   ├── Cargo.toml
    │   ├── LICENSE
    │   ├── README.md
    │   ├── examples
    │   │   ├── benchmark.py
    │   │   ├── benchmark2.py
    │   │   └── legacy_train.py
    │   ├── ltp_extension
    │   │   ├── __init__.py
    │   │   ├── algorithms
    │   │   │   ├── __init__.py
    │   │   │   └── algorithms.pyi
    │   │   ├── ltp_extension.pyi
    │   │   └── perceptron
    │   │   │   ├── __init__.py
    │   │   │   └── perceptron.pyi
    │   ├── pyproject.toml
    │   ├── src
    │   │   ├── algorithms.rs
    │   │   ├── hook.rs
    │   │   ├── lib.rs
    │   │   ├── perceptron
    │   │   │   ├── alg.rs
    │   │   │   ├── com.rs
    │   │   │   ├── mod.rs
    │   │   │   ├── model.rs
    │   │   │   ├── specialization
    │   │   │   │   ├── cws.rs
    │   │   │   │   ├── mod.rs
    │   │   │   │   ├── ner.rs
    │   │   │   │   └── pos.rs
    │   │   │   └── trainer.rs
    │   │   ├── stnsplit.rs
    │   │   └── utils
    │   │   │   ├── mod.rs
    │   │   │   └── parallelism.rs
    │   └── utils
    │   │   └── stub.py
    └── interface
    │   ├── LICENSE
    │   ├── MANIFEST.in
    │   ├── Makefile
    │   ├── README.md
    │   ├── docs
    │   │   ├── README.md
    │   │   ├── api
    │   │   │   └── ltp.rst
    │   │   ├── appendix.rst
    │   │   ├── conf.py
    │   │   ├── index.rst
    │   │   ├── introduction.rst
    │   │   ├── performance.rst
    │   │   └── quickstart.rst
    │   ├── examples
    │   │   ├── conllu.py
    │   │   ├── issues.py
    │   │   ├── rules.py
    │   │   ├── server.py
    │   │   └── simple.py
    │   ├── ltp
    │   │   ├── __init__.py
    │   │   ├── generic.py
    │   │   ├── interface.py
    │   │   ├── legacy.py
    │   │   ├── mixin.py
    │   │   ├── module.py
    │   │   ├── nerual.py
    │   │   └── utils.py
    │   ├── pyproject.toml
    │   ├── requirements.txt
    │   ├── setup.py
    │   └── utils
    │   │   └── upload_models.py
└── rust
    ├── ltp-cffi
    │   ├── Cargo.toml
    │   ├── LICENSE
    │   ├── README.md
    │   ├── cbindgen.toml
    │   ├── examples
    │   │   └── example.c
    │   └── src
    │   │   ├── lib.rs
    │   │   ├── model.rs
    │   │   └── stnsplit.rs
    └── ltp
    │   ├── Cargo.toml
    │   ├── LICENSE
    │   ├── README.md
    │   ├── examples
    │   │   ├── cws.rs
    │   │   ├── ner.rs
    │   │   ├── pos.rs
    │   │   └── simple.rs
    │   ├── src
    │   │   ├── lib.rs
    │   │   ├── perceptron
    │   │   │   ├── definition
    │   │   │   │   ├── cws.rs
    │   │   │   │   ├── mod.rs
    │   │   │   │   ├── ner.rs
    │   │   │   │   └── pos.rs
    │   │   │   ├── feature.rs
    │   │   │   ├── mod.rs
    │   │   │   ├── model.rs
    │   │   │   ├── parameter.rs
    │   │   │   ├── serialization.rs
    │   │   │   └── trainer.rs
    │   │   └── utils
    │   │   │   ├── eisner.rs
    │   │   │   ├── entities.rs
    │   │   │   ├── hook.rs
    │   │   │   ├── mod.rs
    │   │   │   ├── stnsplit.rs
    │   │   │   └── viterbi.rs
    │   ├── test
    │   │   ├── eisner.npz
    │   │   └── viterbi.npz
    │   └── vendor
    │   │   └── schema
    │   │   │   ├── cws.avsc
    │   │   │   ├── ner.avsc
    │   │   │   └── pos.avsc

/.config/config:
--------------------------------------------------------------------------------
1 | [target.x86_64-apple-darwin]
2 | rustflags = [
3 |     "-C", "link-arg=-undefined",
4 |     "-C", "link-arg=dynamic_lookup",
5 | ]
6 | 
--------------------------------------------------------------------------------
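The two `rustflags` above are the usual macOS linker arrangement for Python extension modules: the dylib leaves the interpreter's symbols undefined and resolves them from the host process at import time. A hedged sketch for inspecting those deferred symbols with the system `nm` tool (the artifact path below is hypothetical):

```python
import subprocess

# List undefined symbols of a built extension dylib; with `-undefined
# dynamic_lookup` the `_Py*` entries stay unresolved until the Python
# interpreter loads the module. The path is a hypothetical build artifact.
out = subprocess.run(
    ["nm", "-u", "target/release/libltp_extension.dylib"],
    capture_output=True, text=True,
).stdout
print([sym for sym in out.split() if sym.startswith("_Py")])
```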
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## What does this PR do?
2 | 
3 | <!--
4 | Please include a summary of the change and which issue is fixed.
5 | Please also include relevant motivation and context.
6 | List any dependencies that are required for this change.
7 | List all the breaking changes introduced by this pull request.
8 | -->
9 | 
10 | Fixes #\<issue_number>
11 | 
12 | ## Before submitting
13 | 
14 | - [ ] Did you make sure **title is self-explanatory** and **the description concisely explains the PR**?
15 | - [ ] Did you make sure your **PR does only one thing**, instead of bundling different changes together?
16 | - [ ] Did you list all the **breaking changes** introduced by this pull request?
17 | - [ ] Did you **test your PR locally** with `pytest` command?
18 | - [ ] Did you **run pre-commit hooks** with `pre-commit run -a` command?
19 | 
20 | ## Did you have fun?
21 | 
22 | Make sure you had fun coding 🙃
23 | 
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 | 
6 | version: 2
7 | updates:
8 |   - package-ecosystem: "pip" # See documentation for possible values
9 |     directory: "/python/core" # Location of package manifests
10 |     schedule:
11 |       interval: "daily"
12 |     ignore:
13 |       - dependency-name: "pytorch-lightning"
14 |         update-types: ["version-update:semver-patch"]
15 |       - dependency-name: "torchmetrics"
16 |         update-types: ["version-update:semver-patch"]
17 |   - package-ecosystem: cargo
18 |     directory: "/"
19 |     schedule:
20 |       interval: monthly
21 |       time: "04:00"
22 |       timezone: Europe/Berlin
23 | 
--------------------------------------------------------------------------------
/.github/workflows/ltp-core-publish.yml:
--------------------------------------------------------------------------------
1 | name: Upload LTP Core Python Package
2 | 
3 | on:
4 |   workflow_dispatch:
5 | 
6 | jobs:
7 |   core:
8 |     runs-on: ubuntu-latest
9 |     strategy:
10 |       matrix:
11 |         target: [x86_64]
12 |     steps:
13 |       - uses: actions/checkout@v2
14 |       - uses: actions/setup-python@v2
15 |         with:
16 |           python-version: 3.9
17 |           architecture: x64
18 |       - name: Build Wheels
19 |         run: |
20 |           pip wheel --no-deps -w dist python/core
21 |       - name: Upload wheels
22 |         uses: actions/upload-artifact@v2
23 |         with:
24 |           name: wheels
25 |           path: dist
26 | 
27 |   release:
28 |     name: Release
29 |     runs-on: ubuntu-latest
30 |     needs: [core]
31 |     steps:
32 |       - uses: actions/download-artifact@v2
33 |         with:
34 |           name: wheels
35 |       - uses: actions/setup-python@v2
36 |         with:
37 |           python-version: 3.9
38 |       - name: Publish LTP Core to PyPi
39 |         env:
40 |           TWINE_USERNAME: __token__
41 |           TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD_CORE }}
42 |         run: |
43 |           pip install --upgrade twine
44 |           twine upload --skip-existing ltp_core-*
45 | 
--------------------------------------------------------------------------------
/.github/workflows/ltp-publish.yml:
--------------------------------------------------------------------------------
1 | name: Upload LTP Python Package
2 | 
3 | on:
4 |   workflow_dispatch:
5 |   release:
6 |     types: [created]
7 | 
8 | jobs:
9 |   interface:
10 |     runs-on: ubuntu-latest
11 |     strategy:
12 |       matrix:
13 |         target: [x86_64]
14 |     steps:
15 |       - uses: actions/checkout@v2
16 |       - uses: actions/setup-python@v2
17 |         with:
18 |           python-version: 3.9
19 |           architecture: x64
20 |       - name: Build Wheels
21 |         run: |
22 |           pip wheel --no-deps -w dist python/interface
23 |       - name: Upload wheels
24 |         uses: actions/upload-artifact@v2
25 |         with:
26 |           name: wheels
27 |           path: dist
28 | 
29 |   release:
30 |     name: Release
31 |     runs-on: ubuntu-latest
32 |     needs: [interface]
33 |     steps:
34 |       - uses: actions/download-artifact@v2
35 |         with:
36 |           name: wheels
37 |       - uses: actions/setup-python@v2
38 |         with:
39 |           python-version: 3.9
40 |       - name: Publish LTP to PyPi
41 |         env:
42 |           TWINE_USERNAME: __token__
43 |           TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
44 |         run: |
45 |           pip install --upgrade twine
46 |           twine upload --skip-existing ltp-*
47 | 
--------------------------------------------------------------------------------
/.github/workflows/ruff.yaml:
--------------------------------------------------------------------------------
1 | # Same as `code-quality-pr.yaml` but triggered on commit to main branch
2 | # and runs on all files (instead of only the changed ones)
3 | 
4 | name: Ruff
5 | on: [ push, pull_request ]
6 | jobs:
7 |   ruff:
8 |     runs-on: ubuntu-latest
9 |     steps:
10 |       - uses: actions/checkout@v3
11 |       - uses: chartboost/ruff-action@v1
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 | 
3 | #on:
4 | #  push:
5 | #    branches: [ main ]
6 | #  pull_request:
7 | #    branches: [ main, "release/*" ]
8 | 
9 | on:
10 |   workflow_dispatch:
11 | 
12 | jobs:
13 |   run_tests:
14 |     runs-on: ${{ matrix.os }}
15 | 
16 |     strategy:
17 |       fail-fast: false
18 |       matrix:
19 |         os: ["ubuntu-latest", "macos-latest", "windows-latest"]
20 |         python-version: ["3.7", "3.8", "3.9", "3.10"]
21 | 
22 |     timeout-minutes: 10
23 | 
24 |     steps:
25 |       - name: Checkout
26 |         uses: actions/checkout@v3
27 | 
28 |       - name: Set up Python ${{ matrix.python-version }}
29 |         uses: actions/setup-python@v3
30 |         with:
31 |           python-version: ${{ matrix.python-version }}
32 | 
33 |       - name: Install dependencies
34 |         run: |
35 |           python -m pip install --upgrade pip
36 |           pip install -r requirements.txt
37 |           pip install pytest
38 |           pip install protobuf==3.20.0
39 | 
40 |       - name: List dependencies
41 |         run: |
42 |           python -m pip list
43 | 
44 |       - name: Run pytest
45 |         run: |
46 |           pytest -v python/core
47 | 
48 |   # upload code coverage report
49 |   code-coverage:
50 |     runs-on: ubuntu-latest
51 | 
52 |     steps:
53 |       - name: Checkout
54 |         uses: actions/checkout@v2
55 | 
56 |       - name: Set up Python 3.10
57 |         uses: actions/setup-python@v2
58 |         with:
59 |           python-version: "3.10"
60 | 
61 |       - name: Install dependencies
62 |         run: |
63 |           python -m pip install --upgrade pip
64 |           pip install -r requirements.txt
65 |           pip install pytest
66 |           pip install pytest-cov[toml]
67 |           pip install protobuf==3.20.0
68 | 
69 |       - name: Run tests and collect coverage
70 |         run: pytest --cov python/core/ltp
71 | 
72 |       - name: Upload coverage to Codecov
73 |         uses: codecov/codecov-action@v3
74 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /data
2 | /target
3 | Cargo.lock
4 | 
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 | 
10 | # C extensions
11 | *.so
12 | 
13 | # Distribution / packaging
14 | .Python
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | pip-wheel-metadata/
28 | share/python-wheels/
29 | *.egg-info/
30 | .installed.cfg
31 | *.egg
32 | MANIFEST
33 | 
34 | # PyInstaller
35 | # Usually these files are written by a python script from a template
36 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
37 | *.manifest
38 | *.spec
39 | 
40 | # Installer logs
41 | pip-log.txt
42 | pip-delete-this-directory.txt
43 | 
44 | # Unit test / coverage reports
45 | htmlcov/
46 | .tox/
47 | .nox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *.cover
54 | *.py,cover
55 | .hypothesis/
56 | .pytest_cache/
57 | 
58 | # Translations
59 | *.mo
60 | *.pot
61 | 
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 | db.sqlite3-journal
67 | 
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 | 
72 | # Scrapy stuff:
73 | .scrapy
74 | 
75 | # Sphinx documentation
76 | docs/_build/
77 | 
78 | # PyBuilder
79 | target/
80 | 
81 | # Jupyter Notebook
82 | .ipynb_checkpoints
83 | 
84 | # IPython
85 | profile_default/
86 | ipython_config.py
87 | 
88 | # pyenv
89 | .python-version
90 | 
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 | 
98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
99 | __pypackages__/
100 | 
101 | # Celery stuff
102 | celerybeat-schedule
103 | celerybeat.pid
104 | 
105 | # SageMath parsed files
106 | *.sage.py
107 | 
108 | # Environments
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 
134 | ### VisualStudioCode
135 | .vscode/*
136 | !.vscode/settings.json
137 | !.vscode/tasks.json
138 | !.vscode/launch.json
139 | !.vscode/extensions.json
140 | *.code-workspace
141 | **/.vscode
142 | 
143 | # JetBrains
144 | .idea/
145 | 
146 | # Lightning-Hydra-Template
147 | python/core/configs/local/default.yaml
148 | python/core/data/
149 | python/core/logs/
150 | python/core/wandb/
151 | python/core/.env
152 | python/core/.autoenv
153 | 
154 | .DS_Store
155 | /bindings
156 | /python/interface/models
--------------------------------------------------------------------------------
/.ruff.toml:
--------------------------------------------------------------------------------
1 | # Enable flake8-bugbear (`B`) rules.
2 | select = ["E", "F", "B"]
3 | 
4 | # Never enforce `E501` (line length violations).
5 | ignore = ["E501"]
6 | 
7 | # Avoid trying to fix flake8-bugbear (`B`) violations.
8 | unfixable = ["B"]
9 | 
10 | # Ignore `E402` (import violations) in all `__init__.py` files, and in `path/to/file.py`.
11 | [per-file-ignores]
12 | "__init__.py" = ["E402"]
13 | "path/to/file.py" = ["E402"]
14 | 
--------------------------------------------------------------------------------
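The `per-file-ignores` table above disables `E402` where imports legitimately follow setup code, as is common in package `__init__.py` files. A minimal illustration of the pattern ruff would otherwise flag:

```python
# Typical __init__.py pattern: run setup code first ...
import os

os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# ... then import. Ruff reports E402 ("module level import not at top of
# file") here unless the per-file-ignores entry above suppresses it.
import json
```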
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [workspace]
2 | members = [
3 |     "rust/ltp",
4 |     "rust/ltp-cffi",
5 |     "python/extension",
6 | ]
7 | 
8 | [profile.release]
9 | lto = true
10 | codegen-units = 1
11 | panic = "abort"
12 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | 
2 | help: ## Show help
3 | 	@grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
4 | 
5 | sync: ## Merge changes from main branch to your current branch
6 | 	git fetch --all
7 | 	git merge main
8 | 
9 | bdist: ## build ltp and ltp_extension
10 | 	pip wheel --no-deps -w dist python/core
11 | 	pip wheel --no-deps -w dist python/interface
12 | 	maturin build --release -m python/extension/Cargo.toml --out dist
13 | 
14 | cbindgen_header:
15 | 	mkdir -p bindings/c
16 | 	cbindgen --config rust/ltp-cffi/cbindgen.toml --crate ltp-cffi --output bindings/c/ltp.h
17 | 
18 | cbindgen: cbindgen_header
19 | 	cargo build --release --package ltp-cffi
20 | 	cp target/release/libltp.* bindings/c
21 | 
22 | cbindgen_example: cbindgen
23 | 	gcc -L "$(pwd)bindings/c" -lltp -I "$(pwd)bindings/c" -o target/c_example rust/ltp-cffi/examples/example.c
24 | 	./target/c_example
25 | 
26 | train_legacy:
27 | 	cargo run --package ltp --release --example cws -- train --train data/examples/cws/train.txt --eval data/examples/cws/val.txt --model=data/cws_model.bin
28 | 	cargo run --package ltp --release --example cws -- eval --eval data/examples/cws/test.txt --model=data/cws_model.bin
29 | 	cargo run --package ltp --release --example cws -- predict --input data/examples/cws/raw.txt --output data/examples/cws/output.txt --model=data/cws_model.bin
30 | 
31 | 	cargo run --package ltp --release --example pos -- train --train data/examples/pos/train.txt --eval data/examples/pos/val.txt --model=data/pos_model.bin --vocab data/examples/pos/vocab.txt
32 | 	cargo run --package ltp --release --example pos -- eval --eval data/examples/pos/test.txt --model=data/pos_model.bin
33 | 	cargo run --package ltp --release --example pos -- predict --input data/examples/pos/raw.txt --output data/examples/pos/output.txt --model=data/pos_model.bin
34 | 
35 | 	cargo run --package ltp --release --example ner -- train --train data/examples/ner/train.txt --eval data/examples/ner/val.txt --model=data/ner_model.bin --vocab data/examples/ner/vocab.txt
36 | 	cargo run --package ltp --release --example ner -- eval --eval data/examples/ner/test.txt --model=data/ner_model.bin
37 | 	cargo run --package ltp --release --example ner -- predict --input data/examples/ner/raw.txt --output data/examples/ner/output.txt --model=data/ner_model.bin
38 | 
--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
1 | branches:
2 |   only:
3 |     - 3.X
4 | 
5 | environment:
6 |   P: "c:/projects/libs"
7 | 
8 | # clone directory
9 | clone_folder: c:\projects\ltp
10 | 
11 | os: Visual Studio 2015
12 | 
13 | platform:
14 |   - x86
15 |   - x64
16 | 
17 | configuration:
18 |   - Debug
19 |   - Release
20 | 
21 | install:
22 |   # by default, all script lines are interpreted as batch
23 | 
24 | build:
25 |   project: ALL_BUILD.vcxproj # path to Visual Studio solution or project
26 | 
27 | # scripts to run before build
28 | before_build:
29 |   - echo Running cmake...
30 |   - cd c:\projects\ltp
31 |   - cmake -G "Visual Studio 14 2015 Win64" -DCMAKE_INSTALL_PREFIX=%P%
32 | 
33 | after_build:
34 |   - cd c:\projects\ltp
35 |   - 7z a ltp-win-%PLATFORM%-%CONFIGURATION%.zip bin\examples\%CONFIGURATION%\*_cmdline.exe bin\%CONFIGURATION%\ltp_test.exe lib\%CONFIGURATION%\*.dll
36 | 
37 | artifacts:
38 |   - path: ltp-win-$(platform)-$(configuration).zip
39 |     name: ltp-win-$(platform)-$(configuration).zip
40 | 
--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/data/.gitkeep
--------------------------------------------------------------------------------
/data/examples/cws/raw.txt:
--------------------------------------------------------------------------------
1 | 在已恢复通车的铁轨上,百余名铁路工人正利用列车经过的间隙抓紧补修工作。1
2 | 
--------------------------------------------------------------------------------
/data/examples/cws/test.txt:
--------------------------------------------------------------------------------
1 | 在 已 恢复 通车 的 铁轨 上 , 百 余 名 铁路 工人 正 利用 列车 经过 的 间隙 抓紧 补修 工作 。
2 | 
--------------------------------------------------------------------------------
/data/examples/cws/train.txt:
--------------------------------------------------------------------------------
1 | 在 已 恢复 通车 的 铁轨 上 , 百 余 名 铁路 工人 正 利用 列车 经过 的 间隙 抓紧 补修 工作 。
2 | 
--------------------------------------------------------------------------------
/data/examples/cws/val.txt:
--------------------------------------------------------------------------------
1 | 在 已 恢复 通车 的 铁轨 上 , 百 余 名 铁路 工人 正 利用 列车 经过 的 间隙 抓紧 补修 工作 。
2 | 
--------------------------------------------------------------------------------
/data/examples/ner/raw.txt:
--------------------------------------------------------------------------------
1 | 台湾/ns 是/v 中国/ns 领土/n 不可分割/i 的/u 一/m 部分/n 。/wp
2 | 
--------------------------------------------------------------------------------
/data/examples/ner/test.txt:
--------------------------------------------------------------------------------
1 | 台湾/ns/S-Ns 是/v/O 中国/ns/S-Ns 领土/n/O 不可分割/i/O 的/u/O 一/m/O 部分/n/O 。/wp/O
2 | 
--------------------------------------------------------------------------------
/data/examples/ner/train.txt:
--------------------------------------------------------------------------------
1 | 台湾/ns/S-Ns 是/v/O 中国/ns/S-Ns 领土/n/O 不可分割/i/O 的/u/O 一/m/O 部分/n/O 。/wp/O
2 | 
--------------------------------------------------------------------------------
/data/examples/ner/val.txt:
--------------------------------------------------------------------------------
1 | 台湾/ns/S-Ns 是/v/O 中国/ns/S-Ns 领土/n/O 不可分割/i/O 的/u/O 一/m/O 部分/n/O 。/wp/O
2 | 
--------------------------------------------------------------------------------
/data/examples/ner/vocab.txt:
--------------------------------------------------------------------------------
1 | O
2 | B-Nh
3 | B-Ni
4 | B-Ns
5 | E-Nh
6 | E-Ni
7 | E-Ns
8 | I-Nh
9 | I-Ni
10 | I-Ns
11 | S-Nh
12 | S-Ni
13 | S-Ns
14 | 
--------------------------------------------------------------------------------
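The NER training files above encode one sentence per line as `token/pos/ne-tag` triples, with BIES/O-style entity tags drawn from `vocab.txt`. A minimal sketch of a parser for this layout, splitting from the right so that the two rightmost slashes always delimit the tags:

```python
# Parse one line of the `token/pos/ner` format used by data/examples/ner/*.txt.
line = "台湾/ns/S-Ns 是/v/O 中国/ns/S-Ns 领土/n/O 不可分割/i/O 的/u/O 一/m/O 部分/n/O 。/wp/O"

for item in line.split():
    token, pos, ner = item.rsplit("/", 2)  # rightmost two "/" separate the tags
    print(token, pos, ner)
```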
/data/examples/pos/raw.txt:
--------------------------------------------------------------------------------
1 | 中 葡 总理 对 两 国 关系 的 现状 给予 了 积极 的 评价 。
2 | 
--------------------------------------------------------------------------------
/data/examples/pos/test.txt:
--------------------------------------------------------------------------------
1 | 中/j 葡/j 总理/n 对/p 两/m 国/n 关系/n 的/u 现状/n 给予/v 了/u 积极/a 的/u 评价/v 。/wp
2 | 
--------------------------------------------------------------------------------
/data/examples/pos/train.txt:
--------------------------------------------------------------------------------
1 | 中/j 葡/j 总理/n 对/p 两/m 国/n 关系/n 的/u 现状/n 给予/v 了/u 积极/a 的/u 评价/v 。/wp
2 | 
--------------------------------------------------------------------------------
/data/examples/pos/val.txt:
--------------------------------------------------------------------------------
1 | 中/j 葡/j 总理/n 对/p 两/m 国/n 关系/n 的/u 现状/n 给予/v 了/u 积极/a 的/u 评价/v 。/wp
2 | 
--------------------------------------------------------------------------------
/data/examples/pos/vocab.txt:
--------------------------------------------------------------------------------
1 | a
2 | b
3 | c
4 | d
5 | e
6 | h
7 | i
8 | j
9 | k
10 | m
11 | n
12 | nd
13 | nh
14 | ni
15 | nl
16 | ns
17 | nt
18 | nz
19 | o
20 | p
21 | q
22 | r
23 | u
24 | v
25 | wp
26 | ws
27 | z
28 | 
--------------------------------------------------------------------------------
/python/core/.env.example:
--------------------------------------------------------------------------------
1 | # example of file for storing private and user specific environment variables, like keys or system paths
2 | # rename it to ".env" (excluded from version control by default)
3 | # .env is loaded by train.py automatically
4 | # hydra allows you to reference variables in .yaml configs with special syntax: ${oc.env:MY_VAR}
5 | 
6 | MY_VAR="/home/user/my/system/path"
7 | 
--------------------------------------------------------------------------------
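As the comments in `.env.example` note, values exported there can be referenced from the YAML configs via the `${oc.env:...}` resolver. A minimal sketch (assuming only `omegaconf` is installed) of how that resolution behaves:

```python
import os
from omegaconf import OmegaConf

os.environ["MY_VAR"] = "/home/user/my/system/path"

# ${oc.env:...} is resolved lazily, when the value is accessed.
cfg = OmegaConf.create({"system_path": "${oc.env:MY_VAR}"})
print(cfg.system_path)  # -> /home/user/my/system/path

# A default can follow a comma, as in the experiment configs' run names.
cfg2 = OmegaConf.create({"job_id": "${oc.env:SLURM_JOB_ID,localhost}"})
print(cfg2.job_id)  # -> "localhost" unless SLURM_JOB_ID is set
```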
/python/core/LICENSE:
--------------------------------------------------------------------------------
1 | 1. 语言技术平台面向国内外大学、中科院各研究所以及个人研究者免费开放源代码,但如上述机构和个人将该平台用于商业目的(如企业合作项目等)则需要付费。
2 | 2. 除上述机构以外的企事业单位,如申请使用该平台,需付费。
3 | 3. 凡涉及付费问题,请发邮件到 car@ir.hit.edu.cn 洽商。
4 | 4. 如果您在 LTP 基础上发表论文或取得科研成果,请您在发表论文和申报成果时声明"使用了哈工大社会计算与信息检索研究中心研制的语言技术平台(LTP)".
5 |    同时,发信给car@ir.hit.edu.cn,说明发表论文或申报成果的题目、出处等。
6 | 
--------------------------------------------------------------------------------
/python/core/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include requirements.txt
3 | 
4 | recursive-include ltp_core *
5 | 
6 | recursive-exclude * *.pyc
7 | recursive-exclude * .DS_Store
8 | recursive-exclude * __pycache__
9 | 
--------------------------------------------------------------------------------
/python/core/Makefile:
--------------------------------------------------------------------------------
1 | 
2 | help: ## Show help
3 | 	@grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
4 | 
5 | clean: ## Clean autogenerated files
6 | 	rm -rf dist
7 | 	find . -type f -name "*.DS_Store" -ls -delete
8 | 	find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf
9 | 	find . | grep -E ".pytest_cache" | xargs rm -rf
10 | 	find . | grep -E ".ipynb_checkpoints" | xargs rm -rf
11 | 	rm -f .coverage
12 | 
13 | clean-logs: ## Clean logs
14 | 	rm -rf logs/**
15 | 
16 | style: ## Run pre-commit hooks
17 | 	pre-commit run -a
18 | 
19 | sync: ## Merge changes from main branch to your current branch
20 | 	git fetch --all
21 | 	git merge main
22 | 
23 | test: ## Run not slow tests
24 | 	pytest -k "not slow"
25 | 
26 | test-full: ## Run all tests
27 | 	pytest
28 | 
29 | train: ## Train the model
30 | 	python ltp_core/train.py experiment=example
31 | 
32 | debug: ## Enter debugging mode with pdb
33 | 	#
34 | 	# tips:
35 | 	# - use "import pdb; pdb.set_trace()" to set breakpoint
36 | 	# - use "h" to print all commands
37 | 	# - use "n" to execute the next line
38 | 	# - use "c" to run until the breakpoint is hit
39 | 	# - use "l" to print src code around current line, "ll" for full function code
40 | 	# - docs: https://docs.python.org/3/library/pdb.html
41 | 	#
42 | 	python -m pdb ltp_core/train.py debug=default
43 | 
--------------------------------------------------------------------------------
/python/core/README.md:
--------------------------------------------------------------------------------
1 | | Language                             | version                                                                                                                                                                                                                                                                                                                 |
2 | | ------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
3 | | [Python](python/interface/README.md) | [![LTP](https://img.shields.io/pypi/v/ltp?label=LTP)](https://pypi.org/project/ltp) [![LTP-Core](https://img.shields.io/pypi/v/ltp-core?label=LTP-Core)](https://pypi.org/project/ltp-core) [![LTP-Extension](https://img.shields.io/pypi/v/ltp-extension?label=LTP-Extension)](https://pypi.org/project/ltp-extension) |
4 | | [Rust](rust/ltp/README.md)           | [![LTP](https://img.shields.io/crates/v/ltp?label=LTP)](https://crates.io/crates/ltp)                                                                                                                                                                                                                                   |
5 | 
6 | # LTP Core
7 | 
8 | 为 LTP 神经网络模型提供支持。
9 | 
--------------------------------------------------------------------------------
/python/core/bash/eval.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #SBATCH -N 1
3 | #SBATCH -t 7-00:00:00
4 | 
5 | export TOKENIZERS_PARALLELISM=false
6 | PYTHONPATH=. python ltp_core/eval.py "$@"
7 | 
--------------------------------------------------------------------------------
/python/core/bash/train.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #SBATCH -N 1
3 | #SBATCH -t 7-00:00:00
4 | 
5 | export TOKENIZERS_PARALLELISM=false
6 | PYTHONPATH=. python ltp_core/train.py "$@"
7 | 
--------------------------------------------------------------------------------
/python/core/configs/callbacks/default.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 |   - model_checkpoint.yaml
3 |   - early_stopping.yaml
4 |   - model_summary.yaml
5 |   - rich_progress_bar.yaml
6 |   - _self_
7 | 
8 | model_checkpoint:
9 |   dirpath: ${paths.output_dir}/checkpoints
10 |   filename: "epoch_{epoch:03d}"
11 |   monitor: "val/acc"
12 |   mode: "max"
13 |   save_last: True
14 |   auto_insert_metric_name: False
15 | 
16 | early_stopping:
17 |   monitor: "val/acc"
18 |   patience: 100
19 |   mode: "max"
20 | 
21 | model_summary:
22 |   max_depth: -1
23 | 
--------------------------------------------------------------------------------
/python/core/configs/callbacks/early_stopping.yaml:
--------------------------------------------------------------------------------
1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.EarlyStopping.html
2 | 
3 | # Monitor a metric and stop training when it stops improving.
4 | # Look at the above link for more detailed information.
5 | early_stopping:
6 |   _target_: pytorch_lightning.callbacks.EarlyStopping
7 |   monitor: ??? # quantity to be monitored, must be specified !!!
8 |   min_delta: 0. # minimum change in the monitored quantity to qualify as an improvement
9 |   patience: 3 # number of checks with no improvement after which training will be stopped
10 |   verbose: False # verbosity mode
11 |   mode: "min" # "max" means higher metric value is better, can be also "min"
12 |   strict: True # whether to crash the training if monitor is not found in the validation metrics
13 |   check_finite: True # when set True, stops training when the monitor becomes NaN or infinite
14 |   stopping_threshold: null # stop training immediately once the monitored quantity reaches this threshold
15 |   divergence_threshold: null # stop training as soon as the monitored quantity becomes worse than this threshold
16 |   check_on_train_epoch_end: null # whether to run early stopping at the end of the training epoch
17 |   # log_rank_zero_only: False # this keyword argument isn't available in stable version
18 | 
--------------------------------------------------------------------------------
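These callback configs are plain Hydra `_target_` blocks; a sketch (assuming `hydra-core` and `pytorch-lightning` are installed) of how such a block becomes a live callback object at runtime:

```python
from hydra.utils import instantiate
from omegaconf import OmegaConf

# Mirrors the early_stopping config above, with the mandatory ??? fields
# filled in the way the experiment configs override them.
cfg = OmegaConf.create(
    {
        "_target_": "pytorch_lightning.callbacks.EarlyStopping",
        "monitor": "val/acc",
        "patience": 3,
        "mode": "max",
    }
)
early_stopping = instantiate(cfg)  # imports the class and calls it with the kwargs
print(type(early_stopping).__name__)  # -> EarlyStopping
```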
/python/core/configs/callbacks/model_checkpoint.yaml:
--------------------------------------------------------------------------------
1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.ModelCheckpoint.html
2 | 
3 | # Save the model periodically by monitoring a quantity.
4 | # Look at the above link for more detailed information.
5 | model_checkpoint:
6 |   _target_: pytorch_lightning.callbacks.ModelCheckpoint
7 |   dirpath: null # directory to save the model file
8 |   filename: null # checkpoint filename
9 |   monitor: null # name of the logged metric which determines when model is improving
10 |   verbose: False # verbosity mode
11 |   save_last: null # additionally always save an exact copy of the last checkpoint to a file last.ckpt
12 |   save_top_k: 1 # save k best models (determined by above metric)
13 |   mode: "min" # "max" means higher metric value is better, can be also "min"
14 |   auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name
15 |   save_weights_only: False # if True, then only the model’s weights will be saved
16 |   every_n_train_steps: null # number of training steps between checkpoints
17 |   train_time_interval: null # checkpoints are monitored at the specified time interval
18 |   every_n_epochs: null # number of epochs between checkpoints
19 |   save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation
20 | 
--------------------------------------------------------------------------------
/python/core/configs/callbacks/model_summary.yaml:
--------------------------------------------------------------------------------
1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.RichModelSummary.html
2 | 
3 | # Generates a summary of all layers in a LightningModule with rich text formatting.
4 | # Look at the above link for more detailed information.
5 | model_summary:
6 |   _target_: pytorch_lightning.callbacks.RichModelSummary
7 |   max_depth: 1 # the maximum depth of layer nesting that the summary will include
8 | 
--------------------------------------------------------------------------------
/python/core/configs/callbacks/none.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/configs/callbacks/none.yaml
--------------------------------------------------------------------------------
/python/core/configs/callbacks/rich_progress_bar.yaml:
--------------------------------------------------------------------------------
1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.RichProgressBar.html
2 | 
3 | # Create a progress bar with rich text formatting.
4 | # Look at the above link for more detailed information.
5 | rich_progress_bar:
6 |   _target_: pytorch_lightning.callbacks.RichProgressBar
7 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/cls_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.TaskDataModule
2 | 
3 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
4 | 
5 | datamodules:
6 |   batch_size: 16
7 |   num_workers: 4
8 |   pin_memory: True
9 |   load:
10 |     _target_: ltp_core.datamodules.adapters.sentence_classification.build_dataset
11 |     _partial_: true
12 |     task_name: "cola"
13 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/cws_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.TaskDataModule
2 | 
3 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
4 | 
5 | datamodules:
6 |   batch_size: 16
7 |   num_workers: 4
8 |   pin_memory: True
9 |   load:
10 |     _target_: ltp_core.datamodules.adapters.segmention.build_dataset
11 |     _partial_: true
12 |     task_name: "cws"
13 |     data_dir: "data/conllu"
14 |     mode: "bmes"
15 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/dep_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.TaskDataModule
2 | 
3 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
4 | 
5 | datamodules:
6 |   batch_size: 16
7 |   num_workers: 4
8 |   pin_memory: True
9 |   load:
10 |     _target_: ltp_core.datamodules.adapters.dependency_parsing.build_dataset
11 |     _partial_: true
12 |     task_name: "dep"
13 |     data_dir: "data/conllu"
14 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/multi_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.MultiTaskDataModule
2 | 
3 | tau: 0.8
4 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
5 | 
6 | datamodules:
7 |   cws:
8 |     batch_size: 16
9 |     num_workers: 4
10 |     pin_memory: True
11 |     load:
12 |       _target_: ltp_core.datamodules.adapters.segmention.build_dataset
13 |       _partial_: true
14 |       task_name: "cws"
15 |       data_dir: "data/conllu"
16 |       mode: "bmes"
17 | 
18 |   pos:
19 |     batch_size: 16
20 |     num_workers: 4
21 |     pin_memory: True
22 |     load:
23 |       _target_: ltp_core.datamodules.adapters.postagger.build_dataset
24 |       _partial_: true
25 |       task_name: "pos"
26 |       data_dir: "data/conllu"
27 | 
28 |   ner:
29 |     batch_size: 16
30 |     num_workers: 4
31 |     pin_memory: True
32 |     load:
33 |       _target_: ltp_core.datamodules.adapters.named_entity_recognition.build_dataset
34 |       _partial_: true
35 |       task_name: "ner"
36 |       data_dir: "data/ner"
37 | 
38 |   srl:
39 |     batch_size: 16
40 |     num_workers: 4
41 |     pin_memory: True
42 |     load:
43 |       _target_: ltp_core.datamodules.adapters.semantic_role_labeling.build_dataset
44 |       _partial_: true
45 |       task_name: "srl"
46 |       data_dir: "data/srl"
47 | 
48 |   dep:
49 |     batch_size: 16
50 |     num_workers: 4
51 |     pin_memory: True
52 |     load:
53 |       _target_: ltp_core.datamodules.adapters.dependency_parsing.build_dataset
54 |       _partial_: true
55 |       task_name: "dep"
56 |       data_dir: "data/conllu"
57 | 
58 |   sdp:
59 |     batch_size: 16
60 |     num_workers: 4
61 |     pin_memory: True
62 |     load:
63 |       _target_: ltp_core.datamodules.adapters.semantic_dependency_parsing.build_dataset
64 |       _partial_: true
65 |       task_name: "sdp"
66 |       data_dir: "data/conllu"
67 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/ner_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.TaskDataModule
2 | 
3 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
4 | 
5 | datamodules:
6 |   batch_size: 16
7 |   num_workers: 4
8 |   pin_memory: True
9 |   load:
10 |     _target_: ltp_core.datamodules.adapters.named_entity_recognition.build_dataset
11 |     _partial_: true
12 |     task_name: "ner"
13 |     data_dir: "data/ner"
14 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/pos_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.TaskDataModule
2 | 
3 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
4 | 
5 | datamodules:
6 |   batch_size: 16
7 |   num_workers: 4
8 |   pin_memory: True
9 |   load:
10 |     _target_: ltp_core.datamodules.adapters.postagger.build_dataset
11 |     _partial_: true
12 |     task_name: "pos"
13 |     data_dir: "data/conllu"
14 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/sdp_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.TaskDataModule
2 | 
3 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
4 | 
5 | datamodules:
6 |   batch_size: 16
7 |   num_workers: 4
8 |   pin_memory: True
9 |   load:
10 |     _target_: ltp_core.datamodules.adapters.semantic_dependency_parsing.build_dataset
11 |     _partial_: true
12 |     task_name: "sdp"
13 |     data_dir: "data/conllu"
14 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/srl_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.TaskDataModule
2 | 
3 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
4 | 
5 | datamodules:
6 |   batch_size: 16
7 |   num_workers: 4
8 |   pin_memory: True
9 |   load:
10 |     _target_: ltp_core.datamodules.adapters.semantic_role_labeling.build_dataset
11 |     _partial_: true
12 |     task_name: "srl"
13 |     data_dir: "data/srl"
14 | 
--------------------------------------------------------------------------------
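`_partial_: true` in the `load:` blocks above makes Hydra return a `functools.partial` instead of calling the adapter immediately, so the datamodule can supply the remaining arguments later. A sketch, assuming `hydra-core` is installed and `ltp_core` is importable; the later-call argument is hypothetical:

```python
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "_target_": "ltp_core.datamodules.adapters.postagger.build_dataset",
        "_partial_": True,
        "task_name": "pos",
        "data_dir": "data/conllu",
    }
)

# With _partial_, instantiate() does NOT call build_dataset; it returns
# functools.partial(build_dataset, task_name="pos", data_dir="data/conllu").
load = instantiate(cfg)
# dataset = load(...)  # the datamodule later fills in the missing arguments
```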
/python/core/configs/debug/default.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # default debugging setup, runs 1 full epoch
4 | # other debugging configs can inherit from this one
5 | 
6 | # overwrite task name so debugging logs are stored in separate folder
7 | task_name: "debug"
8 | 
9 | # disable callbacks and loggers during debugging
10 | callbacks: null
11 | logger: null
12 | 
13 | extras:
14 |   ignore_warnings: False
15 |   enforce_tags: False
16 | 
17 | # sets level of all command line loggers to 'DEBUG'
18 | # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/
19 | hydra:
20 |   job_logging:
21 |     root:
22 |       level: DEBUG
23 | 
24 | # use this to also set hydra loggers to 'DEBUG'
25 | # verbose: True
26 | 
27 | trainer:
28 |   max_epochs: 1
29 |   accelerator: cpu # debuggers don't like gpus
30 |   devices: 1 # debuggers don't like multiprocessing
31 |   detect_anomaly: true # raise exception if NaN or +/-inf is detected in any tensor
32 | 
33 | datamodule:
34 |   num_workers: 0 # debuggers don't like multiprocessing
35 |   pin_memory: False # disable gpu memory pin
36 | 
--------------------------------------------------------------------------------
/python/core/configs/debug/fdr.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # runs 1 train, 1 validation and 1 test step
4 | 
5 | defaults:
6 |   - default.yaml
7 | 
8 | trainer:
9 |   fast_dev_run: true
10 | 
--------------------------------------------------------------------------------
/python/core/configs/debug/limit.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # uses only 1% of the training data and 5% of validation/test data
4 | 
5 | defaults:
6 |   - default.yaml
7 | 
8 | trainer:
9 |   max_epochs: 3
10 |   limit_train_batches: 0.01
11 |   limit_val_batches: 0.05
12 |   limit_test_batches: 0.05
13 | 
--------------------------------------------------------------------------------
/python/core/configs/debug/overfit.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # overfits to 3 batches
4 | 
5 | defaults:
6 |   - default.yaml
7 | 
8 | trainer:
9 |   max_epochs: 20
10 |   overfit_batches: 3
11 | 
12 | # model ckpt and early stopping need to be disabled during overfitting
13 | callbacks: null
14 | 
--------------------------------------------------------------------------------
/python/core/configs/debug/profiler.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # runs with execution time profiling
4 | 
5 | defaults:
6 |   - default.yaml
7 | 
8 | trainer:
9 |   max_epochs: 1
10 |   profiler: "simple"
11 |   # profiler: "advanced"
12 |   # profiler: "pytorch"
13 | 
--------------------------------------------------------------------------------
/python/core/configs/eval.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | defaults:
4 |   - _self_
5 |   - datamodule: multi_datamodules.yaml
6 |   - model: multi_model.yaml
7 |   - logger: null
8 |   - trainer: default.yaml
9 |   - paths: default.yaml
10 |   - extras: default.yaml
11 |   - hydra: default.yaml
12 | 
13 | task_name: "eval"
14 | 
15 | tags: ["dev"]
16 | 
17 | # passing checkpoint path is necessary for evaluation
18 | ckpt_path: ???
19 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/cls.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: cls_datamodules.yaml
8 |   - override /model: cls_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["cls"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "sent-cls-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/cws.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: cws_datamodules.yaml
8 |   - override /model: cws_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["cws"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/dep.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: dep_datamodules.yaml
8 |   - override /model: dep_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["dep"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/example.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: multi_datamodules.yaml
8 |   - override /model: multi_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["ltp"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 3
22 |   gradient_clip_val: 1.0
23 | 
24 | model:
25 |   model:
26 |     backbone:
27 |       pretrained_model_name_or_path: hfl/chinese-electra-small-generator
28 |     heads:
29 |       cws:
30 |         input_size: 64
31 |         num_labels: 4
32 |       pos:
33 |         input_size: 64
34 |       ner:
35 |         input_size: 64
36 |       srl:
37 |         input_size: 64
38 |         hidden_size: 32
39 |       dep:
40 |         input_size: 64
41 |       sdp:
42 |         input_size: 64
43 | 
44 | logger:
45 |   wandb:
46 |     tags: "${tags}"
47 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
48 | 
49 | callbacks:
50 |   model_checkpoint:
51 |     monitor: "val/mean_metric"
52 |     mode: "max"
53 | 
54 |   early_stopping:
55 |     monitor: "val/mean_metric"
56 |     patience: 3
57 |     mode: "max"
58 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/multi.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: multi_datamodules.yaml
8 |   - override /model: multi_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["ltp"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/multi_bi.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: multi_datamodules.yaml
8 |   - override /model: multi_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["ltp"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
39 | datamodule:
40 |   datamodules:
41 |     cws:
42 |       load:
43 |         mode: "bi"
44 | 
45 | model:
46 |   metrics:
47 |     cws:
48 |       _ltp_target_: ltp_core.models.metrics.token.SeqEvalF1
49 |       tags_or_path: ["B", "I"]
50 |   model:
51 |     heads:
52 |       cws:
53 |         num_labels: 2
54 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/ner.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: ner_datamodules.yaml
8 |   - override /model: ner_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["ner"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/pos.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: pos_datamodules.yaml
8 |   - override /model: pos_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["pos"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/sdp.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: sdp_datamodules.yaml
8 |   - override /model: sdp_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["sdp"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/srl.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: srl_datamodules.yaml
8 |   - override /model: srl_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["srl"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/extras/default.yaml:
--------------------------------------------------------------------------------
1 | # disable python warnings if they annoy you
2 | ignore_warnings: False
3 | 
4 | # ask user for tags if none are provided in the config
5 | enforce_tags: True
6 | 
7 | # pretty print config tree at the start of the run using Rich library
8 | print_config: True
9 | 
--------------------------------------------------------------------------------
/python/core/configs/hparams_search/ltp_optuna.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # example hyperparameter optimization of some experiment with Optuna:
4 | # python train.py -m hparams_search=mnist_optuna experiment=example
5 | 
6 | defaults:
7 |   - override /hydra/sweeper: optuna
8 | 
9 | # choose metric which will be optimized by Optuna
10 | # make sure this is the correct name of some metric logged in lightning module!
11 | optimized_metric: "val/mean_metric"
12 | 
13 | # here we define Optuna hyperparameter search
14 | # it optimizes for value returned from function with @hydra.main decorator
15 | # docs: https://hydra.cc/docs/next/plugins/optuna_sweeper
16 | hydra:
17 |   mode: "MULTIRUN" # set hydra to multirun by default if this config is attached
18 | 
19 |   sweeper:
20 |     _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper
21 | 
22 |     # storage URL to persist optimization results
23 |     # for example, you can use SQLite if you set 'sqlite:///example.db'
24 |     storage: null
25 | 
26 |     # name of the study to persist optimization results
27 |     study_name: null
28 | 
29 |     # number of parallel workers
30 |     n_jobs: 1
31 | 
32 |     # 'minimize' or 'maximize' the objective
33 |     direction: maximize
34 | 
35 |     # total number of runs that will be executed
36 |     n_trials: 20
37 | 
38 |     # choose Optuna hyperparameter sampler
39 |     # you can choose bayesian sampler (tpe), random search (without optimization), grid sampler, and others
40 |     # docs: https://optuna.readthedocs.io/en/stable/reference/samplers.html
41 |     sampler:
42 |       _target_: optuna.samplers.TPESampler
43 |       seed: 1234
44 |       n_startup_trials: 10 # number of random sampling runs before optimization starts
45 | 
46 |     # define hyperparameter search space
47 |     params:
48 |       model.optimizer.lr: interval(0.0001, 0.1)
49 | 
--------------------------------------------------------------------------------
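The sweeper above delegates trial management to Optuna; a standalone sketch (assuming `optuna` is installed, with a stand-in objective in place of a real training run) of the equivalent study:

```python
import optuna

def objective(trial):
    # mirrors `model.optimizer.lr: interval(0.0001, 0.1)` in the params above
    lr = trial.suggest_float("model.optimizer.lr", 1e-4, 1e-1)
    return -abs(lr - 1e-2)  # stand-in for the logged val/mean_metric

study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=1234, n_startup_trials=10),
)
study.optimize(objective, n_trials=20)
print(study.best_params)
```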
11 | optimized_metric: "val/mean_metric" 12 | 13 | # here we define Optuna hyperparameter search 14 | # it optimizes for value returned from function with @hydra.main decorator 15 | # docs: https://hydra.cc/docs/next/plugins/optuna_sweeper 16 | hydra: 17 | mode: "MULTIRUN" # set hydra to multirun by default if this config is attached 18 | 19 | sweeper: 20 | _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper 21 | 22 | # storage URL to persist optimization results 23 | # for example, you can use SQLite if you set 'sqlite:///example.db' 24 | storage: null 25 | 26 | # name of the study to persist optimization results 27 | study_name: null 28 | 29 | # number of parallel workers 30 | n_jobs: 1 31 | 32 | # 'minimize' or 'maximize' the objective 33 | direction: maximize 34 | 35 | # total number of runs that will be executed 36 | n_trials: 20 37 | 38 | # choose Optuna hyperparameter sampler 39 | # you can choose bayesian sampler (tpe), random search (without optimization), grid sampler, and others 40 | # docs: https://optuna.readthedocs.io/en/stable/reference/samplers.html 41 | sampler: 42 | _target_: optuna.samplers.TPESampler 43 | seed: 1234 44 | n_startup_trials: 10 # number of random sampling runs before optimization starts 45 | 46 | # define hyperparameter search space 47 | params: 48 | model.optimizer.lr: interval(0.0001, 0.1) 49 | -------------------------------------------------------------------------------- /python/core/configs/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # https://hydra.cc/docs/configure_hydra/intro/ 2 | 3 | # enable color logging 4 | defaults: 5 | - override hydra_logging: colorlog 6 | - override job_logging: colorlog 7 | 8 | # output directory, generated dynamically on each run 9 | run: 10 | dir: ${paths.log_dir}/${task_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S} 11 | sweep: 12 | dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} 13 | subdir: ${hydra.job.num} 14 | -------------------------------------------------------------------------------- /python/core/configs/local/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/configs/local/.gitkeep -------------------------------------------------------------------------------- /python/core/configs/logger/comet.yaml: -------------------------------------------------------------------------------- 1 | # https://www.comet.ml 2 | 3 | comet: 4 | _target_: pytorch_lightning.loggers.comet.CometLogger 5 | api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable 6 | save_dir: "${paths.output_dir}" 7 | project_name: "ltp" 8 | rest_api_key: null 9 | # experiment_name: "" 10 | experiment_key: null # set to resume experiment 11 | offline: False 12 | prefix: "" 13 | -------------------------------------------------------------------------------- /python/core/configs/logger/csv.yaml: -------------------------------------------------------------------------------- 1 | # csv logger built in lightning 2 | 3 | csv: 4 | _target_: pytorch_lightning.loggers.csv_logs.CSVLogger 5 | save_dir: "${paths.output_dir}" 6 | name: "csv/" 7 | prefix: "" 8 | -------------------------------------------------------------------------------- /python/core/configs/logger/many_loggers.yaml: -------------------------------------------------------------------------------- 1 | # train with many 
loggers at once 2 | 3 | defaults: 4 | - wandb.yaml 5 | - tensorboard.yaml 6 | - csv.yaml 7 | # - comet.yaml 8 | # - mlflow.yaml 9 | # - neptune.yaml 10 | -------------------------------------------------------------------------------- /python/core/configs/logger/mlflow.yaml: -------------------------------------------------------------------------------- 1 | # https://mlflow.org 2 | 3 | mlflow: 4 | _target_: pytorch_lightning.loggers.mlflow.MLFlowLogger 5 | # experiment_name: "" 6 | # run_name: "" 7 | tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI 8 | tags: null 9 | # save_dir: "./mlruns" 10 | prefix: "" 11 | artifact_location: null 12 | # run_id: "" 13 | -------------------------------------------------------------------------------- /python/core/configs/logger/neptune.yaml: -------------------------------------------------------------------------------- 1 | # https://neptune.ai 2 | 3 | neptune: 4 | _target_: pytorch_lightning.loggers.neptune.NeptuneLogger 5 | api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable 6 | project: "ltp" 7 | # name: "" 8 | log_model_checkpoints: True 9 | prefix: "" 10 | -------------------------------------------------------------------------------- /python/core/configs/logger/tensorboard.yaml: -------------------------------------------------------------------------------- 1 | # https://www.tensorflow.org/tensorboard/ 2 | 3 | tensorboard: 4 | _target_: pytorch_lightning.loggers.tensorboard.TensorBoardLogger 5 | save_dir: "${paths.output_dir}/tensorboard/" 6 | name: null 7 | log_graph: False 8 | default_hp_metric: True 9 | prefix: "" 10 | # version: "" 11 | -------------------------------------------------------------------------------- /python/core/configs/logger/wandb.yaml: -------------------------------------------------------------------------------- 1 | # https://wandb.ai 2 | 3 | wandb: 4 | _target_: pytorch_lightning.loggers.wandb.WandbLogger 5 | # name: "" # name of the run (normally generated by wandb) 6 | save_dir: "${paths.output_dir}" 7 | offline: False 8 | id: null # pass correct id to resume experiment! 
9 | anonymous: null # enable anonymous logging 10 | project: "ltp" 11 | log_model: False # upload lightning ckpts 12 | prefix: "" # a string to put at the beginning of metric keys 13 | # entity: "" # set to name of your wandb team 14 | group: "" 15 | tags: [] 16 | job_type: "" 17 | -------------------------------------------------------------------------------- /python/core/configs/model/cls_model.yaml: -------------------------------------------------------------------------------- 1 | _target_: ltp_core.models.lit_model.LTPLitModule 2 | 3 | optimizer: 4 | _ltp_target_: torch.optim.AdamW 5 | _ltp_partial_: true 6 | lr: 2e-5 7 | weight_decay: 0.0 8 | 9 | layer_lrs: 10 | _ltp_target_: ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf 11 | _ltp_partial_: true 12 | transformer_prefix: backbone 13 | learning_rate: ${model.optimizer.lr} 14 | layer_decay: 0.8 # 0.8 for Base/Small, 0.9 for Large 15 | n_layers: 12 16 | crf_prefix: "crf" 17 | crf_ratio: 10.0 18 | 19 | scheduler: 20 | _ltp_target_: ltp_core.models.optimization.scheduler.compose_with_scheduler 21 | _ltp_partial_: true 22 | scheduler_type: "exponential" 23 | scheduler_args: null 24 | warmup_ratio: 0.1 25 | interval: "epoch" 26 | frequency: 3 27 | 28 | criterions: 29 | cls: 30 | _ltp_target_: ltp_core.models.criterion.sent.ClassificationLoss 31 | 32 | metrics: 33 | cls: 34 | _ltp_target_: ltp_core.models.metrics.sent.ClsAccuracy 35 | 36 | model: 37 | _ltp_target_: ltp_core.models.ltp_model.LTPModule 38 | backbone: 39 | _ltp_target_: transformers.AutoModel.from_pretrained 40 | pretrained_model_name_or_path: hfl/chinese-electra-180g-base-discriminator 41 | 42 | processor: 43 | cls: 44 | _ltp_target_: ltp_core.models.processor.ClsOnly 45 | 46 | heads: 47 | cls: 48 | _ltp_target_: ltp_core.models.components.sent.MLPClassifier 49 | input_size: 768 50 | num_labels: 2 51 | dropout: 0.1 52 | -------------------------------------------------------------------------------- /python/core/configs/model/cws_model.yaml: -------------------------------------------------------------------------------- 1 | _target_: ltp_core.models.lit_model.LTPLitModule 2 | 3 | optimizer: 4 | _ltp_target_: torch.optim.AdamW 5 | _ltp_partial_: true 6 | lr: 2e-5 7 | weight_decay: 0.0 8 | 9 | layer_lrs: 10 | _ltp_target_: ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf 11 | _ltp_partial_: true 12 | transformer_prefix: backbone 13 | learning_rate: ${model.optimizer.lr} 14 | layer_decay: 0.8 # 0.8 for Base/Small, 0.9 for Large 15 | n_layers: 12 16 | crf_prefix: "crf" 17 | crf_ratio: 10.0 18 | 19 | scheduler: 20 | _ltp_target_: ltp_core.models.optimization.scheduler.compose_with_scheduler 21 | _ltp_partial_: true 22 | scheduler_type: "linear" 23 | scheduler_args: null 24 | warmup_ratio: 0.02 25 | interval: "step" 26 | frequency: 1 27 | 28 | criterions: 29 | cws: 30 | _ltp_target_: ltp_core.models.criterion.token.TokenLoss 31 | 32 | metrics: 33 | cws: 34 | _ltp_target_: ltp_core.models.metrics.token.SeqEvalF1 35 | tags_or_path: ["B", "M", "E", "S"] 36 | 37 | model: 38 | _ltp_target_: ltp_core.models.ltp_model.LTPModule 39 | backbone: 40 | _ltp_target_: transformers.AutoModel.from_pretrained 41 | pretrained_model_name_or_path: hfl/chinese-electra-180g-base-discriminator 42 | 43 | processor: 44 | cws: 45 | _ltp_target_: ltp_core.models.processor.TokenOnly 46 | 47 | heads: 48 | cws: 49 | _ltp_target_: ltp_core.models.components.token.MLPTokenClassifier 50 | input_size: 768 51 | num_labels: 4 52 | dropout: 0.1 53 | 
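The layer_lrs block shared by the model configs above and below implements layer-wise learning-rate decay: the top transformer layer trains at the full learning rate, each layer below it is scaled by layer_decay, and parameters matching crf_prefix are boosted by crf_ratio. A rough sketch of the arithmetic follows; the function name and loop are illustrative only, while the actual parameter-group construction lives in ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf:

# Sketch of the per-layer schedule implied by the configs above
# (lr=2e-5, layer_decay=0.8, n_layers=12, crf_ratio=10.0); illustrative only.
def layer_lr(base_lr, layer, n_layers=12, layer_decay=0.8):
    """LR for transformer layer `layer` (0 = embeddings, n_layers = top)."""
    return base_lr * layer_decay ** (n_layers - layer)

base_lr = 2e-5
for layer in (12, 6, 0):
    print(f"layer {layer:2d}: lr = {layer_lr(base_lr, layer):.3g}")
# top layer: 2e-5, layer 6: ~5.2e-6, embeddings: ~1.4e-6
print(f"crf:       lr = {base_lr * 10.0:.3g}")  # crf_ratio multiplies the base lr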
-------------------------------------------------------------------------------- /python/core/configs/model/dep_model.yaml: -------------------------------------------------------------------------------- 1 | _target_: ltp_core.models.lit_model.LTPLitModule 2 | 3 | optimizer: 4 | _ltp_target_: torch.optim.AdamW 5 | _ltp_partial_: true 6 | lr: 2e-5 7 | weight_decay: 0.0 8 | 9 | layer_lrs: 10 | _ltp_target_: ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf 11 | _ltp_partial_: true 12 | transformer_prefix: backbone 13 | learning_rate: ${model.optimizer.lr} 14 | layer_decay: 0.8 # 0.8 for Base/Small, 0.9 for Large 15 | n_layers: 12 16 | crf_prefix: "crf" 17 | crf_ratio: 10.0 18 | 19 | scheduler: 20 | _ltp_target_: ltp_core.models.optimization.scheduler.compose_with_scheduler 21 | _ltp_partial_: true 22 | scheduler_type: "linear" 23 | scheduler_args: null 24 | warmup_ratio: 0.02 25 | interval: "step" 26 | frequency: 1 27 | 28 | criterions: 29 | dep: 30 | _ltp_target_: ltp_core.models.criterion.graph.DEPLoss 31 | 32 | metrics: 33 | dep: 34 | _ltp_target_: ltp_core.models.metrics.graph.DEPLas 35 | 36 | model: 37 | _ltp_target_: ltp_core.models.ltp_model.LTPModule 38 | backbone: 39 | _ltp_target_: transformers.AutoModel.from_pretrained 40 | pretrained_model_name_or_path: hfl/chinese-electra-180g-base-discriminator 41 | 42 | processor: 43 | dep: 44 | _ltp_target_: ltp_core.models.processor.WordsWithHead 45 | 46 | heads: 47 | dep: 48 | _ltp_target_: ltp_core.models.components.graph.BiaffineClassifier 49 | input_size: 768 50 | num_labels: 14 51 | dropout: 0.1 52 | -------------------------------------------------------------------------------- /python/core/configs/model/ner_model.yaml: -------------------------------------------------------------------------------- 1 | _target_: ltp_core.models.lit_model.LTPLitModule 2 | 3 | optimizer: 4 | _ltp_target_: torch.optim.AdamW 5 | _ltp_partial_: true 6 | lr: 2e-5 7 | weight_decay: 0.0 8 | 9 | layer_lrs: 10 | _ltp_target_: ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf 11 | _ltp_partial_: true 12 | transformer_prefix: backbone 13 | learning_rate: ${model.optimizer.lr} 14 | layer_decay: 0.8 # 0.8 for Base/Small, 0.9 for Large 15 | n_layers: 12 16 | crf_prefix: "crf" 17 | crf_ratio: 10.0 18 | 19 | scheduler: 20 | _ltp_target_: ltp_core.models.optimization.scheduler.compose_with_scheduler 21 | _ltp_partial_: true 22 | scheduler_type: "linear" 23 | scheduler_args: null 24 | warmup_ratio: 0.02 25 | interval: "step" 26 | frequency: 1 27 | 28 | criterions: 29 | ner: 30 | _ltp_target_: ltp_core.models.criterion.token.TokenLoss 31 | 32 | metrics: 33 | ner: 34 | _ltp_target_: ltp_core.models.metrics.token.SeqEvalF1 35 | tags_or_path: ${datamodule.datamodules.load.data_dir}/vocabs/bio.txt 36 | 37 | model: 38 | _ltp_target_: ltp_core.models.ltp_model.LTPModule 39 | backbone: 40 | _ltp_target_: transformers.AutoModel.from_pretrained 41 | pretrained_model_name_or_path: hfl/chinese-electra-180g-base-discriminator 42 | 43 | processor: 44 | ner: 45 | _ltp_target_: ltp_core.models.processor.WordsOnly 46 | 47 | heads: 48 | ner: 49 | _ltp_target_: ltp_core.models.components.token.MLPTokenClassifier 50 | input_size: 768 51 | num_labels: 13 52 | dropout: 0.1 53 | -------------------------------------------------------------------------------- /python/core/configs/model/pos_model.yaml: -------------------------------------------------------------------------------- 1 | _target_: ltp_core.models.lit_model.LTPLitModule 2 | 3 | optimizer: 4 | 
_ltp_target_: torch.optim.AdamW 5 | _ltp_partial_: true 6 | lr: 2e-5 7 | weight_decay: 0.0 8 | 9 | layer_lrs: 10 | _ltp_target_: ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf 11 | _ltp_partial_: true 12 | transformer_prefix: backbone 13 | learning_rate: ${model.optimizer.lr} 14 | layer_decay: 0.8 # 0.8 for Base/Small, 0.9 for Large 15 | n_layers: 12 16 | crf_prefix: "crf" 17 | crf_ratio: 10.0 18 | 19 | scheduler: 20 | _ltp_target_: ltp_core.models.optimization.scheduler.compose_with_scheduler 21 | _ltp_partial_: true 22 | scheduler_type: "linear" 23 | scheduler_args: null 24 | warmup_ratio: 0.02 25 | interval: "step" 26 | frequency: 1 27 | 28 | criterions: 29 | pos: 30 | _ltp_target_: ltp_core.models.criterion.token.TokenLoss 31 | 32 | metrics: 33 | pos: 34 | _ltp_target_: ltp_core.models.metrics.token.TokenAccuracy 35 | 36 | model: 37 | _ltp_target_: ltp_core.models.ltp_model.LTPModule 38 | backbone: 39 | _ltp_target_: transformers.AutoModel.from_pretrained 40 | pretrained_model_name_or_path: hfl/chinese-electra-180g-base-discriminator 41 | 42 | processor: 43 | pos: 44 | _ltp_target_: ltp_core.models.processor.WordsOnly 45 | 46 | heads: 47 | pos: 48 | _ltp_target_: ltp_core.models.components.token.MLPTokenClassifier 49 | input_size: 768 50 | num_labels: 27 51 | dropout: 0.1 52 | -------------------------------------------------------------------------------- /python/core/configs/model/sdp_model.yaml: -------------------------------------------------------------------------------- 1 | _target_: ltp_core.models.lit_model.LTPLitModule 2 | 3 | optimizer: 4 | _ltp_target_: torch.optim.AdamW 5 | _ltp_partial_: true 6 | lr: 2e-5 7 | weight_decay: 0.0 8 | 9 | layer_lrs: 10 | _ltp_target_: ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf 11 | _ltp_partial_: true 12 | transformer_prefix: backbone 13 | learning_rate: ${model.optimizer.lr} 14 | layer_decay: 0.8 # 0.8 for Base/Small, 0.9 for Large 15 | n_layers: 12 16 | crf_prefix: "crf" 17 | crf_ratio: 10.0 18 | 19 | scheduler: 20 | _ltp_target_: ltp_core.models.optimization.scheduler.compose_with_scheduler 21 | _ltp_partial_: true 22 | scheduler_type: "linear" 23 | scheduler_args: null 24 | warmup_ratio: 0.02 25 | interval: "step" 26 | frequency: 1 27 | 28 | criterions: 29 | sdp: 30 | _ltp_target_: ltp_core.models.criterion.graph.SDPLoss 31 | 32 | metrics: 33 | sdp: 34 | _ltp_target_: ltp_core.models.metrics.graph.SDPLas 35 | 36 | model: 37 | _ltp_target_: ltp_core.models.ltp_model.LTPModule 38 | backbone: 39 | _ltp_target_: transformers.AutoModel.from_pretrained 40 | pretrained_model_name_or_path: hfl/chinese-electra-180g-base-discriminator 41 | 42 | processor: 43 | sdp: 44 | _ltp_target_: ltp_core.models.processor.WordsWithHead 45 | 46 | heads: 47 | sdp: 48 | _ltp_target_: ltp_core.models.components.graph.BiaffineClassifier 49 | input_size: 768 50 | num_labels: 56 51 | dropout: 0.1 52 | -------------------------------------------------------------------------------- /python/core/configs/model/srl_model.yaml: -------------------------------------------------------------------------------- 1 | _target_: ltp_core.models.lit_model.LTPLitModule 2 | 3 | optimizer: 4 | _ltp_target_: torch.optim.AdamW 5 | _ltp_partial_: true 6 | lr: 2e-5 7 | weight_decay: 0.0 8 | 9 | layer_lrs: 10 | _ltp_target_: ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf 11 | _ltp_partial_: true 12 | transformer_prefix: backbone 13 | learning_rate: ${model.optimizer.lr} 14 | layer_decay: 0.8 # 0.8 for Base/Small, 0.9 for Large 15 | 
n_layers: 12 16 | crf_prefix: "crf" 17 | crf_ratio: 10.0 18 | 19 | scheduler: 20 | _ltp_target_: ltp_core.models.optimization.scheduler.compose_with_scheduler 21 | _ltp_partial_: true 22 | scheduler_type: "linear" 23 | scheduler_args: null 24 | warmup_ratio: 0.02 25 | interval: "step" 26 | frequency: 1 27 | 28 | criterions: 29 | srl: 30 | _ltp_target_: ltp_core.models.criterion.token.SRLLoss 31 | 32 | metrics: 33 | srl: 34 | _ltp_target_: ltp_core.models.metrics.token.SRLEvalF1 35 | tags_or_path: ${datamodule.datamodules.load.data_dir}/vocabs/arguments.txt 36 | 37 | model: 38 | _ltp_target_: ltp_core.models.ltp_model.LTPModule 39 | backbone: 40 | _ltp_target_: transformers.AutoModel.from_pretrained 41 | pretrained_model_name_or_path: hfl/chinese-electra-180g-base-discriminator 42 | 43 | processor: 44 | srl: 45 | _ltp_target_: ltp_core.models.processor.WordsOnly 46 | 47 | heads: 48 | srl: 49 | _ltp_target_: ltp_core.models.components.token.BiaffineTokenClassifier 50 | input_size: 768 51 | hidden_size: 300 52 | num_labels: 97 53 | dropout: 0.1 54 | use_crf: True 55 | -------------------------------------------------------------------------------- /python/core/configs/paths/default.yaml: -------------------------------------------------------------------------------- 1 | # path to root directory 2 | # this requires PROJECT_ROOT environment variable to exist 3 | # PROJECT_ROOT is inferred and set by pyrootutils package in `train.py` and `eval.py` 4 | root_dir: ${oc.env:PROJECT_ROOT} 5 | 6 | # path to data directory 7 | data_dir: ${paths.root_dir}/data/ 8 | 9 | # path to logging directory 10 | log_dir: ${paths.root_dir}/logs/ 11 | 12 | # path to output directory, created dynamically by hydra 13 | # path generation pattern is specified in `configs/hydra/default.yaml` 14 | # use it to store all files generated during the run, like ckpts and metrics 15 | output_dir: ${hydra:runtime.output_dir} 16 | 17 | # path to working directory 18 | work_dir: ${hydra:runtime.cwd} 19 | -------------------------------------------------------------------------------- /python/core/configs/train.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # specify here default configuration 4 | # order of defaults determines the order in which configs override each other 5 | defaults: 6 | - _self_ 7 | - datamodule: multi_datamodules.yaml 8 | - model: multi_model.yaml 9 | - callbacks: default.yaml 10 | - logger: null # set logger here or use command line (e.g. `python train.py logger=tensorboard`) 11 | - trainer: default.yaml 12 | - paths: default.yaml 13 | - extras: default.yaml 14 | - hydra: default.yaml 15 | 16 | # experiment configs allow for version control of specific hyperparameters 17 | # e.g. best hyperparameters for given model and datamodule 18 | - experiment: null 19 | 20 | # config for hyperparameter optimization 21 | - hparams_search: null 22 | 23 | # optional local config for machine/user specific settings 24 | # it's optional since it doesn't need to exist and is excluded from version control 25 | - optional local: default.yaml 26 | 27 | # debugging config (enable through command line, e.g. 
`python train.py debug=default`) 28 | - debug: null 29 | 30 | # task name, determines output directory path 31 | task_name: "train" 32 | 33 | # tags to help you identify your experiments 34 | # you can overwrite this in experiment configs 35 | # overwrite from command line with `python train.py tags="[first_tag, second_tag]"` 36 | # appending lists from command line is currently not supported :( 37 | # https://github.com/facebookresearch/hydra/issues/1547 38 | tags: ["dev"] 39 | 40 | # set False to skip model training 41 | train: True 42 | 43 | # evaluate on test set, using best model weights achieved during training 44 | # lightning chooses best weights based on the metric specified in checkpoint callback 45 | test: True 46 | 47 | # simply provide checkpoint path to resume training 48 | ckpt_path: null 49 | 50 | # seed for random number generators in pytorch, numpy and python.random 51 | seed: null 52 | -------------------------------------------------------------------------------- /python/core/configs/trainer/cpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | accelerator: cpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /python/core/configs/trainer/ddp.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | # use "ddp_spawn" instead of "ddp", 5 | # it's slower but normal "ddp" currently doesn't work ideally with hydra 6 | # https://github.com/facebookresearch/hydra/issues/2070 7 | # https://pytorch-lightning.readthedocs.io/en/latest/accelerators/gpu_intermediate.html#distributed-data-parallel-spawn 8 | strategy: ddp_spawn 9 | 10 | accelerator: gpu 11 | devices: 4 12 | num_nodes: 1 13 | sync_batchnorm: True 14 | -------------------------------------------------------------------------------- /python/core/configs/trainer/ddp_sim.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | # simulate DDP on CPU, useful for debugging 5 | accelerator: cpu 6 | devices: 2 7 | strategy: ddp_spawn 8 | -------------------------------------------------------------------------------- /python/core/configs/trainer/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.Trainer 2 | 3 | default_root_dir: ${paths.output_dir} 4 | 5 | min_epochs: 1 # prevents early stopping 6 | max_epochs: 10 7 | 8 | accelerator: cpu 9 | devices: 1 10 | 11 | # mixed precision for extra speed-up 12 | # precision: 16 13 | 14 | # set True to ensure deterministic results 15 | # makes training slower but gives more reproducibility than just setting seeds 16 | deterministic: False 17 | -------------------------------------------------------------------------------- /python/core/configs/trainer/gpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | accelerator: gpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /python/core/configs/trainer/mps.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | accelerator: mps 5 | devices: 1 6 | -------------------------------------------------------------------------------- /python/core/data/.gitkeep:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/data/.gitkeep -------------------------------------------------------------------------------- /python/core/data/conllu/dev.conllu: -------------------------------------------------------------------------------- 1 | 1 他 _ _ r _ 3 SBV 3:AGT _ 2 | 2 点头 _ _ v _ 3 ADV 3:MANN _ 3 | 3 表示 _ _ v _ 0 HED 0:Root _ 4 | 4 同意 _ _ v _ 3 VOB 3:dCONT _ 5 | 5 我 _ _ r _ 7 ATT 7:FEAT _ 6 | 6 的 _ _ u _ 5 RAD 5:mDEPD _ 7 | 7 意见 _ _ n _ 4 VOB 4:CONT _ 8 | 8 。 _ _ wp _ 3 WP 3:mPUNC _ 9 | -------------------------------------------------------------------------------- /python/core/data/conllu/test.conllu: -------------------------------------------------------------------------------- 1 | 1 我们 _ _ r _ 7 SBV 7:AGT _ 2 | 2 即将 _ _ d _ 7 ADV 7:mDEPD _ 3 | 3 以 _ _ p _ 7 ADV 6:mRELA _ 4 | 4 昂扬 _ _ a _ 6 ATT 6:FEAT _ 5 | 5 的 _ _ u _ 4 RAD 4:mDEPD _ 6 | 6 斗志 _ _ n _ 3 POB 7:DATV _ 7 | 7 迎来 _ _ v _ 0 HED 0:Root _ 8 | 8 新 _ _ a _ 11 ATT 11:FEAT _ 9 | 9 的 _ _ u _ 8 RAD 8:mDEPD _ 10 | 10 一 _ _ m _ 11 ATT 11:MEAS _ 11 | 11 年 _ _ q _ 7 VOB 7:TIME _ 12 | 12 。 _ _ wp _ 7 WP 7:mPUNC _ 13 | -------------------------------------------------------------------------------- /python/core/data/conllu/train.conllu: -------------------------------------------------------------------------------- 1 | 1 他 _ _ r _ 2 SBV 2:AGT _ 2 | 2 叫 _ _ v _ 0 HED 0:Root _ 3 | 3 汤姆 _ _ nh _ 2 DBL 2:DATV|4:AGT _ 4 | 4 去 _ _ v _ 2 VOB 2:eSUCC _ 5 | 5 拿 _ _ v _ 4 COO 2:eSUCC|4:eSUCC _ 6 | 6 外衣 _ _ n _ 5 VOB 5:PAT _ 7 | 7 。 _ _ wp _ 2 WP 2:mPUNC _ 8 | -------------------------------------------------------------------------------- /python/core/data/conllu/vocabs/deprel.txt: -------------------------------------------------------------------------------- 1 | ADV 2 | ATT 3 | CMP 4 | COO 5 | DBL 6 | FOB 7 | HED 8 | IOB 9 | LAD 10 | POB 11 | RAD 12 | SBV 13 | VOB 14 | WP 15 | -------------------------------------------------------------------------------- /python/core/data/conllu/vocabs/deps.txt: -------------------------------------------------------------------------------- 1 | AGT 2 | CONT 3 | DATV 4 | EXP 5 | FEAT 6 | LINK 7 | LOC 8 | MANN 9 | MATL 10 | MEAS 11 | PAT 12 | REAS 13 | Root 14 | SCO 15 | STAT 16 | TIME 17 | TOOL 18 | dAGT 19 | dCONT 20 | dDATV 21 | dEXP 22 | dFEAT 23 | dLINK 24 | dLOC 25 | dMANN 26 | dMATL 27 | dMEAS 28 | dPAT 29 | dREAS 30 | dSCO 31 | dSTAT 32 | dTIME 33 | dTOOL 34 | eCOO 35 | ePREC 36 | eSUCC 37 | mDEPD 38 | mNEG 39 | mPUNC 40 | mRELA 41 | rAGT 42 | rCONT 43 | rDATV 44 | rEXP 45 | rFEAT 46 | rLINK 47 | rLOC 48 | rMANN 49 | rMATL 50 | rMEAS 51 | rPAT 52 | rREAS 53 | rSCO 54 | rSTAT 55 | rTIME 56 | rTOOL 57 | -------------------------------------------------------------------------------- /python/core/data/conllu/vocabs/feats.txt: -------------------------------------------------------------------------------- 1 | _ 2 | -------------------------------------------------------------------------------- /python/core/data/conllu/vocabs/lemma.txt: -------------------------------------------------------------------------------- 1 | _ 2 | -------------------------------------------------------------------------------- /python/core/data/conllu/vocabs/upos.txt: -------------------------------------------------------------------------------- 1 | _ 2 | -------------------------------------------------------------------------------- 
/python/core/data/conllu/vocabs/word.txt: -------------------------------------------------------------------------------- 1 | _ 2 | -------------------------------------------------------------------------------- /python/core/data/conllu/vocabs/word_char.txt: -------------------------------------------------------------------------------- 1 | _ 2 | -------------------------------------------------------------------------------- /python/core/data/conllu/vocabs/xpos.txt: -------------------------------------------------------------------------------- 1 | a 2 | b 3 | c 4 | d 5 | e 6 | h 7 | i 8 | j 9 | k 10 | m 11 | n 12 | nd 13 | nh 14 | ni 15 | nl 16 | ns 17 | nt 18 | nz 19 | o 20 | p 21 | q 22 | r 23 | u 24 | v 25 | wp 26 | ws 27 | z 28 | -------------------------------------------------------------------------------- /python/core/data/ner/dev.bio: -------------------------------------------------------------------------------- 1 | 正在 O 2 | 执行 O 3 | 第十四 O 4 | 次 O 5 | 南极 S-Ns 6 | 考察 O 7 | 任务 O 8 | 的 O 9 | 中国 S-Ns 10 | 考察队员 O 11 | , O 12 | 目前 O 13 | 分别 O 14 | 在 O 15 | 长城站 O 16 | 、 O 17 | 中山站 O 18 | 和 O 19 | “ O 20 | 雪龙 O 21 | ” O 22 | 号 O 23 | 船上 O 24 | 。 O 25 | -------------------------------------------------------------------------------- /python/core/data/ner/test.bio: -------------------------------------------------------------------------------- 1 | 编者 O 2 | 的 O 3 | 话 O 4 | ∶ O 5 | 党中央 B-Ni 6 | 国务院 E-Ni 7 | 最近 O 8 | 召开 O 9 | 的 O 10 | 国有 O 11 | 企业 O 12 | 下岗 O 13 | 职工 O 14 | 基本 O 15 | 生活 O 16 | 保障 O 17 | 和 O 18 | 再 O 19 | 就业 O 20 | 工作 O 21 | 会议 O 22 | , O 23 | 提出 O 24 | 要 O 25 | 把 O 26 | 这项 O 27 | 工作 O 28 | 作为 O 29 | 当前 O 30 | 一个 O 31 | 头等 O 32 | 大事 O 33 | 来 O 34 | 抓 O 35 | , O 36 | 并 O 37 | 做 O 38 | 了 O 39 | 全面 O 40 | 的 O 41 | 动员 O 42 | 和 O 43 | 部署 O 44 | , O 45 | 为了 O 46 | 配合 O 47 | 会议 O 48 | 精神 O 49 | 的 O 50 | 贯彻 O 51 | 落实 O 52 | , O 53 | 我们 O 54 | 将 O 55 | 组织 O 56 | 一 O 57 | 系列 O 58 | 报道 O 59 | , O 60 | 多 O 61 | 层次 O 62 | 、 O 63 | 多 O 64 | 侧面 O 65 | 的 O 66 | 宣传 O 67 | 中央 O 68 | 精神 O 69 | , O 70 | 报道 O 71 | 各地 O 72 | 新 O 73 | 经验 O 74 | 、 O 75 | 新 O 76 | 做法 O 77 | 。 O 78 | -------------------------------------------------------------------------------- /python/core/data/ner/train.bio: -------------------------------------------------------------------------------- 1 | 台湾 S-Ns 2 | 是 O 3 | 中国 S-Ns 4 | 领土 O 5 | 不可分割 O 6 | 的 O 7 | 一 O 8 | 部分 O 9 | 。 O 10 | -------------------------------------------------------------------------------- /python/core/data/ner/vocabs/bio.txt: -------------------------------------------------------------------------------- 1 | O 2 | B-Nh 3 | B-Ni 4 | B-Ns 5 | E-Nh 6 | E-Ni 7 | E-Ns 8 | I-Nh 9 | I-Ni 10 | I-Ns 11 | S-Nh 12 | S-Ni 13 | S-Ns 14 | -------------------------------------------------------------------------------- /python/core/data/srl/dev.txt: -------------------------------------------------------------------------------- 1 | 请 Y O O O 2 | 守住 Y O O O 3 | 你 _ O B-ARG1 O 4 | 的 _ O I-ARG1 O 5 | 道德 _ O I-ARG1 O 6 | 底线 _ O I-ARG1 O 7 | , _ O O O 8 | 即使 _ O B-ARGM-ADV B-ARGM-DIS 9 | 你 _ O I-ARGM-ADV B-ARG0 10 | 没有 Y O I-ARGM-ADV O 11 | 一 _ O I-ARGM-ADV B-ARG1 12 | 个 _ O I-ARGM-ADV I-ARG1 13 | 十几 _ O I-ARGM-ADV I-ARG1 14 | 岁 _ O I-ARGM-ADV I-ARG1 15 | 的 _ O I-ARGM-ADV I-ARG1 16 | 女儿 _ O I-ARGM-ADV I-ARG1 17 | 。 _ O O O 18 | -------------------------------------------------------------------------------- /python/core/data/srl/test.txt: -------------------------------------------------------------------------------- 1 | 百团大战 _ B-ARG0 O O O 2 | 的 _ I-ARG0 O 
O O 3 | 战略 _ I-ARG0 O O O 4 | 目的 _ I-ARG0 O O O 5 | 是 Y O O O O 6 | 要 Y B-ARG1 O O O 7 | 打破 Y I-ARG1 O O O 8 | 敌人 _ I-ARG1 O B-ARG1 O 9 | 对 _ I-ARG1 O I-ARG1 O 10 | 根据地 _ I-ARG1 O I-ARG1 O 11 | 的 _ I-ARG1 O I-ARG1 O 12 | 封锁 _ I-ARG1 O I-ARG1 O 13 | , _ O O O O 14 | 因此 _ O O O B-ARGM-DIS 15 | 破路 _ O O O B-ARG0 16 | , _ O O O I-ARG0 17 | 拔 _ O O O I-ARG0 18 | 据点 _ O O O I-ARG0 19 | 十分 _ O O O B-ARGM-ADV 20 | 重要 Y O O O O 21 | 。 _ O O O O 22 | -------------------------------------------------------------------------------- /python/core/data/srl/train.txt: -------------------------------------------------------------------------------- 1 | 站 Y O O B-ARGM-TPC B-ARG0 2 | 在 _ B-ARGM-ADV O I-ARGM-TPC I-ARG0 3 | 楼下 _ I-ARGM-ADV O I-ARGM-TPC I-ARG0 4 | 的 _ O O I-ARGM-TPC I-ARG0 5 | 居民 _ O O I-ARGM-TPC I-ARG0 6 | 很多 _ O O B-ARG0 B-ARG0 7 | 人 _ O O I-ARG0 I-ARG0 8 | 都 _ O O B-ARGM-ADV B-ARGM-ADV 9 | 是 Y O O O O 10 | 捏 Y O O O O 11 | 着 _ O O O O 12 | 鼻子 _ O O B-ARG1 O 13 | 在 _ O O O B-ARGM-LOC 14 | 一旁 _ O O O I-ARGM-LOC 15 | 观看 Y O O O O 16 | 。 _ O O O O 17 | -------------------------------------------------------------------------------- /python/core/data/srl/vocabs/arguments.txt: -------------------------------------------------------------------------------- 1 | O 2 | B-ARG0 3 | B-ARG0-ADV 4 | B-ARG0-CND 5 | B-ARG0-CRD 6 | B-ARG0-MNR 7 | B-ARG0-PRD 8 | B-ARG0-PSE 9 | B-ARG0-PSR 10 | B-ARG0-QTY 11 | B-ARG1 12 | B-ARG1-CRD 13 | B-ARG1-DIS 14 | B-ARG1-FRQ 15 | B-ARG1-PRD 16 | B-ARG1-PSE 17 | B-ARG1-PSR 18 | B-ARG1-QTY 19 | B-ARG1-TPC 20 | B-ARG2 21 | B-ARG2-CRD 22 | B-ARG2-PRD 23 | B-ARG2-PSE 24 | B-ARG2-PSR 25 | B-ARG2-QTY 26 | B-ARG3 27 | B-ARG3-TMP 28 | B-ARG4 29 | B-ARGM-ADV 30 | B-ARGM-BNF 31 | B-ARGM-CND 32 | B-ARGM-CRD 33 | B-ARGM-DGR 34 | B-ARGM-DIR 35 | B-ARGM-DIS 36 | B-ARGM-EXT 37 | B-ARGM-FRQ 38 | B-ARGM-LOC 39 | B-ARGM-MNR 40 | B-ARGM-PRD 41 | B-ARGM-PRP 42 | B-ARGM-QTY 43 | B-ARGM-T 44 | B-ARGM-TMP 45 | B-ARGM-TPC 46 | B-rel-ADV 47 | B-rel-DIS 48 | B-rel-EXT 49 | B-rel-MNR 50 | I-ARG0 51 | I-ARG0-ADV 52 | I-ARG0-CND 53 | I-ARG0-CRD 54 | I-ARG0-MNR 55 | I-ARG0-PRD 56 | I-ARG0-PSE 57 | I-ARG0-PSR 58 | I-ARG0-QTY 59 | I-ARG1 60 | I-ARG1-CRD 61 | I-ARG1-DIS 62 | I-ARG1-FRQ 63 | I-ARG1-PRD 64 | I-ARG1-PSE 65 | I-ARG1-PSR 66 | I-ARG1-QTY 67 | I-ARG1-TPC 68 | I-ARG2 69 | I-ARG2-CRD 70 | I-ARG2-PRD 71 | I-ARG2-PSE 72 | I-ARG2-PSR 73 | I-ARG2-QTY 74 | I-ARG3 75 | I-ARG3-TMP 76 | I-ARG4 77 | I-ARGM-ADV 78 | I-ARGM-BNF 79 | I-ARGM-CND 80 | I-ARGM-CRD 81 | I-ARGM-DGR 82 | I-ARGM-DIR 83 | I-ARGM-DIS 84 | I-ARGM-EXT 85 | I-ARGM-FRQ 86 | I-ARGM-LOC 87 | I-ARGM-MNR 88 | I-ARGM-PRD 89 | I-ARGM-PRP 90 | I-ARGM-QTY 91 | I-ARGM-T 92 | I-ARGM-TMP 93 | I-ARGM-TPC 94 | I-rel-ADV 95 | I-rel-DIS 96 | I-rel-EXT 97 | I-rel-MNR 98 | -------------------------------------------------------------------------------- /python/core/data/srl/vocabs/predicate.txt: -------------------------------------------------------------------------------- 1 | _ 2 | Y 3 | -------------------------------------------------------------------------------- /python/core/logs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/logs/.gitkeep -------------------------------------------------------------------------------- /python/core/ltp_core/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.3" 2 | 
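The .bio files and the bio.txt vocabulary above use a BIOES-style scheme: B-/I-/E- mark the beginning, inside, and end of a multi-token entity, S- a single-token entity, and O everything else. A stripped-down sketch of decoding such tags into spans follows; the repo's real decoder is ltp_core/algorithms/get_entities.py, and this simplified version only illustrates the scheme:

# Simplified BIOES decoder for illustration; the repo's own implementation
# (ltp_core/algorithms/get_entities.py) handles more edge cases.
def decode_bioes(tags):
    """Return (type, start, end) entity spans, end index inclusive."""
    spans, start = [], None
    for i, tag in enumerate(tags):
        prefix, _, etype = tag.partition("-")
        if prefix == "S":
            spans.append((etype, i, i))
            start = None
        elif prefix == "B":
            start = i
        elif prefix == "E" and start is not None:
            spans.append((etype, start, i))
            start = None
        elif prefix != "I":
            start = None  # "O" or a malformed sequence resets the open span

    return spans

# Tag sequences taken from the train.bio and test.bio samples above:
print(decode_bioes(["S-Ns", "O", "S-Ns", "O"]))  # [('Ns', 0, 0), ('Ns', 2, 2)]
print(decode_bioes(["B-Ni", "E-Ni", "O"]))       # [('Ni', 0, 1)]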
-------------------------------------------------------------------------------- /python/core/ltp_core/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | from ltp_extension.algorithms import eisner as rust_eisner 4 | 5 | def eisner(scores, mask, remove_root=False): 6 | scores = scores.view(-1).cpu().numpy() 7 | length = torch.sum(mask, dim=1).cpu().numpy() 8 | 9 | result = torch.nn.utils.rnn.pad_sequence( 10 | [ 11 | torch.tensor(sequence, device=mask.device) 12 | for sequence in rust_eisner(scores.tolist(), length.tolist(), remove_root) 13 | ], 14 | batch_first=True, 15 | padding_value=0, 16 | ) 17 | 18 | return result 19 | 20 | except Exception: 21 | pass 22 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/datamodules/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/adapters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/datamodules/adapters/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/adapters/dependency_parsing.py: -------------------------------------------------------------------------------- 1 | from ltp_core.datamodules.components.conllu import Conllu 2 | from ltp_core.datamodules.utils.datasets import load_dataset 3 | 4 | 5 | def tokenize(examples, tokenizer, max_length): 6 | res = tokenizer( 7 | examples["form"], 8 | is_split_into_words=True, 9 | max_length=max_length, 10 | truncation=True, 11 | ) 12 | word_index = [] 13 | for encoding in res.encodings: 14 | word_index.append([]) 15 | 16 | last_word_idx = -1 17 | current_length = 0 18 | for word_idx in encoding.words[1:-1]: 19 | if word_idx != last_word_idx: 20 | word_index[-1].append(current_length) 21 | current_length += 1 22 | last_word_idx = word_idx 23 | 24 | result = res.data 25 | for ids in result["input_ids"]: 26 | ids[0] = tokenizer.cls_token_id 27 | ids[-1] = tokenizer.sep_token_id 28 | result["overflow"] = [len(encoding.overflowing) > 0 for encoding in res.encodings] 29 | result["word_index"] = word_index 30 | result["word_attention_mask"] = [[True] * len(index) for index in word_index] 31 | return result 32 | 33 | 34 | def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs): 35 | import os 36 | 37 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 38 | dataset = load_dataset(Conllu, data_dir=data_dir, cache_dir=data_dir) 39 | dataset = dataset.remove_columns(["id", "lemma", "upos", "xpos", "feats", "deps", "misc"]) 40 | dataset = dataset.rename_column("deprel", "labels") 41 | dataset = dataset.map(lambda examples: tokenize(examples, tokenizer, max_length), batched=True) 42 | dataset = dataset.filter(lambda x: not x["overflow"]) 43 | dataset.set_format( 44 | type="torch", 45 | columns=[ 46 | "input_ids", 47 | "token_type_ids", 48 | "attention_mask", 49 | "word_index", 50 | "word_attention_mask", 51 | "head", 52 | "labels", 53 | ], 54 | ) 55 | return dataset 56 | -------------------------------------------------------------------------------- 
/python/core/ltp_core/datamodules/adapters/named_entity_recognition.py: -------------------------------------------------------------------------------- 1 | from ltp_core.datamodules.adapters.postagger import tokenize 2 | from ltp_core.datamodules.components.bio import Bio 3 | from ltp_core.datamodules.utils.datasets import load_dataset 4 | 5 | 6 | def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs): 7 | import os 8 | 9 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 10 | dataset = load_dataset(Bio, data_dir=data_dir, cache_dir=data_dir) 11 | dataset = dataset.rename_column("bio", "labels") 12 | dataset = dataset.map(lambda examples: tokenize(examples, tokenizer, max_length), batched=True) 13 | dataset = dataset.filter(lambda x: not x["overflow"]) 14 | dataset.set_format( 15 | type="torch", 16 | columns=[ 17 | "input_ids", 18 | "token_type_ids", 19 | "attention_mask", 20 | "word_index", 21 | "word_attention_mask", 22 | "labels", 23 | ], 24 | ) 25 | return dataset 26 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/adapters/postagger.py: -------------------------------------------------------------------------------- 1 | from ltp_core.datamodules.components.conllu import Conllu 2 | from ltp_core.datamodules.utils.datasets import load_dataset 3 | 4 | 5 | def tokenize(examples, tokenizer, max_length, char_base=False): 6 | """Tokenize a batch of pre-split examples. 7 | 8 | Args: 9 | examples: batch with a "form" column of pre-split words 10 | tokenizer: huggingface tokenizer 11 | max_length: truncation length 12 | char_base: whether "form" (i.e. the words) in examples is already character-level 13 | 14 | Returns: 15 | encodings dict, with word_index/word_attention_mask added when not char_base 16 | """ 17 | res = tokenizer.batch_encode_plus( 18 | examples["form"], 19 | is_split_into_words=True, 20 | max_length=max_length, 21 | truncation=True, 22 | ) 23 | result = res.data 24 | for ids in result["input_ids"]: 25 | ids[0] = tokenizer.cls_token_id 26 | ids[-1] = tokenizer.sep_token_id 27 | result["overflow"] = [len(encoding.overflowing) > 0 for encoding in res.encodings] 28 | 29 | if not char_base: 30 | word_index = [] 31 | for encoding in res.encodings: 32 | word_index.append([]) 33 | 34 | last_word_idx = -1 35 | current_length = 0 36 | for word_idx in encoding.words[1:-1]: 37 | if word_idx != last_word_idx: 38 | word_index[-1].append(current_length) 39 | current_length += 1 40 | last_word_idx = word_idx 41 | result["word_index"] = word_index 42 | result["word_attention_mask"] = [[True] * len(index) for index in word_index] 43 | return result 44 | 45 | 46 | def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs): 47 | import os 48 | 49 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 50 | dataset = load_dataset(Conllu, data_dir=data_dir, cache_dir=data_dir) 51 | dataset = dataset.remove_columns(["id", "lemma", "upos", "feats", "head", "deprel", "deps", "misc"]) 52 | dataset = dataset.rename_column("xpos", "labels") 53 | dataset = dataset.map(lambda examples: tokenize(examples, tokenizer, max_length), batched=True) 54 | dataset = dataset.filter(lambda x: not x["overflow"]) 55 | dataset.set_format( 56 | type="torch", 57 | columns=[ 58 | "input_ids", 59 | "token_type_ids", 60 | "attention_mask", 61 | "word_index", 62 | "word_attention_mask", 63 | "labels", 64 | ], 65 | ) 66 | return dataset 67 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/adapters/segmention.py: -------------------------------------------------------------------------------- 1 | from ltp_core.datamodules.components.conllu import Conllu 2 | from ltp_core.datamodules.utils.datasets
import load_dataset 3 | 4 | PREFIX_B = 0 5 | PREFIX_I = 1 6 | PREFIX_M = 1 7 | PREFIX_E = 2 8 | PREFIX_S = 3 9 | 10 | 11 | def length2bi(length): 12 | if length == 0: 13 | return [] 14 | elif length == 1: 15 | return [PREFIX_B] 16 | elif length == 2: 17 | return [PREFIX_B, PREFIX_I] 18 | else: 19 | return [PREFIX_B] + [PREFIX_I] * (length - 1) 20 | 21 | 22 | def length2bmes(length): 23 | if length == 0: 24 | return [] 25 | elif length == 1: 26 | return [PREFIX_S] 27 | elif length == 2: 28 | return [PREFIX_B, PREFIX_E] 29 | elif length == 3: 30 | return [PREFIX_B, PREFIX_M, PREFIX_E] 31 | else: 32 | return [PREFIX_B] + [PREFIX_M] * (length - 2) + [PREFIX_E] 33 | 34 | 35 | def tokenize(examples, tokenizer, max_length, length2labels=length2bi): 36 | res = tokenizer( 37 | examples["form"], 38 | is_split_into_words=True, 39 | max_length=max_length, 40 | truncation=True, 41 | ) 42 | labels = [] 43 | for encoding in res.encodings: 44 | labels.append([]) 45 | last_word_idx = -1 46 | word_length = 0 47 | for word_idx in encoding.words[1:-1]: 48 | if word_idx == last_word_idx: 49 | word_length += 1 50 | else: 51 | labels[-1].extend(length2labels(word_length)) 52 | last_word_idx = word_idx 53 | word_length = 1 54 | labels[-1].extend(length2labels(word_length)) 55 | 56 | result = res.data 57 | for ids in res["input_ids"]: 58 | ids[0] = tokenizer.cls_token_id 59 | ids[-1] = tokenizer.sep_token_id 60 | result["overflow"] = [len(encoding.overflowing) > 0 for encoding in res.encodings] 61 | result["labels"] = labels 62 | return result 63 | 64 | 65 | def build_dataset(data_dir, task_name, tokenizer, max_length=512, mode="bmes", **kwargs): 66 | import os 67 | 68 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 69 | dataset = load_dataset(Conllu, data_dir=data_dir, cache_dir=data_dir) 70 | dataset = dataset.remove_columns(["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc"]) 71 | if mode == "bmes": 72 | dataset = dataset.map( 73 | lambda examples: tokenize(examples, tokenizer, max_length, length2bmes), 74 | batched=True, 75 | ) 76 | elif mode == "bi": 77 | dataset = dataset.map( 78 | lambda examples: tokenize(examples, tokenizer, max_length, length2bi), 79 | batched=True, 80 | ) 81 | else: 82 | raise NotImplementedError(f"not supported {mode} mode") 83 | dataset = dataset.filter(lambda x: not x["overflow"]) 84 | dataset.set_format( 85 | type="torch", 86 | columns=["input_ids", "token_type_ids", "attention_mask", "labels"], 87 | ) 88 | return dataset 89 | 90 | 91 | def main(): 92 | from transformers import AutoTokenizer 93 | 94 | tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base") 95 | dataset = build_dataset(data_dir="data/seg", task_name="seg", tokenizer=tokenizer, mode="bmes") 96 | print(dataset) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/adapters/semantic_dependency_parsing.py: -------------------------------------------------------------------------------- 1 | from ltp_core.datamodules.components.conllu import Conllu 2 | from ltp_core.datamodules.utils.datasets import load_dataset 3 | 4 | 5 | def tokenize(examples, tokenizer, max_length): 6 | res = tokenizer( 7 | examples["form"], 8 | is_split_into_words=True, 9 | max_length=max_length, 10 | truncation=True, 11 | ) 12 | word_index = [] 13 | for encoding in res.encodings: 14 | word_index.append([]) 15 | 16 | last_word_idx = -1 17 | current_length = 0 18 | for word_idx in 
encoding.words[1:-1]: 19 | if word_idx != last_word_idx: 20 | word_index[-1].append(current_length) 21 | current_length += 1 22 | last_word_idx = word_idx 23 | 24 | heads = [] 25 | labels = [] 26 | for forms, deps in zip(examples["form"], examples["deps"]): 27 | sentence_len = len(forms) 28 | heads.append([[0 for j in range(sentence_len + 1)] for i in range(sentence_len)]) 29 | labels.append([[0 for j in range(sentence_len + 1)] for i in range(sentence_len)]) 30 | for idx, head, rel in zip(deps["id"], deps["head"], deps["rel"]): 31 | heads[-1][idx][head] = 1 32 | labels[-1][idx][head] = rel 33 | 34 | result = res.data 35 | for ids in result["input_ids"]: 36 | ids[0] = tokenizer.cls_token_id 37 | ids[-1] = tokenizer.sep_token_id 38 | result["overflow"] = [len(encoding.overflowing) > 0 for encoding in res.encodings] 39 | result["word_index"] = word_index 40 | result["word_attention_mask"] = [[True] * len(index) for index in word_index] 41 | 42 | result["head"] = heads 43 | result["labels"] = labels 44 | for word_index, head in zip(result["word_index"], result["head"]): 45 | assert len(word_index) == len(head) 46 | return result 47 | 48 | 49 | def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs): 50 | import os 51 | 52 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 53 | dataset = load_dataset(Conllu, data_dir=data_dir, cache_dir=data_dir) 54 | dataset = dataset.remove_columns(["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "misc"]) 55 | dataset = dataset.map(lambda examples: tokenize(examples, tokenizer, max_length), batched=True) 56 | dataset = dataset.filter(lambda x: not x["overflow"]) 57 | dataset.set_format( 58 | type="torch", 59 | columns=[ 60 | "input_ids", 61 | "token_type_ids", 62 | "attention_mask", 63 | "word_index", 64 | "word_attention_mask", 65 | "head", 66 | "labels", 67 | ], 68 | ) 69 | return dataset 70 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/adapters/semantic_role_labeling.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | from ltp_core.datamodules.components.srl import Srl 4 | from ltp_core.datamodules.utils.datasets import load_dataset 5 | 6 | 7 | def tokenize(examples, tokenizer, max_length): 8 | res = tokenizer( 9 | examples["form"], 10 | is_split_into_words=True, 11 | max_length=max_length, 12 | truncation=True, 13 | ) 14 | word_index = [] 15 | for encoding in res.encodings: 16 | word_index.append([]) 17 | 18 | last_word_idx = -1 19 | current_length = 0 20 | for word_idx in encoding.words[1:-1]: 21 | if word_idx != last_word_idx: 22 | word_index[-1].append(current_length) 23 | current_length += 1 24 | last_word_idx = word_idx 25 | 26 | labels = [] 27 | for predicates, roles in zip(examples["predicate"], examples["arguments"]): 28 | sentence_len = len(predicates) 29 | labels.append(numpy.zeros((sentence_len, sentence_len), dtype=numpy.int64)) 30 | 31 | for idx, predicate in enumerate(predicates): 32 | if predicate == 1: 33 | srl = numpy.asarray(roles.pop(0), dtype=numpy.int64) 34 | labels[-1][idx, :] = srl 35 | 36 | result = res.data 37 | for ids in result["input_ids"]: 38 | ids[0] = tokenizer.cls_token_id 39 | ids[-1] = tokenizer.sep_token_id 40 | result["overflow"] = [len(encoding.overflowing) > 0 for encoding in res.encodings] 41 | result["word_index"] = word_index 42 | result["word_attention_mask"] = [[True] * len(index) for index in word_index] 43 | 44 | result["labels"] = labels 45 | return result 
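# Illustrative note (not part of the original file): the loop above turns the
# flat SRL annotation into a square label matrix. For a 4-token sentence with
# predicates = [0, 1, 0, 0] and a single argument row
# roles = [[0, 0, 7, 8]] (role ids, 0 = "O"), pop(0) consumes that row for the
# predicate at index 1, so labels becomes a 4x4 matrix with
# labels[1] = [0, 0, 7, 8] and every non-predicate row left all zeros; each
# predicate consumes the next row of `arguments` in order.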
46 | 47 | 48 | def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs): 49 | import os 50 | 51 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 52 | dataset = load_dataset(Srl, data_dir=data_dir, cache_dir=data_dir) 53 | dataset = dataset.map(lambda examples: tokenize(examples, tokenizer, max_length), batched=True) 54 | dataset = dataset.filter(lambda x: not x["overflow"]) 55 | dataset.set_format( 56 | type="torch", 57 | columns=[ 58 | "input_ids", 59 | "token_type_ids", 60 | "attention_mask", 61 | "word_index", 62 | "word_attention_mask", 63 | "labels", 64 | ], 65 | ) 66 | return dataset 67 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/adapters/sentence_classification.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | # todo: implement 5 | def build_dataset(task_name): 6 | import os 7 | 8 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 9 | load_dataset("glue", task_name) 10 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/datamodules/components/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # Author: Yunlong Feng 3 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/utils/collate.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # Author: Yunlong Feng 3 | 4 | import torch 5 | from torch._six import string_classes 6 | from torch.utils.data._utils.collate import ( 7 | default_collate_err_msg_format, 8 | np_str_obj_array_pattern, 9 | ) 10 | 11 | _TORCH_MAJOR, _TORCH_MINOR = map(int, torch.__version__.split(".")[0:2]) 12 | 13 | if _TORCH_MAJOR < 1 or (_TORCH_MAJOR == 1 and _TORCH_MINOR < 8): 14 | from torch._six import container_abcs, int_classes 15 | else: 16 | int_classes = int 17 | import collections.abc as container_abcs 18 | 19 | 20 | def collate(batch): 21 | r"""Puts each data field into a tensor with outer dimension batch size""" 22 | 23 | elem = batch[0] 24 | elem_type = type(elem) 25 | if isinstance(elem, torch.Tensor): 26 | try: 27 | out = None 28 | if torch.utils.data.get_worker_info() is not None: 29 | # If we're in a background process, concatenate directly into a 30 | # shared memory tensor to avoid an extra copy 31 | numel = sum(x.numel() for x in batch) 32 | storage = elem.storage()._new_shared(numel, device=elem.device) 33 | out = elem.new(storage).resize_(len(batch), *list(elem.size())) 34 | return torch.stack(batch, 0, out=out) 35 | except Exception: 36 | return torch.nn.utils.rnn.pad_sequence(batch, batch_first=True) 37 | elif elem_type.__module__ == "numpy" and elem_type.__name__ != "str_" and elem_type.__name__ != "string_": 38 | elem = batch[0] 39 | if elem_type.__name__ == "ndarray": 40 | # array of string classes and object 41 | if np_str_obj_array_pattern.search(elem.dtype.str) is not None: 42 | raise TypeError(default_collate_err_msg_format.format(elem.dtype)) 43 | 44 | return collate([torch.as_tensor(b) for b in batch]) 45 | elif elem.shape == (): # scalars 46 | return torch.as_tensor(batch) 47 | elif isinstance(elem, float): 48 | return torch.tensor(batch, dtype=torch.float64) 49 | elif isinstance(elem, int_classes): 50 | return torch.tensor(batch) 51 | elif isinstance(elem, string_classes): 52 | return batch 53 | elif isinstance(elem, container_abcs.Mapping): 54 | return {key: collate([d[key] for d in batch]) for key in elem} 55 | elif isinstance(elem, tuple) and hasattr(elem, "_fields"): # namedtuple 56 | return elem_type(*(collate(samples) for samples in zip(*batch))) 57 | elif isinstance(elem, container_abcs.Sequence): 58 | # check to make sure that the elements in batch have consistent size 59 | batch = [torch.stack(it) for it in batch] 60 | elem_sizes = [it.shape for it in batch] 61 | max_sizes = (max(sizes) for sizes in zip(*elem_sizes)) 62 | batched = torch.zeros(len(batch), *max_sizes, dtype=batch[0].dtype) 63 | for idx, (elem, elem_size) in enumerate(zip(batch, elem_sizes)): 64 | size_1, size_2 = elem_size 65 | batched[idx, :size_1, :size_2] = elem 66 | return batched 67 | 68 | raise TypeError(default_collate_err_msg_format.format(elem_type)) 69 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/utils/datasets.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Optional, Sequence, Union 2 | 3 | from datasets import Dataset, DatasetBuilder, DatasetDict, Features, Split 4 | 5 | 6 | def load_dataset( 7 | builder_cls: type, 8 | config_name: Optional[str] = None, 9 | data_dir: Optional[str] = None, 10 | data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None, 11 | split: Optional[Union[str, Split]] = None, 12 | cache_dir: Optional[str] = None, 13 | features: Optional[Features] = None, 14 | 
save_infos: bool = False, 15 | **config_kwargs, 16 | ) -> Union[DatasetDict, Dataset]: 17 | # Instantiate the dataset builder 18 | builder_instance: DatasetBuilder = builder_cls( 19 | cache_dir=cache_dir, 20 | config_name=config_name, 21 | data_dir=data_dir, 22 | data_files=data_files, 23 | hash=hash, 24 | features=features, 25 | **config_kwargs, 26 | ) 27 | 28 | # Download and prepare data 29 | builder_instance.download_and_prepare() 30 | 31 | # Build dataset for splits 32 | ds = builder_instance.as_dataset(split=split) 33 | if save_infos: 34 | builder_instance._save_infos() 35 | 36 | return ds 37 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/utils/iterator.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # Author: Yunlong Feng 3 | 4 | import codecs 5 | 6 | 7 | def iter_raw_lines(filename: str, strip=None, skip: str = None): 8 | line_num = 0 9 | with codecs.open(filename, encoding="utf-8") as file: 10 | while True: 11 | line = file.readline() 12 | line_num += 1 13 | if skip is not None and line.startswith(skip): 14 | continue 15 | if not line: # EOF 16 | yield line_num, "" # yield an empty line to simplify the caller's logic 17 | break 18 | line = line.strip(strip) 19 | yield line_num, line 20 | 21 | 22 | def iter_lines(filename: str, split=None, strip=None, skip: str = None): 23 | for line_num, raw_line in iter_raw_lines(filename=filename, strip=strip, skip=skip): 24 | if not raw_line: # end of a sentence 25 | yield line_num, [] # yield an empty line 26 | else: 27 | yield line_num, raw_line.split(split) 28 | 29 | 30 | def iter_blocks(filename: str, split=None, strip=None, skip="#"): 31 | rows = [] 32 | for line_num, line_features in iter_lines(filename, split=split, strip=strip, skip=skip): 33 | if len(line_features): 34 | rows.append(line_features) 35 | else: 36 | if len(rows): 37 | yield line_num, rows 38 | rows = [] 39 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/utils/multitask_dataloader.py: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/env python 2 | # Author: Yunlong Feng 3 | 4 | import numpy as np 5 | 6 | 7 | def cycle(iterable): 8 | while True: 9 | yield from iterable 10 | 11 | 12 | class MultiTaskDataloader: 13 | def __init__(self, tau=1.0, **dataloaders): 14 | self.dataloaders = dataloaders 15 | 16 | Z = sum(pow(v, tau) for v in self.dataloader_sizes.values()) 17 | self.tasknames, self.sampling_weights = zip(*((k, pow(v, tau) / Z) for k, v in self.dataloader_sizes.items())) 18 | self.dataiters = {k: cycle(v) for k, v in dataloaders.items()} 19 | 20 | @property 21 | def dataloader_sizes(self): 22 | if not hasattr(self, "_dataloader_sizes"): 23 | self._dataloader_sizes = {k: len(v) for k, v in self.dataloaders.items()} 24 | return self._dataloader_sizes 25 | 26 | def __len__(self): 27 | return sum(v for k, v in self.dataloader_sizes.items()) 28 | 29 | def __iter__(self): 30 | for i in range(len(self)): 31 | taskname = np.random.choice(self.tasknames, p=self.sampling_weights) 32 | dataiter = self.dataiters[taskname] 33 | batch = next(dataiter) 34 | 35 | batch["task_name"] = taskname 36 | 37 | yield batch 38 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/utils/vocab_helper.py: -------------------------------------------------------------------------------- 1 | def vocab_builder(func): 2 | from datasets import BuilderConfig 3 | 4 | def func_wrapper(config: BuilderConfig, **kwargs): 5 | """We handle string, list and dicts in datafiles.""" 6 | if not config.data_files: 7 | raise ValueError(f"At least one data file must be specified, but got data_files={config.data_files}") 8 | data_files = config.data_files 9 | if isinstance(data_files, (str, list, tuple)): 10 | files = data_files 11 | if isinstance(files, str): 12 | files = [files] 13 | else: 14 | files = [] 15 | for file_list in data_files.values(): 16 | if isinstance(file_list, str): 17 | files.append(file_list) 18 | else: 19 | files.extend(file_list) 20 | res = func(config.data_dir, *files, **kwargs) 21 | return res 22 | 23 | return func_wrapper 24 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/components/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/components/graph.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # Author: Yunlong Feng 3 | from collections import namedtuple 4 | 5 | from torch import nn 6 | 7 | from ltp_core.models.nn.biaffine import Biaffine 8 | from ltp_core.models.nn.mlp import MLP 9 | 10 | GraphResult = namedtuple("GraphResult", ["arc_logits", "rel_logits", "attention_mask"]) 11 | 12 | 13 | class BiaffineClassifier(nn.Module): 14 | def __init__( 15 | self, 16 | input_size, 17 | num_labels, 18 | dropout=0.1, 19 | arc_hidden_size=500, 20 | rel_hidden_size=100, 21 | ): 22 | super().__init__() 23 | 24 | self.label_num = num_labels 25 | self.input_size = input_size 26 | self.arc_hidden_size = arc_hidden_size 27 | self.rel_hidden_size = rel_hidden_size 28 | 29 | self.mlp_arc = MLP( 30 | [input_size, arc_hidden_size * 2], 31 | output_dropout=dropout, 32 | output_activation=nn.ReLU, 33 | ) 34 | self.mlp_rel = MLP( 35 | [input_size, rel_hidden_size * 2], 36 | output_dropout=dropout, 37 | output_activation=nn.ReLU, 38 | ) 39 | 40 | self.arc_atten = Biaffine(arc_hidden_size, arc_hidden_size, 1, bias_x=True, bias_y=False) 41 | self.rel_atten = Biaffine(rel_hidden_size, rel_hidden_size, num_labels, bias_x=True, bias_y=True) 42 | 43 | def forward(self, hidden_states, attention_mask=None): 44 | bs, seqlen = hidden_states.shape[:2] 45 | 46 | arc = self.mlp_arc(hidden_states) 47 | arc = arc.view(bs, seqlen, 2, self.arc_hidden_size) 48 | arc_h, arc_d = arc.unbind(axis=-2) 49 | 50 | rel = self.mlp_rel(hidden_states) 51 | rel = rel.view(bs, seqlen, 2, self.rel_hidden_size) 52 | rel_h, rel_d = rel.unbind(axis=-2) 53 | 54 | s_arc = self.arc_atten(arc_d, arc_h).squeeze_(1) 55 | s_rel = self.rel_atten(rel_d, rel_h).permute(0, 2, 3, 1) 56 | 57 | return GraphResult(arc_logits=s_arc, rel_logits=s_rel, attention_mask=attention_mask) 58 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/components/sent.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | from torch import nn 4 | 5 | from ltp_core.models.nn.mlp import MLP 6 | 7 | SentClassifierResult = namedtuple("SentClassifierResult", ["logits"]) 8 | 9 | 10 | class MLPClassifier(nn.Module): 11 | def __init__( 12 | self, 13 | input_size, 14 | num_labels, 15 | dropout=0.1, 16 | hidden_sizes=None, 17 | ): 18 | super().__init__() 19 | if hidden_sizes is not None: 20 | self.classifier = MLP([input_size, *hidden_sizes, num_labels], dropout=dropout) 21 | else: 22 | self.classifier = MLP([input_size, num_labels], dropout=dropout) 23 | 24 | def forward(self, hidden_states, attention_mask=None) -> SentClassifierResult: 25 | logits = self.classifier(hidden_states) 26 | return SentClassifierResult(logits=logits) 27 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/criterion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/criterion/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/criterion/graph.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, Module 4 | 5 | from ltp_core.models.components.graph import GraphResult 6 | 7 | 8 | class DEPLoss(Module): 9 | def __init__(self, 
loss_interpolation=0.4): 10 | super().__init__() 11 | self.loss_interpolation = loss_interpolation 12 | 13 | def forward(self, result: GraphResult, head: Tensor, labels: Tensor, **kwargs): 14 | s_arc = result.arc_logits 15 | s_rel = result.rel_logits 16 | attention_mask = result.attention_mask 17 | 18 | arc_loss = CrossEntropyLoss() 19 | rel_loss = CrossEntropyLoss() 20 | 21 | # ignore the first token of each sentence 22 | s_arc = s_arc[:, 1:, :] 23 | s_rel = s_rel[:, 1:, :] 24 | 25 | # Only keep active parts of the loss 26 | active_heads = head[attention_mask] 27 | active_labels = labels[attention_mask] 28 | s_arc, s_rel = s_arc[attention_mask], s_rel[attention_mask] 29 | 30 | s_rel = s_rel[torch.arange(len(active_heads)), active_heads] 31 | 32 | arc_loss = arc_loss(s_arc, active_heads) 33 | rel_loss = rel_loss(s_rel, active_labels) 34 | loss = 2 * ((1 - self.loss_interpolation) * arc_loss + self.loss_interpolation * rel_loss) 35 | 36 | return loss 37 | 38 | 39 | class SDPLoss(Module): 40 | def __init__(self, loss_interpolation=0.4): 41 | super().__init__() 42 | self.loss_interpolation = loss_interpolation 43 | 44 | def forward(self, result: GraphResult, head: Tensor, labels: Tensor, **kwargs): 45 | s_arc = result.arc_logits 46 | s_rel = result.rel_logits 47 | attention_mask = result.attention_mask 48 | 49 | head_loss = BCEWithLogitsLoss() 50 | rel_loss = CrossEntropyLoss() 51 | 52 | # ignore the first token of each sentence 53 | s_arc = s_arc[:, 1:, :] 54 | s_rel = s_rel[:, 1:, :] 55 | 56 | # attention mask 57 | attention_mask = attention_mask.unsqueeze(-1).expand_as(s_arc) 58 | 59 | arc_loss = head_loss(s_arc[attention_mask], head[attention_mask].float()) 60 | rel_loss = rel_loss(s_rel[head > 0], labels[head > 0]) 61 | 62 | loss = 2 * ((1 - self.loss_interpolation) * arc_loss + self.loss_interpolation * rel_loss) 63 | 64 | return loss 65 | 66 | 67 | class DEPDistillLoss(DEPLoss): 68 | def forward(self, result: GraphResult, head: Tensor, labels: Tensor, **kwargs): 69 | return super().forward(result, head, labels, **kwargs) 70 | 71 | 72 | class SDPDistillLoss(SDPLoss): 73 | def forward(self, result: GraphResult, head: Tensor, labels: Tensor, **kwargs): 74 | return super().forward(result, head, labels, **kwargs) 75 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/criterion/sent.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch.nn import CrossEntropyLoss, Module 3 | 4 | from ltp_core.models.components.sent import SentClassifierResult 5 | 6 | 7 | class ClassificationLoss(Module): 8 | def forward(self, result: SentClassifierResult, labels: Tensor, **kwargs) -> Tensor: 9 | logits = result.logits 10 | num_tags = logits.shape[-1] 11 | 12 | loss_fct = CrossEntropyLoss() 13 | loss = loss_fct(logits.view(-1, num_tags), labels.view(-1)) 14 | return loss 15 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/functional/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/functional/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/functional/distill.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 
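# --- Editorial aside: a minimal usage sketch for the helpers defined below,
# not part of the original module. `student_logits` and `teacher_logits` are
# hypothetical tensors of shape (batch_size, length, num_labels); everything
# else is defined in this file:
#
#     scheduler = flsw_temperature_scheduler_builder(beta=1, gamma=2, base_temperature=8)
#     tau = scheduler(student_logits, teacher_logits)  # per-position temperatures, shape (batch_size, length)
#     loss = kd_ce_loss(student_logits, teacher_logits, temperature=tau)
#
# The FLSW scheduler raises the temperature where student and teacher already
# agree (cosine similarity near 1) and lowers it where they disagree, so the
# harder positions are trained against sharper teacher distributions.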
4 | 5 | def flsw_temperature_scheduler_builder(beta=1, gamma=2, base_temperature=8, eps=1e-4, *args): 6 | """adapted from arXiv:1911.07471.""" 7 | 8 | def flsw_temperature_scheduler(logits_S, logits_T): 9 | v = logits_S.detach() 10 | t = logits_T.detach() 11 | with torch.no_grad(): 12 | v = v / (torch.norm(v, dim=-1, keepdim=True) + eps) 13 | t = t / (torch.norm(t, dim=-1, keepdim=True) + eps) 14 | w = torch.pow((1 - (v * t).sum(dim=-1)), gamma) 15 | tau = base_temperature + (w.mean() - w) * beta 16 | return tau 17 | 18 | return flsw_temperature_scheduler 19 | 20 | 21 | def kd_ce_loss(logits_S, logits_T, temperature=1): 22 | """Calculate the cross entropy between logits_S and logits_T. 23 | 24 | :param logits_S: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels) 25 | :param logits_T: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels) 26 | :param temperature: A float or a tensor of shape (batch_size, length) or (batch_size,) 27 | """ 28 | if isinstance(temperature, torch.Tensor) and temperature.dim() > 0: 29 | temperature = temperature.unsqueeze(-1) 30 | beta_logits_T = logits_T / temperature 31 | beta_logits_S = logits_S / temperature 32 | p_T = F.softmax(beta_logits_T, dim=-1) 33 | loss = -(p_T * F.log_softmax(beta_logits_S, dim=-1)) 34 | return (temperature * temperature * loss).sum(dim=-1).mean() 35 | 36 | 37 | def kd_mse_loss(logits_S, logits_T, temperature=1): 38 | """Calculate the mse loss between logits_S and logits_T. 39 | 40 | :param logits_S: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels) 41 | :param logits_T: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels) 42 | :param temperature: A float or a tensor of shape (batch_size, length) or (batch_size,) 43 | """ 44 | if isinstance(temperature, torch.Tensor) and temperature.dim() > 0: 45 | temperature = temperature.unsqueeze(-1) 46 | beta_logits_T = logits_T / temperature 47 | beta_logits_S = logits_S / temperature 48 | loss = F.mse_loss(beta_logits_S, beta_logits_T, reduction="none") 49 | return (temperature * temperature * loss).mean() 50 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/functional/multilabel_categorical_crossentropy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # ref: https://github.com/bojone/bert4keras/blob/master/bert4keras/backend.py 4 | 5 | INF = 1e4 6 | EPSILON = 1e-5 7 | 8 | 9 | def multilabel_categorical_crossentropy(y_true, y_pred, mask_zero=False): 10 | """多标签分类的交叉熵 11 | 说明: 12 | 1. y_true和y_pred的shape一致,y_true的元素非0即1, 13 | 1表示对应的类为目标类,0表示对应的类为非目标类; 14 | 2. 请保证y_pred的值域是全体实数,换言之一般情况下 15 | y_pred不用加激活函数,尤其是不能加sigmoid或者 16 | softmax; 17 | 3. 预测阶段则输出y_pred大于0的类; 18 | 4. 详情请看:https://kexue.fm/archives/7359 。 19 | """ 20 | y_pred = (1 - 2 * y_true) * y_pred 21 | y_neg = y_pred - y_true * INF 22 | y_pos = y_pred - (1 - y_true) * INF 23 | zeros = torch.zeros_like(y_pred[..., :1]) 24 | y_neg = torch.cat([y_neg, zeros], dim=-1) 25 | y_pos = torch.cat([y_pos, zeros], dim=-1) 26 | neg_loss = torch.logsumexp(y_neg, dim=-1) 27 | pos_loss = torch.logsumexp(y_pos, dim=-1) 28 | return pos_loss + neg_loss 29 | 30 | 31 | def sparse_multilabel_categorical_crossentropy(y_true, y_pred, mask_zero=False): 32 | """稀疏版多标签分类的交叉熵 33 | 说明: 34 | 1. y_true.shape=[..., num_positive], 35 | y_pred.shape=[..., num_classes]; 36 | 2. 
请保证y_pred的值域是全体实数,换言之一般情况下 37 | y_pred不用加激活函数,尤其是不能加sigmoid或者 38 | softmax; 39 | 3. 预测阶段则输出y_pred大于0的类; 40 | 4. 详情请看:https://kexue.fm/archives/7359 。 41 | """ 42 | zeros = torch.zeros_like(y_pred[..., :1]) 43 | y_pred = torch.cat([y_pred, zeros], dim=-1) 44 | 45 | if mask_zero: 46 | infs = zeros + INF 47 | y_pred = torch.cat([infs, y_pred[..., 1:]], dim=-1) 48 | 49 | y_pos_2 = torch.gather(y_pred, index=y_true, dim=-1) 50 | y_pos_1 = torch.cat([y_pos_2, zeros], dim=-1) 51 | 52 | if mask_zero: 53 | y_pred = torch.cat([-infs, y_pred[..., 1:]], dim=-1) 54 | y_pos_2 = torch.gather(y_pred, index=y_true, dim=-1) 55 | 56 | pos_loss = torch.logsumexp(-y_pos_1, dim=-1) 57 | all_loss = torch.logsumexp(y_pred, dim=-1) 58 | aux_loss = torch.logsumexp(y_pos_2, dim=-1) - all_loss 59 | aux_loss = torch.clamp(1 - torch.exp(aux_loss), min=EPSILON, max=1) 60 | neg_loss = all_loss + torch.log(aux_loss) 61 | return pos_loss + neg_loss 62 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/ltp_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import ModuleDict 6 | from transformers.modeling_outputs import BaseModelOutput 7 | 8 | 9 | class LTPModule(nn.Module): 10 | def __init__( 11 | self, 12 | backbone: nn.Module, 13 | heads: Dict[str, nn.Module], 14 | processor: Dict[str, nn.Module], 15 | ): 16 | super().__init__() 17 | self.backbone = backbone 18 | self.processor = ModuleDict(processor) 19 | self.task_heads = ModuleDict(heads) 20 | 21 | def forward( 22 | self, 23 | task_name: str, 24 | input_ids: torch.Tensor, 25 | attention_mask: torch.Tensor, 26 | token_type_ids: torch.Tensor = None, 27 | word_index: torch.Tensor = None, 28 | word_attention_mask: torch.Tensor = None, 29 | ): 30 | outputs: BaseModelOutput = self.backbone(input_ids, attention_mask, token_type_ids) 31 | hidden_state, attention_mask = self.processor[task_name]( 32 | outputs, attention_mask, word_index, word_attention_mask 33 | ) 34 | return self.task_heads[task_name](hidden_state, attention_mask) 35 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/metrics/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/metrics/sent.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from torch import Tensor 4 | from torchmetrics import Accuracy 5 | 6 | from ltp_core.models.components.sent import SentClassifierResult 7 | 8 | 9 | class ClsAccuracy(Accuracy): 10 | is_differentiable: bool = False 11 | higher_is_better: Optional[bool] = True 12 | full_state_update: bool = False 13 | 14 | def update(self, result: SentClassifierResult, labels: Tensor, **kwargs) -> None: 15 | preds = result.logits.argmax(dim=-1) 16 | super().update(preds, labels) 17 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/nn/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/nn/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/nn/biaffine.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # Author: Yunlong Feng 3 | 4 | import math 5 | 6 | import torch 7 | from torch import Tensor, nn 8 | 9 | 10 | class Biaffine(nn.Module): 11 | __constants__ = ["in1_features", "in2_features", "out_features", "bias_x", "bias_y"] 12 | 13 | def __init__(self, in1_features, in2_features, out_features, bias_x=True, bias_y=True): 14 | super().__init__() 15 | self.bias_x = bias_x 16 | self.bias_y = bias_y 17 | self.in1_features = in1_features 18 | self.in2_features = in2_features 19 | self.out_features = out_features 20 | self.weight = nn.Parameter( 21 | torch.zeros(out_features, in1_features + bias_x, in2_features + bias_y), 22 | requires_grad=True, 23 | ) 24 | self.reset_parameters() 25 | 26 | def reset_parameters(self): 27 | bound = 1 / math.sqrt(self.weight.size(1)) 28 | nn.init.uniform_(self.weight, -bound, bound) 29 | 30 | def onnx_forward(self, x1: Tensor, x2: Tensor): 31 | if self.bias_x: 32 | x1 = torch.cat((x1, torch.ones_like(x1[..., :1])), -1) 33 | if self.bias_y: 34 | x2 = torch.cat((x2, torch.ones_like(x2[..., :1])), -1) 35 | x1 = x1.unsqueeze(1) 36 | x2 = x2.unsqueeze(1) 37 | s: Tensor = x1 @ self.weight @ x2.transpose(-1, -2) 38 | if s.size(1) == 1: 39 | s = s.squeeze(1) 40 | return s 41 | 42 | def forward(self, x1: Tensor, x2: Tensor): 43 | if self.bias_x: 44 | # [batch_size, seq_len, in1_features] -> [batch_size, seq_len, in1_features + 1] 45 | x1 = torch.cat((x1, torch.ones_like(x1[..., :1])), -1) 46 | if self.bias_y: 47 | # [batch_size, seq_len, in2_features] -> [batch_size, seq_len, in2_features + 1] 48 | x2 = torch.cat((x2, torch.ones_like(x2[..., :1])), -1) 49 | # [batch_size, n_out, seq_len, seq_len] 50 | s = torch.einsum("bxi,oij,byj->boxy", x1, self.weight, x2) 51 | return s 52 | 53 | def extra_repr(self): 54 | return "in1_features={}, in2_features={}, out_features={}, bias_x={}, bias_y={}".format( 55 | self.in1_features, 56 | self.in2_features, 57 | self.out_features, 58 | self.bias_x, 59 | self.bias_y, 60 | ) 61 | 62 | 63 | def main(): 64 | biaffine = Biaffine(in1_features=128, in2_features=128, out_features=12) 65 | inputs = torch.randn(2, 512, 128) 66 | outputs = biaffine(inputs, inputs) 67 | print(outputs.shape) 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/nn/mlp.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # Author: Yunlong Feng 3 | from typing import Callable, Optional, Sequence, Union 4 | 5 | from torch import nn 6 | from transformers.activations import get_activation 7 | 8 | 9 | def MLP( 10 | layer_sizes: Sequence[int], 11 | dropout: Optional[float] = None, 12 | activation: Optional[Union[str, Callable]] = None, 13 | output_dropout: Optional[Union[float, bool]] = None, 14 | output_activation: Optional[Union[str, bool, Callable]] = None, 15 | ): 16 | layers = [] 17 | num_layers = len(layer_sizes) - 1 18 | for index in range(num_layers): 19 | if index < num_layers - 1: 20 | layers.append(nn.Linear(layer_sizes[index], layer_sizes[index + 1])) 21 | 22 | if isinstance(activation, str): 23 | layers.append(get_activation(activation)) 24 | elif isinstance(activation, Callable): 25 | layers.append(activation()) 26 | 27 | if isinstance(dropout, float): 28 | layers.append(nn.Dropout(dropout)) 29 | else: 30 | layers.append(nn.Linear(layer_sizes[index], layer_sizes[index + 1])) 31 | 32 | if isinstance(output_activation, str): 33 | layers.append(get_activation(output_activation)) 34 | elif isinstance(output_activation, Callable): 35 | layers.append(output_activation()) 36 | elif output_activation is True and activation is not None: 37 | if isinstance(activation, str): 38 | layers.append(get_activation(activation)) 39 | elif isinstance(activation, Callable): 40 | layers.append(activation()) 41 | 42 | if isinstance(output_dropout, float): 43 | layers.append(nn.Dropout(p=output_dropout)) 44 | elif output_dropout is True and isinstance(dropout, float): 45 | layers.append(nn.Dropout(dropout)) 46 | 47 | return nn.Sequential(*layers) 48 | 49 | 50 | def main(): 51 | mlp = MLP([768, 768, 128]) 52 | print(mlp) 53 | 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/optimization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/optimization/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/optimization/layer_lrs.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def get_layer_lrs_with_crf( 5 | named_parameters, 6 | transformer_prefix, 7 | learning_rate, 8 | layer_decay, 9 | n_layers, 10 | crf_prefix="crf", 11 | crf_ratio=10.0, 12 | ): 13 | groups = [] 14 | crf_groups = [] 15 | temp_groups = [None] * (n_layers + 3) 16 | temp_no_decay_groups = [None] * (n_layers + 3) 17 | regex = rf"^{transformer_prefix}\.(embeddings|encoder)\w*\.(layer.(\d+))?.+" 18 | regex = re.compile(regex) 19 | for name, parameters in named_parameters: 20 | m = regex.match(name) 21 | 22 | is_transformer = True 23 | if m is None: 24 | depth = n_layers + 2 25 | is_transformer = False 26 | elif m.group(1) == "embeddings": 27 | depth = 0 28 | elif m.group(1) == "encoder": 29 | depth = int(m.group(3)) + 1 30 | else: 31 | raise Exception("Not Recommend!!!") 32 | 33 | if is_transformer and any(x in name for x in ["bias", "LayerNorm.bias", "LayerNorm.weight"]): 34 | if temp_no_decay_groups[depth] is None: 35 | temp_no_decay_groups[depth] = [] 36 | temp_no_decay_groups[depth].append(parameters) 37 | elif not is_transformer and crf_prefix in name: 38 | crf_groups.append(parameters) 39 | else: 40 | if temp_groups[depth] is None: 41 | 
temp_groups[depth] = [] 42 | temp_groups[depth].append(parameters) 43 | 44 | for depth, parameters in enumerate(temp_no_decay_groups): 45 | if parameters: 46 | groups.append( 47 | { 48 | "params": parameters, 49 | "weight_decay": 0.0, 50 | "lr": learning_rate * (layer_decay ** (n_layers + 2 - depth)), 51 | } 52 | ) 53 | for depth, parameters in enumerate(temp_groups): 54 | if parameters: 55 | groups.append( 56 | { 57 | "params": parameters, 58 | "lr": learning_rate * (layer_decay ** (n_layers + 2 - depth)), 59 | } 60 | ) 61 | if crf_groups: 62 | groups.append({"params": crf_groups, "lr": learning_rate * crf_ratio}) 63 | 64 | return groups 65 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/processor/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class NOP(nn.Module): 6 | def __init__(self, dropout=0.1): 7 | super().__init__() 8 | self.dropout = nn.Dropout(dropout) 9 | 10 | def __call__(self, outputs, attention_mask=None, word_index=None, word_attention_mask=None): 11 | return self.dropout(outputs.last_hidden_state), attention_mask == 1 12 | 13 | 14 | class TokenOnly(nn.Module): 15 | def __init__(self, dropout=0.1): 16 | super().__init__() 17 | self.dropout = nn.Dropout(dropout) 18 | 19 | def __call__(self, outputs, attention_mask=None, word_index=None, word_attention_mask=None): 20 | return ( 21 | self.dropout(outputs.last_hidden_state[:, 1:-1]), 22 | attention_mask[:, 2:] == 1, 23 | ) 24 | 25 | 26 | class WordsOnly(nn.Module): 27 | def __init__(self, dropout=0.1): 28 | super().__init__() 29 | self.dropout = nn.Dropout(dropout) 30 | 31 | def __call__(self, outputs, attention_mask=None, word_index=None, word_attention_mask=None): 32 | hidden = outputs.last_hidden_state 33 | hidden = torch.gather( 34 | hidden[:, 1:-1, :], 35 | dim=1, 36 | index=word_index.unsqueeze(-1).expand(-1, -1, hidden.size(-1)), 37 | ) 38 | return self.dropout(hidden), word_attention_mask 39 | 40 | 41 | class ClsOnly(nn.Module): 42 | def __init__(self, dropout=0.1): 43 | super().__init__() 44 | self.dropout = nn.Dropout(dropout) 45 | 46 | def __call__(self, outputs, attention_mask=None, word_index=None, word_attention_mask=None): 47 | return self.dropout(outputs.last_hidden_state[:, 0]), None 48 | 49 | 50 | class WordsWithHead(nn.Module): 51 | def __init__(self, dropout=0.1): 52 | super().__init__() 53 | self.dropout = nn.Dropout(dropout) 54 | 55 | def __call__(self, outputs, attention_mask=None, word_index=None, word_attention_mask=None): 56 | hidden = outputs.last_hidden_state 57 | hidden = torch.cat( 58 | [ 59 | hidden[:, :1, :], 60 | torch.gather( 61 | hidden[:, 1:-1, :], 62 | dim=1, 63 | index=word_index.unsqueeze(-1).expand(-1, -1, hidden.size(-1)), 64 | ), 65 | ], 66 | dim=1, 67 | ) 68 | return self.dropout(hidden), word_attention_mask 69 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/utils/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/utils/instantiate.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import importlib 3 | from typing import 
Callable 4 | 5 | 6 | def find_callable(target: str) -> Callable: 7 | target_module_path, target_callable_path = target.rsplit(".", 1) 8 | target_callable_paths = [target_callable_path] 9 | 10 | target_module = None 11 | while len(target_module_path): 12 | try: 13 | target_module = importlib.import_module(target_module_path) 14 | break 15 | except Exception as e: 16 | target_module_path, target_callable_path = target_module_path.rsplit(".", 1) 17 | if len(target_module_path) == 0: 18 | raise e 19 | target_callable_paths.append(target_callable_path) 20 | target_callable = target_module 21 | for attr in reversed(target_callable_paths): 22 | target_callable = getattr(target_callable, attr) 23 | 24 | return target_callable 25 | 26 | 27 | def instantiate(config, target="_ltp_target_", partial="_ltp_partial_"): 28 | if isinstance(config, dict) and target in config: 29 | target_path = config.get(target) 30 | target_callable = find_callable(target_path) 31 | 32 | is_partial = config.get(partial, False) 33 | target_args = {key: instantiate(value) for key, value in config.items() if key not in [target, partial]} 34 | 35 | if is_partial: 36 | return functools.partial(target_callable, **target_args) 37 | else: 38 | return target_callable(**target_args) 39 | elif isinstance(config, dict): 40 | return {key: instantiate(value) for key, value in config.items()} 41 | else: 42 | return config 43 | 44 | 45 | def instantiate_omega(config, target="_ltp_target_", partial="_ltp_partial_"): 46 | from omegaconf import DictConfig 47 | 48 | if (isinstance(config, dict) or isinstance(config, DictConfig)) and target in config: 49 | target_path = config.get(target) 50 | target_callable = find_callable(target_path) 51 | 52 | is_partial = config.get(partial, False) 53 | target_args = {key: instantiate_omega(value) for key, value in config.items() if key not in [target, partial]} 54 | 55 | if is_partial: 56 | return functools.partial(target_callable, **target_args) 57 | else: 58 | return target_callable(**target_args) 59 | elif isinstance(config, dict) or isinstance(config, DictConfig): 60 | return {key: instantiate_omega(value) for key, value in config.items()} 61 | else: 62 | return config 63 | 64 | 65 | def main(): 66 | import yaml 67 | 68 | with open("configs/model/model.yaml") as stream: 69 | try: 70 | config = yaml.safe_load(stream) 71 | model_config = config["model"] 72 | except yaml.YAMLError as exc: 73 | print(exc) 74 | 75 | model = instantiate(model_config) 76 | print(model) 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/utils/transformer.py: -------------------------------------------------------------------------------- 1 | def load_transformers(config): 2 | from transformers import AutoConfig, AutoModel 3 | 4 | config = AutoConfig.for_model(**config) 5 | return AutoModel.from_config(config) 6 | -------------------------------------------------------------------------------- /python/core/ltp_core/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/utils/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/utils/pylogger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from pytorch_lightning.utilities import 
rank_zero_only 4 | 5 | 6 | def get_pylogger(name=__name__) -> logging.Logger: 7 | """Initializes multi-GPU-friendly python command line logger.""" 8 | 9 | logger = logging.getLogger(name) 10 | 11 | # this ensures all logging levels get marked with the rank zero decorator 12 | # otherwise logs would get multiplied for each GPU process in multi-GPU setup 13 | logging_levels = ( 14 | "debug", 15 | "info", 16 | "warning", 17 | "error", 18 | "exception", 19 | "fatal", 20 | "critical", 21 | ) 22 | for level in logging_levels: 23 | setattr(logger, level, rank_zero_only(getattr(logger, level))) 24 | 25 | return logger 26 | -------------------------------------------------------------------------------- /python/core/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pytest.ini_options] 2 | addopts = [ 3 | "--color=yes", 4 | "--durations=0", 5 | "--strict-markers", 6 | "--doctest-modules", 7 | ] 8 | filterwarnings = [ 9 | "ignore::DeprecationWarning", 10 | "ignore::UserWarning", 11 | ] 12 | log_cli = "True" 13 | markers = [ 14 | "slow: slow tests", 15 | ] 16 | minversion = "6.0" 17 | testpaths = "tests/" 18 | 19 | [tool.coverage.report] 20 | exclude_lines = [ 21 | "pragma: nocover", 22 | "raise NotImplementedError", 23 | "raise NotImplementedError()", 24 | "if __name__ == .__main__.:", 25 | ] 26 | 27 | [tool.ruff] 28 | # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. 29 | select = ["E", "F"] 30 | ignore = [] 31 | 32 | # Allow autofix for all enabled rules (when `--fix`) is provided. 33 | fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] 34 | unfixable = [] 35 | 36 | # Exclude a variety of commonly ignored directories. 37 | exclude = [ 38 | ".bzr", 39 | ".direnv", 40 | ".eggs", 41 | ".git", 42 | ".git-rewrite", 43 | ".hg", 44 | ".mypy_cache", 45 | ".nox", 46 | ".pants.d", 47 | ".pytype", 48 | ".ruff_cache", 49 | ".svn", 50 | ".tox", 51 | ".venv", 52 | "__pypackages__", 53 | "_build", 54 | "buck-out", 55 | "build", 56 | "dist", 57 | "node_modules", 58 | "venv", 59 | "ltp_core/train.py", 60 | "ltp_core/eval.py", 61 | ] 62 | per-file-ignores = {} 63 | 64 | # Same as Black. 65 | line-length = 120 66 | 67 | # Allow unused variables when underscore-prefixed. 68 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 69 | 70 | # Assume Python 3.10. 71 | target-version = "py310" 72 | 73 | [tool.ruff.mccabe] 74 | # Unlike Flake8, default to a complexity level of 10. 
75 | max-complexity = 10 -------------------------------------------------------------------------------- /python/core/requirements.txt: -------------------------------------------------------------------------------- 1 | # --------- pytorch --------- # 2 | torch>=1.6.0 3 | 4 | # --------- transformers --------- # 5 | transformers>=4.0.0 6 | 7 | # --------- train --------- # 8 | pytorch-lightning>=1.5.10 9 | torchmetrics>=0.7.0 10 | datasets>=1.0.0 11 | 12 | # --------- hydra --------- # 13 | hydra-core>=1.1.0 14 | hydra-colorlog>=1.1.0 15 | 16 | # --------- loggers --------- # 17 | wandb 18 | # neptune-client 19 | # mlflow 20 | # comet-ml 21 | # tensorboard 22 | 23 | # --------- others --------- # 24 | pyrootutils # standardizing the project root setup 25 | pre-commit # hooks for applying linters on commit 26 | rich # beautiful text formatting in terminal 27 | pytest # tests 28 | sh # for running bash commands in some tests 29 | -------------------------------------------------------------------------------- /python/core/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length = 99 3 | profile = black 4 | filter_files = True 5 | 6 | 7 | [flake8] 8 | max_line_length = 99 9 | show_source = True 10 | format = pylint 11 | ignore = 12 | F401 # Module imported but unused 13 | W504 # Line break occurred after a binary operator 14 | F841 # Local variable name is assigned to but never used 15 | E501 # Line too long 16 | exclude = 17 | .git 18 | __pycache__ 19 | data/* 20 | tests/* 21 | notebooks/* 22 | logs/* 23 | 24 | 25 | [tool:pytest] 26 | testpaths = tests/ 27 | log_cli = True 28 | markers = 29 | slow 30 | addopts = 31 | --durations=0 32 | --strict-markers 33 | --doctest-modules 34 | filterwarnings = 35 | ignore::DeprecationWarning 36 | ignore::UserWarning 37 | -------------------------------------------------------------------------------- /python/core/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | project_dir, _ = os.path.split(__file__) 6 | 7 | with open(os.path.join(project_dir, "README.md"), encoding="utf-8") as fh: 8 | long_description = fh.read() 9 | 10 | setup( 11 | name="ltp_core", 12 | version="0.1.4", 13 | author="Yunlong Feng", 14 | author_email="ylfeng@ir.hit.edu.cn", 15 | url="https://github.com/HIT-SCIR/ltp", 16 | description="Language Technology Platform", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | install_requires=[ 20 | "torch>=1.6.0", 21 | "transformers>=4.0.0", 22 | ], 23 | extras_require={ 24 | "train": [ 25 | # pytorch-lightning 26 | "pytorch-lightning>=1.0.0", 27 | "torchmetrics>=0.7.0", 28 | # datasets 29 | "datasets>=1.0.0", 30 | # hydra 31 | "rich", 32 | "pyrootutils", 33 | "hydra-core>=1.1.0", 34 | "hydra-colorlog>=1.1.0", 35 | ] 36 | }, 37 | classifiers=[ 38 | "Development Status :: 1 - Planning", 39 | "Operating System :: OS Independent", 40 | "Intended Audience :: Developers", 41 | "Programming Language :: Python :: 3.6", 42 | "Programming Language :: Python :: 3.7", 43 | "Programming Language :: Python :: 3.8", 44 | "Programming Language :: Python :: 3.9", 45 | "Programming Language :: Python :: 3.10", 46 | "Programming Language :: Python :: 3.11", 47 | "Topic :: Software Development :: Libraries", 48 | ], 49 | packages=find_packages(), 50 | include_dirs=["ltp_core"], 51 | python_requires=">=3.6, <4", 52 | zip_safe=True, 53 | ) 54 | 
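A quick aside on the `_ltp_target_` / `_ltp_partial_` convention implemented in ltp_core/models/utils/instantiate.py above: the sketch below is editorial and not part of the repository. The config dict is hypothetical; only `instantiate` itself and `torch.nn.Linear` are assumed to exist as shown.

    from ltp_core.models.utils.instantiate import instantiate

    # A dict whose "_ltp_target_" key names a dotted path is resolved with
    # find_callable and called with the remaining keys as keyword arguments.
    config = {
        "_ltp_target_": "torch.nn.Linear",
        "in_features": 768,
        "out_features": 128,
    }
    layer = instantiate(config)  # equivalent to torch.nn.Linear(in_features=768, out_features=128)

    # "_ltp_partial_": True wraps the callable in functools.partial instead of
    # calling it, letting a config defer construction until runtime.
    make_layer = instantiate({**config, "_ltp_partial_": True})
    layer = make_layer()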
-------------------------------------------------------------------------------- /python/core/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/tests/__init__.py -------------------------------------------------------------------------------- /python/extension/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ltp-extension" 3 | version = "0.1.13" 4 | edition = "2021" 5 | authors = ["ylfeng "] 6 | description = "Rust Extension For Language Technology Platform(Python)." 7 | homepage = "https://github.com/HIT-SCIR/ltp" 8 | repository = "https://github.com/HIT-SCIR/ltp" 9 | keywords = ["ltp", "nlp"] 10 | exclude = [".github"] 11 | readme = "README.md" 12 | license-file = "LICENSE" 13 | 14 | [lib] 15 | name = "ltp_extension" 16 | crate-type = ["cdylib"] 17 | 18 | [dependencies] 19 | libc = { version = "0.2" } 20 | rayon = { version = "1.7" } 21 | rayon-cond = { version = "0.4" } 22 | anyhow = { version = "1.0" } 23 | serde = { version = "1.0", features = ["derive"] } 24 | pyo3 = { version = "0.24", features = ["extension-module", "anyhow", "serde"] } 25 | mimalloc = { version = "0.1", default-features = false, optional = true } 26 | 27 | [dependencies.ltp] 28 | version = "*" 29 | path = "../../rust/ltp" 30 | features = ["serialization", "parallel"] 31 | 32 | [features] 33 | default = ["abi3", "near-char-type"] 34 | malloc = ["mimalloc"] 35 | secure = ["mimalloc/secure"] 36 | char-type = ["ltp/char-type"] 37 | cross-char = ["ltp/cross-char"] 38 | near-char-type = ["ltp/near-char-type"] 39 | abi3 = ["pyo3/abi3", "pyo3/abi3-py37"] 40 | -------------------------------------------------------------------------------- /python/extension/LICENSE: -------------------------------------------------------------------------------- 1 | 1. 语言技术平台面向国内外大学、中科院各研究所以及个人研究者免费开放源代码,但如上述机构和个人将该平台用于商业目的(如企业合作项目等)则需要付费。 2 | 2. 除上述机构以外的企事业单位,如申请使用该平台,需付费。 3 | 3. 凡涉及付费问题,请发邮件到 car@ir.hit.edu.cn 洽商。 4 | 4. 如果您在 LTP 基础上发表论文或取得科研成果,请您在发表论文和申报成果时声明“使用了哈工大社会计算与信息检索研究中心研制的语言技术平台(LTP)”. 
5 | 同时,发信给car@ir.hit.edu.cn,说明发表论文或申报成果的题目、出处等。 6 | -------------------------------------------------------------------------------- /python/extension/examples/legacy_train.py: -------------------------------------------------------------------------------- 1 | from ltp_extension.perceptron import Algorithm, CWSModel, CWSTrainer, ModelType, Trainer 2 | 3 | 4 | def train_cws(): 5 | ap = Algorithm("AP") 6 | pa = Algorithm("Pa") 7 | pai = Algorithm("PaI", 0.5) 8 | paii = Algorithm("PaII", 0.5) 9 | 10 | trainer: CWSTrainer = CWSTrainer() 11 | trainer.epoch = 10 12 | trainer.load_train_data("data/cws/val.txt") 13 | trainer.load_eval_data("data/cws/test.txt") 14 | print(trainer) 15 | 16 | for algorithm in [ap, pa, pai, paii]: 17 | print(algorithm) 18 | trainer.algorithm = algorithm 19 | _: CWSModel = trainer.train() 20 | 21 | 22 | def train_auto(): 23 | ap = Algorithm("AP") 24 | pa = Algorithm("Pa") 25 | pai = Algorithm("PaI", 0.5) 26 | paii = Algorithm("PaII", 0.5) 27 | 28 | model_type = ModelType("cws") 29 | trainer: Trainer = Trainer(model_type) 30 | trainer.epoch = 10 31 | trainer.load_train_data("data/cws/val.txt") 32 | trainer.load_eval_data("data/cws/test.txt") 33 | print(trainer) 34 | 35 | for algorithm in [ap, pa, pai, paii]: 36 | print(algorithm) 37 | trainer.algorithm = algorithm 38 | _: CWSModel = trainer.train() 39 | 40 | 41 | def main(): 42 | # train_cws() 43 | train_auto() 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /python/extension/ltp_extension/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ltp_extension 2 | 3 | __version__ = ltp_extension.__version__ 4 | perceptron = ltp_extension.perceptron 5 | algorithms = ltp_extension.algorithms 6 | -------------------------------------------------------------------------------- /python/extension/ltp_extension/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | # Generated content DO NOT EDIT 2 | from .. import algorithms 3 | 4 | eisner = algorithms.eisner 5 | get_entities = algorithms.get_entities 6 | viterbi_decode_postprocess = algorithms.viterbi_decode_postprocess 7 | Hook = algorithms.Hook 8 | StnSplit = algorithms.StnSplit 9 | -------------------------------------------------------------------------------- /python/extension/ltp_extension/algorithms/algorithms.pyi: -------------------------------------------------------------------------------- 1 | # Generated content DO NOT EDIT 2 | def eisner(scores, stn_length, remove_root=False): 3 | """ 4 | Decode with Eisner's algorithm 5 | """ 6 | pass 7 | 8 | def get_entities(tags): 9 | """ 10 | Convert Tags to Entities 11 | """ 12 | pass 13 | 14 | def viterbi_decode_postprocess(history, last_tags, stn_length, labels_num): 15 | """ 16 | Viterbi Decode Postprocessing 17 | """ 18 | pass 19 | 20 | class Hook: 21 | def __init__(self): 22 | pass 23 | def add_word(self, word, freq=None): 24 | """ 25 | add words to the hook, the freq can be zero 26 | """ 27 | pass 28 | def hook(self, sentence, words): 29 | """ 30 | hook to the new words 31 | """ 32 | pass 33 | 34 | class StnSplit: 35 | def __init__(self): 36 | pass 37 | def batch_split(self, batch_text, threads=8): 38 | """ 39 | batch split to sentences 40 | """ 41 | pass 42 | @property 43 | def bracket_as_entity(self): 44 | """ 45 | Get the value of the bracket_as_entity option. 
46 | """ 47 | pass 48 | @property 49 | def en_quote_as_entity(self): 50 | """ 51 | Get the value of the en_quote_as_entity option. 52 | """ 53 | pass 54 | def split(self, text): 55 | """ 56 | split to sentences 57 | """ 58 | pass 59 | @property 60 | def use_en(self): 61 | """ 62 | Get the value of the use_en option. 63 | """ 64 | pass 65 | @property 66 | def use_zh(self): 67 | """ 68 | Get the value of the use_zh option. 69 | """ 70 | pass 71 | @property 72 | def zh_quote_as_entity(self): 73 | """ 74 | Get the value of the zh_quote_as_entity option. 75 | """ 76 | pass 77 | -------------------------------------------------------------------------------- /python/extension/ltp_extension/ltp_extension.pyi: -------------------------------------------------------------------------------- 1 | # Generated content DO NOT EDIT 2 | -------------------------------------------------------------------------------- /python/extension/ltp_extension/perceptron/__init__.py: -------------------------------------------------------------------------------- 1 | # Generated content DO NOT EDIT 2 | from .. import perceptron 3 | 4 | Algorithm = perceptron.Algorithm 5 | CWSModel = perceptron.CWSModel 6 | CWSTrainer = perceptron.CWSTrainer 7 | CharacterType = perceptron.CharacterType 8 | Model = perceptron.Model 9 | ModelType = perceptron.ModelType 10 | NERModel = perceptron.NERModel 11 | NERTrainer = perceptron.NERTrainer 12 | POSModel = perceptron.POSModel 13 | POSTrainer = perceptron.POSTrainer 14 | Trainer = perceptron.Trainer 15 | -------------------------------------------------------------------------------- /python/extension/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.0,<2.0"] 3 | build-backend = "maturin" 4 | 5 | [tool.ruff] 6 | # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. 7 | select = ["E", "F"] 8 | ignore = [] 9 | 10 | # Allow autofix for all enabled rules (when `--fix`) is provided. 11 | fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] 12 | unfixable = [] 13 | 14 | # Exclude a variety of commonly ignored directories. 15 | exclude = [ 16 | ".bzr", 17 | ".direnv", 18 | ".eggs", 19 | ".git", 20 | ".git-rewrite", 21 | ".hg", 22 | ".mypy_cache", 23 | ".nox", 24 | ".pants.d", 25 | ".pytype", 26 | ".ruff_cache", 27 | ".svn", 28 | ".tox", 29 | ".venv", 30 | "__pypackages__", 31 | "_build", 32 | "buck-out", 33 | "build", 34 | "dist", 35 | "node_modules", 36 | "venv", 37 | "*.pyi" 38 | ] 39 | per-file-ignores = {} 40 | 41 | # Same as Black. 42 | line-length = 120 43 | 44 | # Allow unused variables when underscore-prefixed. 45 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 46 | 47 | # Assume Python 3.10. 48 | target-version = "py310" 49 | 50 | [tool.ruff.mccabe] 51 | # Unlike Flake8, default to a complexity level of 10. 
52 | max-complexity = 10 -------------------------------------------------------------------------------- /python/extension/src/algorithms.rs: -------------------------------------------------------------------------------- 1 | use ltp::utils::{drop_get_entities, eisner, viterbi_decode_postprocessing}; 2 | use pyo3::prelude::*; 3 | 4 | /// Convert Tags to Entities 5 | #[pyfunction] 6 | #[pyo3(name = "get_entities", text_signature = "(tags)")] 7 | pub fn py_get_entities(tags: Vec<&str>) -> PyResult> { 8 | Ok(drop_get_entities(tags)) 9 | } 10 | 11 | /// Decode with Eisner's algorithm 12 | #[pyfunction] 13 | #[pyo3( 14 | name = "eisner", 15 | text_signature = "(scores, stn_length, remove_root=False)" 16 | )] 17 | pub fn py_eisner( 18 | scores: Vec, 19 | stn_length: Vec, 20 | remove_root: bool, 21 | ) -> PyResult>> { 22 | Ok(eisner(&scores, &stn_length, remove_root)) 23 | } 24 | 25 | /// Viterbi Decode Postprocessing 26 | #[pyfunction] 27 | #[pyo3( 28 | name = "viterbi_decode_postprocess", 29 | text_signature = "(history, last_tags, stn_length, labels_num)" 30 | )] 31 | pub fn py_viterbi_decode_postprocess( 32 | history: Vec, 33 | last_tags: Vec, 34 | stn_lengths: Vec, 35 | labels_num: usize, 36 | ) -> PyResult>> { 37 | Ok(viterbi_decode_postprocessing( 38 | &history, 39 | &last_tags, 40 | &stn_lengths, 41 | labels_num, 42 | )) 43 | } 44 | -------------------------------------------------------------------------------- /python/extension/src/hook.rs: -------------------------------------------------------------------------------- 1 | use ltp::utils::hook::Hook; 2 | use pyo3::prelude::*; 3 | 4 | #[pyclass(module = "ltp_extension.algorithms", name = "Hook", subclass)] 5 | #[derive(Clone, Debug)] 6 | pub struct PyHook { 7 | pub hook: Hook, 8 | } 9 | 10 | #[pymethods] 11 | impl PyHook { 12 | #[new] 13 | #[pyo3(text_signature = "(self)")] 14 | pub fn new() -> PyResult { 15 | Ok(Self { hook: Hook::new() }) 16 | } 17 | 18 | pub fn __len__(&self) -> usize { 19 | self.hook.total() 20 | } 21 | 22 | /// add words to the hook, the freq can be zero 23 | #[pyo3(text_signature = "(self, word, freq = None)")] 24 | pub fn add_word(&mut self, word: &str, freq: Option) -> usize { 25 | self.hook.add_word(word, freq) 26 | } 27 | 28 | /// hook to the new words 29 | #[pyo3(text_signature = "(self, sentence, words)")] 30 | pub fn hook<'a>(&self, sentence: &'a str, words: Vec<&str>) -> PyResult> { 31 | Ok(self.hook.hook(sentence, &words)?) 
32 | } 33 | } 34 | -------------------------------------------------------------------------------- /python/extension/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "mimalloc")] 2 | use mimalloc::MiMalloc; 3 | 4 | #[cfg(feature = "mimalloc")] 5 | #[global_allocator] 6 | static GLOBAL: MiMalloc = MiMalloc; 7 | 8 | mod algorithms; 9 | mod hook; 10 | mod perceptron; 11 | mod stnsplit; 12 | mod utils; 13 | 14 | use crate::perceptron::{ModelType, PyModel, PyTrainer}; 15 | pub use algorithms::{py_eisner, py_get_entities, py_viterbi_decode_postprocess}; 16 | use hook::PyHook; 17 | pub use perceptron::{ 18 | CharacterType, PyAlgorithm, PyCWSModel, PyCWSTrainer, PyNERModel, PyNERTrainer, PyPOSModel, 19 | PyPOSTrainer, 20 | }; 21 | use pyo3::prelude::*; 22 | use stnsplit::StnSplit; 23 | 24 | pub const VERSION: &str = env!("CARGO_PKG_VERSION"); 25 | 26 | // For users using multiprocessing in python, it is quite easy to fork the process running 27 | // tokenizers, ending up with a deadlock because we internally make use of multithreading. So 28 | // we register a callback to be called in the event of a fork so that we can warn the user. 29 | static mut REGISTERED_FORK_CALLBACK: bool = false; 30 | extern "C" fn child_after_fork() { 31 | use utils::parallelism::*; 32 | if has_parallelism_been_used() && !is_parallelism_configured() { 33 | println!( 34 | "LTP: The current process just got forked, after parallelism has \ 35 | already been used. Disabling parallelism to avoid deadlocks..." 36 | ); 37 | println!("To disable this warning, you can either:"); 38 | println!( 39 | "\t- Avoid using `LTP/legacy` model before the fork if possible\n\ 40 | \t- Explicitly set the environment variable {}=(true | false)", 41 | ENV_VARIABLE 42 | ); 43 | set_parallelism(false); 44 | } 45 | } 46 | 47 | /// LTP Module 48 | #[pymodule] 49 | fn ltp_extension(py: Python, m: &PyModule) -> PyResult<()> { 50 | // Register the fork callback 51 | #[cfg(target_family = "unix")] 52 | unsafe { 53 | if !REGISTERED_FORK_CALLBACK { 54 | libc::pthread_atfork(None, None, Some(child_after_fork)); 55 | REGISTERED_FORK_CALLBACK = true; 56 | } 57 | } 58 | 59 | m.add("__version__", env!("CARGO_PKG_VERSION"))?; 60 | 61 | // Algorithms Module 62 | let algorithms = PyModule::new(py, "algorithms")?; 63 | 64 | algorithms.add_class::<PyHook>()?; 65 | algorithms.add_class::<StnSplit>()?; 66 | algorithms.add_function(wrap_pyfunction!(py_eisner, m)?)?; 67 | algorithms.add_function(wrap_pyfunction!(py_get_entities, m)?)?; 68 | algorithms.add_function(wrap_pyfunction!(py_viterbi_decode_postprocess, m)?)?; 69 | 70 | // Perceptron Module 71 | let perceptron = PyModule::new(py, "perceptron")?; 72 | perceptron.add_class::<ModelType>()?; 73 | perceptron.add_class::<PyModel>()?; 74 | perceptron.add_class::<PyTrainer>()?; 75 | perceptron.add_class::<PyAlgorithm>()?; 76 | 77 | perceptron.add_class::<CharacterType>()?; 78 | perceptron.add_class::<PyCWSModel>()?; 79 | perceptron.add_class::<PyCWSTrainer>()?; 80 | 81 | perceptron.add_class::<PyNERModel>()?; 82 | perceptron.add_class::<PyNERTrainer>()?; 83 | 84 | perceptron.add_class::<PyPOSModel>()?; 85 | perceptron.add_class::<PyPOSTrainer>()?; 86 | 87 | m.add_submodule(algorithms)?; 88 | m.add_submodule(perceptron)?; 89 | Ok(()) 90 | } 91 | -------------------------------------------------------------------------------- /python/extension/src/perceptron/alg.rs: -------------------------------------------------------------------------------- 1 | use ltp::perceptron::{Algorithm, PaMode}; 2 | use pyo3::exceptions::PyValueError; 3 | use pyo3::prelude::*; 4 | use serde::{Deserialize, Serialize}; 5 | use 
std::fmt::{Display, Formatter}; 6 | 7 | /// The perceptron algorithm. 8 | /// Supported algorithms: "AP", "Pa", "PaI", "PaII" 9 | /// AP: averaged perceptron, param is the number of training threads 10 | /// Pa/PaI/PaII: the passive-aggressive algorithm and its two variants, param is c (the margin) 11 | #[pyclass(module = "ltp_extension.perceptron", name = "Algorithm", subclass)] 12 | #[derive(Clone, Serialize, Deserialize, Default, Debug, PartialEq)] 13 | pub struct PyAlgorithm { 14 | pub(crate) algorithm: Algorithm, 15 | } 16 | 17 | impl Display for PyAlgorithm { 18 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 19 | write!(f, "{}", self.algorithm) 20 | } 21 | } 22 | 23 | #[pymethods] 24 | impl PyAlgorithm { 25 | #[new] 26 | #[pyo3(text_signature = "(self, algorithm, param = None)")] 27 | pub fn new(py: Python, algorithm: &str, param: Option<PyObject>) -> PyResult<Self> { 28 | let algorithm: Algorithm = match algorithm { 29 | "AP" => { 30 | if let Some(param) = param { 31 | let param = param.extract::<usize>(py)?; 32 | Ok(Algorithm::AP(param)) 33 | } else { 34 | Ok(Algorithm::AP(1usize)) 35 | } 36 | } 37 | "Pa" => Ok(Algorithm::PA(PaMode::Pa)), 38 | "PaI" => { 39 | if let Some(c) = param { 40 | let c = c.extract::<f64>(py)?; 41 | Ok(Algorithm::PA(PaMode::PaI(c))) 42 | } else { 43 | Err(PyValueError::new_err("param is needed")) 44 | } 45 | } 46 | "PaII" => { 47 | if let Some(c) = param { 48 | let c = c.extract::<f64>(py)?; 49 | Ok(Algorithm::PA(PaMode::PaII(c))) 50 | } else { 51 | Err(PyValueError::new_err("param is needed")) 52 | } 53 | } 54 | _ => Err(PyValueError::new_err("algorithm is not supported"))?, 55 | }?; 56 | 57 | Ok(Self { algorithm }) 58 | } 59 | 60 | fn __repr__(&self) -> String { 61 | format!("{}", self.algorithm) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /python/extension/src/perceptron/com.rs: -------------------------------------------------------------------------------- 1 | #[macro_export] 2 | macro_rules! impl_model { 3 | ($name:ident) => { 4 | impl $name { 5 | fn inner_load(path: &str) -> anyhow::Result<Self> { 6 | use ltp::perceptron::ModelSerde; 7 | let file = std::fs::File::open(path)?; 8 | let model = if path.ends_with(".json") { 9 | ModelSerde::load(file, ltp::perceptron::Format::JSON)? 10 | } else { 11 | ModelSerde::load( 12 | file, 13 | ltp::perceptron::Format::AVRO(ltp::perceptron::Codec::Deflate), 14 | )? 
15 | }; 16 | Ok(Self { model }) 17 | } 18 | 19 | fn inner_save(&self, path: &str) -> anyhow::Result<()> { 20 | use ltp::perceptron::ModelSerde; 21 | let file = std::fs::File::create(path)?; 22 | if path.ends_with(".json") { 23 | self.model.save(file, ltp::perceptron::Format::JSON)?; 24 | } else { 25 | self.model.save( 26 | file, 27 | ltp::perceptron::Format::AVRO(ltp::perceptron::Codec::Deflate), 28 | )?; 29 | } 30 | Ok(()) 31 | } 32 | } 33 | }; 34 | () => {}; 35 | } 36 | -------------------------------------------------------------------------------- /python/extension/src/perceptron/mod.rs: -------------------------------------------------------------------------------- 1 | mod alg; 2 | mod com; 3 | mod model; 4 | mod specialization; 5 | mod trainer; 6 | 7 | pub type Perceptron = ltp::perceptron::SerdeModel; 8 | pub use alg::PyAlgorithm; 9 | pub use model::{EnumModel, ModelType, PyModel}; 10 | pub use specialization::{ 11 | CharacterType, PyCWSModel, PyCWSTrainer, PyNERModel, PyNERTrainer, PyPOSModel, PyPOSTrainer, 12 | }; 13 | pub use trainer::{EnumTrainer, PyTrainer}; 14 | -------------------------------------------------------------------------------- /python/extension/src/perceptron/specialization/mod.rs: -------------------------------------------------------------------------------- 1 | mod cws; 2 | mod ner; 3 | mod pos; 4 | 5 | pub use cws::{CharacterType, PyCWSModel, PyCWSTrainer}; 6 | pub use ner::{PyNERModel, PyNERTrainer}; 7 | pub use pos::{PyPOSModel, PyPOSTrainer}; 8 | -------------------------------------------------------------------------------- /python/extension/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod parallelism; 2 | -------------------------------------------------------------------------------- /python/interface/LICENSE: -------------------------------------------------------------------------------- 1 | 1. 语言技术平台面向国内外大学、中科院各研究所以及个人研究者免费开放源代码,但如上述机构和个人将该平台用于商业目的(如企业合作项目等)则需要付费。 2 | 2. 除上述机构以外的企事业单位,如申请使用该平台,需付费。 3 | 3. 凡涉及付费问题,请发邮件到 car@ir.hit.edu.cn 洽商。 4 | 4. 如果您在 LTP 基础上发表论文或取得科研成果,请您在发表论文和申报成果时声明“使用了哈工大社会计算与信息检索研究中心研制的语言技术平台(LTP)”. 5 | 同时,发信给car@ir.hit.edu.cn,说明发表论文或申报成果的题目、出处等。 6 | -------------------------------------------------------------------------------- /python/interface/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include requirements.txt 3 | 4 | recursive-include ltp * 5 | 6 | recursive-exclude * *.pyc 7 | recursive-exclude * .DS_Store 8 | recursive-exclude * __pycache__ 9 | -------------------------------------------------------------------------------- /python/interface/Makefile: -------------------------------------------------------------------------------- 1 | 2 | help: ## Show help 3 | @grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 4 | 5 | clean: ## Clean autogenerated files 6 | rm -rf dist 7 | find . -type f -name "*.DS_Store" -ls -delete 8 | find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf 9 | find . | grep -E ".pytest_cache" | xargs rm -rf 10 | find . 
| grep -E ".ipynb_checkpoints" | xargs rm -rf 11 | rm -f .coverage 12 | 13 | style: ## Run pre-commit hooks 14 | pre-commit run -a 15 | 16 | sync: ## Merge changes from main branch to your current branch 17 | git fetch --all 18 | git merge main 19 | 20 | test: ## Run not slow tests 21 | pytest -k "not slow" 22 | 23 | test-full: ## Run all tests 24 | pytest 25 | -------------------------------------------------------------------------------- /python/interface/docs/README.md: -------------------------------------------------------------------------------- 1 | # 文档生成 2 | 3 | ```shell script 4 | sphinx-build -b html docs build 5 | ``` 6 | -------------------------------------------------------------------------------- /python/interface/docs/api/ltp.rst: -------------------------------------------------------------------------------- 1 | LTP 文档 2 | =========== 3 | 4 | Submodules 5 | ---------- 6 | 7 | ltp.interface module 8 | -------------------- 9 | 10 | .. automodule:: ltp.interface 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | ltp.legacy module 16 | ----------------- 17 | 18 | .. automodule:: ltp.legacy 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | ltp.nerual module 24 | ----------------- 25 | 26 | .. automodule:: ltp.nerual 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: ltp 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /python/interface/docs/conf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.insert(0, os.path.abspath("../ltp")) 5 | 6 | project = "LTP4" 7 | copyright = "2020, Feng Yunlong" 8 | author = "Feng Yunlong" 9 | 10 | from ltp import __version__ as version 11 | 12 | release = version 13 | 14 | extensions = [ 15 | "sphinx.ext.autodoc", 16 | "sphinx.ext.coverage", 17 | "sphinx.ext.doctest", 18 | "sphinx.ext.intersphinx", 19 | "sphinx.ext.viewcode", 20 | "sphinx.ext.napoleon", 21 | ] 22 | 23 | autodoc_default_options = { 24 | "members": True, 25 | "show-inheritance": False, 26 | "member-order": "bysource", 27 | "exclude-members": "__weakref__", 28 | } 29 | 30 | autodoc_typehints = "none" 31 | add_module_names = False 32 | 33 | templates_path = ["templates"] 34 | language = "zh" 35 | exclude_patterns = [] 36 | html_theme = "sphinx_rtd_theme" 37 | html_static_path = ["static"] 38 | source_suffix = [".rst", ".md"] 39 | master_doc = "index" 40 | -------------------------------------------------------------------------------- /python/interface/docs/index.rst: -------------------------------------------------------------------------------- 1 | LTP4 文档 2 | ================================ 3 | 4 | 5 | .. include:: introduction.rst 6 | .. include:: quickstart.rst 7 | .. include:: performance.rst 8 | 9 | API文档 10 | ======== 11 | .. 
toctree:: 12 | :titlesonly: 13 | :glob: 14 | 15 | introduction 16 | quickstart 17 | performance 18 | 19 | api/* 20 | 21 | appendix 22 | 23 | 24 | 索引和图表 25 | ================== 26 | 27 | * :ref:`genindex` 28 | * :ref:`modindex` 29 | * :ref:`search` 30 |
-------------------------------------------------------------------------------- /python/interface/docs/introduction.rst: -------------------------------------------------------------------------------- 1 | 开始使用LTP 2 | ============= 3 | 4 | 如果你是第一次使用LTP,不妨花一些时间了解LTP能帮你做什么。 5 | 6 | LTP提供了一系列中文自然语言处理工具,用户可以使用这些工具对于中文文本进行分词、词性标注、句法分析等等工作。从应用角度来看,LTP为用户提供了下列组件: 7 | 8 | * 针对单一自然语言处理任务,生成统计机器学习模型的工具 9 | * 针对单一自然语言处理任务,调用模型进行分析的编程接口 10 | * 系统可调用的,用于中文语言处理的模型文件 11 | * 针对单一自然语言处理任务,基于云端的编程接口 12 | 13 | 如果你的公司需要一套高性能的中文语言分析工具以处理海量的文本,或者你的研究工作建立在一系列底层中文自然语言处理任务之上,或者你想将自己的科研成果与前沿先进工作进行对比,LTP都可能是你的选择。 14 |
-------------------------------------------------------------------------------- /python/interface/docs/performance.rst: -------------------------------------------------------------------------------- 1 | 性能 2 | =============== 3 | 4 | 分词模块 5 | --------- 6 | 7 | 基础模型在人民日报测试数据上的性能如下: 8 | 9 | 语料信息:人民日报1998年2月-6月(后10%数据作为开发集)作为训练数据,1月作为测试数据。 10 | 11 | 12 | +------+----------+ 13 | | | F1 | 14 | +======+==========+ 15 | |测试集| 98.5% | 16 | +------+----------+ 17 | 18 | 19 | 词性标注模块 20 | ------------ 21 | 22 | 基础模型在人民日报数据集上的性能如下: 23 | 24 | 语料信息:人民日报1998年2月-6月(后10%数据作为开发集)作为训练数据,1月作为测试数据。 25 | 26 | +------+----------+ 27 | | | ACC | 28 | +======+==========+ 29 | |测试集| 98.5% | 30 | +------+----------+ 31 | 32 | 命名实体识别模块 33 | ---------------- 34 | 35 | 基础模型在人民日报数据集上的性能如下: 36 | 37 | 语料信息:人民日报1998年1月做训练(后10%数据作为开发集),6月前10000句做测试。 38 | 39 | +------+------+ 40 | | | F1 | 41 | +======+======+ 42 | |测试集| 95.4 | 43 | +------+------+ 44 | 45 | 语义角色标注模块 46 | ----------------- 47 | 48 | 基础模型在CPB3.0上的性能如下: 49 | 50 | +------+----------+ 51 | | | F1 | 52 | +======+==========+ 53 | |测试集| 80.6% | 54 | +------+----------+ 55 | 56 | 依存句法分析模块 57 | ----------------- 58 | 59 | 在 `Chinese Dependency Treebank(CDT) `_ 数据集上的性能如下。 60 | 61 | +------+-------+ 62 | | | LAS | 63 | +======+=======+ 64 | |测试集| 89.5 | 65 | +------+-------+ 66 | 67 | 语义依存分析模块 68 | ----------------- 69 | 70 | 在 `CCL2020 `_ 数据集上的性能如下。 71 | 72 | +------+-------+ 73 | | | LAS | 74 | +======+=======+ 75 | |测试集| 75.2 | 76 | +------+-------+ 77 |
-------------------------------------------------------------------------------- /python/interface/docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | 快速上手 2 | ======== 3 | 4 | 快速安装 5 | ----------- 6 | 7 | 安装LTP是非常简单的,使用Pip安装只需要: 8 | 9 | .. code-block:: sh 10 | 11 | pip install ltp 12 | 13 | 载入模型 14 | -------------------------- 15 | 16 | .. code-block:: python 17 | 18 | from ltp import LTP 19 | ltp = LTP() # 默认加载 LTP/small 模型 20 | # ltp = LTP(path = "LTP/base|LTP/small|LTP/tiny") 21 | 22 | 分句 23 | -------------------------- 24 | 25 | 使用LTP分句只需要使用 StnSplit 26 | 27 | .. code-block:: python 28 | 29 | from ltp import StnSplit 30 | sents = StnSplit().split("汤姆生病了。他去了医院。") 31 | # [ 32 | # "汤姆生病了。", 33 | # "他去了医院。" 34 | # ] 35 | 36 | sents = StnSplit().batch_split(["他叫汤姆去拿外衣。", "汤姆生病了。他去了医院。"]) 37 | 38 | # [ 39 | # "他叫汤姆去拿外衣。", 40 | # "汤姆生病了。", 41 | # "他去了医院。" 42 | # ] 43 | 44 | 用户自定义词典 45 | ------------------- 46 | 47 | ..
code-block:: python 48 | 49 | from ltp import LTP 50 | ltp = LTP() 51 | # 也可以在代码中添加自定义的词语 52 | ltp.add_words(words="长江大桥", freq = 2) 53 | 54 | 55 | 分词 56 | ------------------ 57 | 58 | 使用LTP分词非常简单,下面是一个简短的例子: 59 | 60 | .. code-block:: python 61 | 62 | from ltp import LTP 63 | 64 | ltp = LTP() 65 | 66 | words = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks = ["cws"], return_dict = False) 67 | # [['他', '叫', '汤姆', '去', '拿', '外衣', '。']] 68 | 69 | 70 | 词性标注 71 | ------------------ 72 | 73 | .. code-block:: python 74 | 75 | from ltp import LTP 76 | 77 | ltp = LTP() 78 | 79 | result = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks = ["cws","pos"]) 80 | print(result.pos) 81 | # [['他', '叫', '汤姆', '去', '拿', '外衣', '。']] 82 | # [['r', 'v', 'nh', 'v', 'v', 'n', 'wp']] 83 | 84 | 命名实体识别 85 | ------------------ 86 | 87 | 88 | .. code-block:: python 89 | 90 | from ltp import LTP 91 | 92 | ltp = LTP() 93 | 94 | result = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks = ["cws","ner"]) 95 | print(result.ner) 96 | # [['他', '叫', '汤姆', '去', '拿', '外衣', '。']] 97 | 98 | 99 | 100 | 语义角色标注 101 | ------------------ 102 | 103 | .. code-block:: python 104 | 105 | from ltp import LTP 106 | 107 | ltp = LTP() 108 | 109 | result = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks = ["cws","srl"]) 110 | print(result.srl) 111 | 112 | 113 | 114 | 依存句法分析 115 | ------------------ 116 | 117 | 需要注意的是,在依存句法当中,虚节点ROOT占据了0位置,因此节点的下标从1开始。 118 | 119 | .. code-block:: python 120 | 121 | from ltp import LTP 122 | 123 | ltp = LTP() 124 | 125 | result = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks = ["cws","dep"]) 126 | print(result.dep) 127 | 128 | 129 | 130 | 语义依存分析(树) 131 | ------------------ 132 | 133 | 与依存句法类似的,这里的下标也是从1开始。 134 | 135 | .. code-block:: python 136 | 137 | from ltp import LTP 138 | 139 | ltp = LTP() 140 | 141 | result = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks = ["cws","sdp"]) 142 | print(result.sdp) 143 | 144 | 145 | 语义依存分析(图) 146 | ------------------ 147 | 148 | 与依存句法类似的,这里的下标也是从1开始。 149 | 150 | .. code-block:: python 151 | 152 | from ltp import LTP 153 | 154 | ltp = LTP() 155 | 156 | result = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks = ["cws","sdpg"]) 157 | print(result.sdpg) 158 | 159 | 160 | LTP Server 161 | ------------------------------ 162 | 163 | LTP Server 是对 LTP 的一个简单包装,依赖于 tornado,使用方式如下: 164 | 165 | .. code-block:: bash 166 | 167 | pip install ltp tornado 168 | python examples/server.py serve
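Every task above returns the same `LTPOutput` object; a short illustrative sketch (not part of the original quickstart, using only interfaces that already appear here and in examples/conllu.py below) of the two ways to consume it:

.. code-block:: python

    from ltp import LTP

    ltp = LTP()
    # attribute access on the LTPOutput result
    result = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks=["cws", "pos"])
    print(result.cws, result.pos)
    # or unpack in task order via to_tuple()
    cws, pos = result.to_tuple()
    # or skip the wrapper entirely
    print(ltp.pipeline(["他叫汤姆去拿外衣。"], tasks=["cws"], return_dict=False))

169 | -------------------------------------------------------------------------------- /python/interface/examples/conllu.py: -------------------------------------------------------------------------------- 1 | #!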
/usr/bin/env python 2 | # Author: Yunlong Feng 3 | 4 | 5 | from ltp import LTP 6 | 7 | 8 | class Token: 9 | def __init__(self, id, form, lemma, upos, xpos, feats, head, deprel, deps, misc): 10 | self.id = id 11 | self.form = form 12 | self.lemma = lemma 13 | self.upos = upos 14 | self.xpos = xpos 15 | self.feats = feats 16 | self.head = head 17 | self.deprel = deprel 18 | self.deps = deps 19 | self.misc = misc 20 | 21 | def __str__(self): 22 | return "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format( 23 | self.id, 24 | self.form, 25 | self.lemma, 26 | self.upos, 27 | self.xpos, 28 | self.feats, 29 | self.head, 30 | self.deprel, 31 | self.deps, 32 | self.misc, 33 | ) 34 | 35 | def __repr__(self): 36 | return "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format( 37 | self.id, 38 | self.form, 39 | self.lemma, 40 | self.upos, 41 | self.xpos, 42 | self.feats, 43 | self.head, 44 | self.deprel, 45 | self.deps, 46 | self.misc, 47 | ) 48 | 49 | 50 | def main(): 51 | ltp = LTP("LTP/tiny") 52 | batched_cws, batched_pos, batched_dep, batched_sdpg = ltp.pipeline( 53 | ["他叫汤姆去拿外衣。", "他点头表示同意我的意见。", "我们即将以昂扬的斗志迎来新的一年。"], ["cws", "pos", "dep", "sdpg"] 54 | ).to_tuple() 55 | 56 | for cws, pos, dep, sdpg in zip(batched_cws, batched_pos, batched_dep, batched_sdpg): 57 | sentence = [] 58 | for idx, (form, xpos, head, deprel) in enumerate(zip(cws, pos, dep["head"], dep["label"])): 59 | sentence.append( 60 | Token( 61 | id=idx + 1, 62 | form=form, 63 | lemma="_", 64 | upos="_", 65 | xpos=xpos, 66 | feats="_", 67 | head=head, 68 | deprel=deprel, 69 | deps="", 70 | misc="_", 71 | ) 72 | ) 73 | 74 | for id, head, tag in sdpg: 75 | if sentence[id - 1].deps: 76 | sentence[id - 1].deps = sentence[id - 1].deps + f"|{head}:{tag}" 77 | else: 78 | sentence[id - 1].deps = f"{head}:{tag}" 79 | 80 | sentence = [str(token) for token in sentence] 81 | sentence = "\n".join(sentence) 82 | 83 | print(sentence) 84 | print("\n") 85 | 86 | 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /python/interface/examples/issues.py: -------------------------------------------------------------------------------- 1 | from python.interface.examples.simple import stn_split 2 | 3 | 4 | def issue590(): 5 | from ltp import LTP 6 | ltp = LTP("LTP/tiny") 7 | ltp.add_words(words=["[ENT]"]) 8 | print(ltp.pipeline(["[ENT] Info"], tasks=["cws"])) 9 | 10 | ltp.add_words(words=["[EOS]"]) 11 | print(ltp.pipeline(["[EOS] Info"], tasks=["cws"])) 12 | 13 | 14 | def issue592(): 15 | from ltp import LTP 16 | legacy_ltp = LTP("LTP/legacy") 17 | 18 | legacy_ltp.add_words(words=["SCSG", "IP地址"]) 19 | print(legacy_ltp.pipeline(["SCSGIP地址"], tasks=["cws"])) 20 | 21 | neural_ltp = LTP("LTP/tiny") 22 | 23 | # not bug, but not work because of the bert tokenizer 24 | neural_ltp.add_words(words=["SCSG", "IP地址"]) 25 | print(neural_ltp.pipeline(["SCSGIP地址"], tasks=["cws"])) 26 | 27 | 28 | def issue600(): 29 | from ltp import LTP 30 | legacy_ltp = LTP("LTP/legacy") 31 | print(legacy_ltp.pipeline("他叫汤姆去拿外衣。", tasks=["cws"], return_dict=False)) 32 | 33 | neural_ltp = LTP("LTP/tiny") 34 | print(neural_ltp.pipeline("他叫汤姆去拿外衣。", tasks=["cws"], return_dict=False)) 35 | 36 | 37 | def issue612(): 38 | from ltp import LTP 39 | legacy_ltp = LTP("LTP/legacy") 40 | legacy_ltp.add_words(words=["五星武器"]) 41 | print(legacy_ltp.pipeline("80 抽两五星武器给我吧哥", tasks=["cws"], return_dict=False)) 42 | 43 | neural_ltp = LTP("LTP/tiny") 44 | neural_ltp.add_words(words=["五星武器"]) 45 | print(neural_ltp.pipeline("80 抽两五星武器给我吧哥", 
tasks=["cws"], return_dict=False)) 46 | 47 | 48 | def issue613(): 49 | import cProfile 50 | from pstats import SortKey 51 | 52 | cProfile.run('from ltp import LTP;LTP("LTP/legacy", local_files_only=True)', sort=SortKey.CUMULATIVE) 53 | 54 | 55 | def issue623(): 56 | from ltp import LTP 57 | from matplotlib import pyplot as plt 58 | from tqdm import trange 59 | ltp = LTP("LTP/legacy") 60 | 61 | def get_current_memory() -> int: 62 | import os 63 | 64 | import psutil 65 | 66 | # 获取当前进程内存占用。 67 | pid = os.getpid() 68 | p = psutil.Process(pid) 69 | info = p.memory_full_info() 70 | return info.uss / 1024 / 1024 71 | 72 | memory = [get_current_memory()] 73 | 74 | for _ in trange(10000): 75 | # ltp.pipeline('他叫汤姆去拿外衣。') 76 | # ltp.pipeline('台湾是中国领土不可分割的一部分。') 77 | ltp.pipeline(["他叫汤姆去拿外衣。", "台湾是中国领土不可分割的一部分。"]) 78 | memory.append(get_current_memory()) 79 | 80 | memory.append(get_current_memory()) 81 | 82 | plt.plot(memory) 83 | plt.show() 84 | 85 | 86 | def issue686(): 87 | from ltp_extension.algorithms import Hook 88 | sentence = b'\xc2\x28'.decode('utf-8', 'replace') 89 | hook = Hook() 90 | hook.add_word(word="[FAKE]") 91 | try: 92 | hook.hook(sentence, ['a', 'b']) 93 | except Exception as e: 94 | print(e) 95 | 96 | 97 | def issue693(): 98 | from ltp import LTP 99 | ltp = LTP("LTP/tiny") 100 | print(ltp.pipeline( 101 | ["视觉Transformers通过将图像区域表示为转换后的tokens并通过注意力权重整合它们来提取视觉信息。"], 102 | tasks=["cws"]) 103 | ) 104 | 105 | 106 | def issue714(): 107 | from ltp import StnSplit 108 | 109 | spliter = StnSplit() 110 | spliter.use_en = False 111 | sents = spliter.split("1.联通华盛电商分公司办公室内的灯火彻夜不熄,这已经成为常态。") 112 | print(sents) 113 | 114 | 115 | def main(): 116 | issue714() 117 | 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /python/interface/examples/rules.py: -------------------------------------------------------------------------------- 1 | from ltp import LTP 2 | from ltp.legacy import CharacterType 3 | 4 | 5 | def rules(): 6 | ltp = LTP("LTP/legacy") 7 | result = ltp(["视频4k60fps无bg"], tasks=["cws"]) 8 | print(result.cws) 9 | ltp.enable_type_cut_d(CharacterType.Roman, CharacterType.Kanji) 10 | ltp.enable_type_concat(CharacterType.Digit, CharacterType.Roman) 11 | result = ltp(["视频4k60fps无bg"], tasks=["cws"]) 12 | print(result.cws) 13 | 14 | 15 | def main(): 16 | rules() 17 | 18 | 19 | if __name__ == "__main__": 20 | main() 21 | -------------------------------------------------------------------------------- /python/interface/examples/simple.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from ltp import LTP 3 | 4 | 5 | def stn_split(): 6 | from ltp import StnSplit 7 | 8 | spliter = StnSplit() 9 | spliter.use_en = False # 关闭英文断句 10 | 11 | sents = spliter.split("汤姆生病了。他去了医院。") 12 | print(sents) 13 | # [ 14 | # "汤姆生病了。", 15 | # "他去了医院。" 16 | # ] 17 | 18 | sents = StnSplit().batch_split(["他叫汤姆去拿外衣。", "汤姆生病了。他去了医院。"]) 19 | print(sents) 20 | # [ 21 | # "他叫汤姆去拿外衣。", 22 | # "汤姆生病了。", 23 | # "他去了医院。" 24 | # ] 25 | 26 | 27 | def legacy(): 28 | ltp = LTP("LTP/legacy") 29 | ltp.add_word("汤姆去") 30 | result = ltp( 31 | ["他叫汤姆去拿外衣。", "树上停着一些小鸟。先飞走了19只,又飞走了15只。两次共飞走了多少只小鸟?"], 32 | tasks=["cws", "pos", "ner"], 33 | ) 34 | print(result.cws) 35 | print(result.pos) 36 | print(result.ner) 37 | 38 | 39 | def neural(): 40 | ltp = LTP("LTP/tiny") 41 | 42 | if torch.cuda.is_available(): 43 | ltp = ltp.to("cuda") 44 | 45 | ltp.add_word("汤姆去") 46 | 47 | # 未分词的文本 48 | result = 
ltp.pipeline( 49 | ["他叫汤姆去拿外衣。", "韓語:한국의 단오", "树上停着一些小鸟。先飞走了19只,又飞走了15只。两次共飞走了多少只小鸟?"], 50 | tasks=["cws", "pos", "ner", "srl", "dep", "sdp"], 51 | ) 52 | print(result.cws) 53 | print(result.pos) 54 | print(result.ner) 55 | print(result.srl) 56 | print(result.dep) 57 | print(result.sdp) 58 | 59 | # 已经分词的文本 60 | result = ltp.pipeline( 61 | [["他", "叫", "汤姆", "去", "拿", "外衣", "。"], ["가을동", "叫", "1993", "年", "的", "Ameri", "·"]], 62 | # 注意这里移除了 "cws" 任务 63 | tasks=["pos", "ner", "srl", "dep", "sdp"], 64 | ) 65 | print(result.pos) 66 | print(result.ner) 67 | print(result.srl) 68 | print(result.dep) 69 | print(result.sdp) 70 | 71 | 72 | def main(): 73 | stn_split() 74 | legacy() 75 | neural() 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /python/interface/ltp/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "4.2.13" 2 | 3 | from ltp_extension.algorithms import StnSplit 4 | 5 | from .interface import LTP 6 | 7 | __all__ = [ 8 | "LTP", 9 | "StnSplit", 10 | "__version__", 11 | ] 12 | -------------------------------------------------------------------------------- /python/interface/ltp/generic.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # Author: Yunlong Feng 3 | 4 | from collections import OrderedDict 5 | from dataclasses import dataclass, fields 6 | from typing import Any, List, Optional, Tuple, Union 7 | 8 | 9 | class ModelOutput(OrderedDict): 10 | """Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by 11 | integer or slice (like a tuple) or strings (like a dictionary) that will ignore the `None` 12 | attributes. Otherwise behaves like a regular python dictionary. 13 | 14 | 15 | 16 | You can't unpack a `ModelOutput` directly. Use the [`~utils.ModelOutput.to_tuple`] method to convert it to a tuple 17 | before. 
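A minimal illustrative example (an editorial sketch using the `LTPOutput` subclass defined below; only `cws` and `pos` are set, so integer indexing and `to_tuple` skip the rest):

```python
out = LTPOutput(cws=[["他"]], pos=[["r"]])
assert out["cws"] is out.cws                 # key access and attribute access agree
assert out[0] == out["cws"]                  # integer indexing ignores unset (None) fields
assert out.to_tuple() == (out.cws, out.pos)  # ner/srl/dep/... are omitted
```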
18 | 19 | 20 | """ 21 | 22 | def __post_init__(self): 23 | class_fields = fields(self) 24 | 25 | # Safety and consistency checks 26 | if not len(class_fields): 27 | raise ValueError(f"{self.__class__.__name__} has no fields.") 28 | if not all(field.default is None for field in class_fields[1:]): 29 | raise ValueError(f"{self.__class__.__name__} should not have more than one required field.") 30 | 31 | for field in class_fields: 32 | v = getattr(self, field.name) 33 | if v is not None: 34 | self[field.name] = v 35 | 36 | def __delitem__(self, *args, **kwargs): 37 | raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") 38 | 39 | def setdefault(self, *args, **kwargs): 40 | raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") 41 | 42 | def pop(self, *args, **kwargs): 43 | raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") 44 | 45 | def update(self, *args, **kwargs): 46 | raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") 47 | 48 | def __getitem__(self, k): 49 | if isinstance(k, str): 50 | inner_dict = {k: v for (k, v) in self.items()} 51 | return inner_dict[k] 52 | else: 53 | return self.to_tuple()[k] 54 | 55 | def __setattr__(self, name, value): 56 | if name in self.keys() and value is not None: 57 | # Don't call self.__setitem__ to avoid recursion errors 58 | super().__setitem__(name, value) 59 | super().__setattr__(name, value) 60 | 61 | def __setitem__(self, key, value): 62 | # Will raise a KeyException if needed 63 | super().__setitem__(key, value) 64 | # Don't call self.__setattr__ to avoid recursion errors 65 | super().__setattr__(key, value) 66 | 67 | def to_tuple(self) -> Tuple[Any]: 68 | """Convert self to a tuple containing all the attributes/keys that are not `None`.""" 69 | return tuple(self[k] for k in self.keys()) 70 | 71 | 72 | @dataclass 73 | class LTPOutput(ModelOutput): 74 | cws: Optional[Union[List[str], List[List[str]]]] = None 75 | pos: Optional[Union[List[str], List[List[str]]]] = None 76 | ner: Optional[Union[List[str], List[List[str]]]] = None 77 | srl: Optional[Union[List[str], List[List[str]]]] = None 78 | dep: Optional[Union[List[str], List[List[str]]]] = None 79 | sdp: Optional[Union[List[str], List[List[str]]]] = None 80 | sdpg: Optional[Union[List[str], List[List[str]]]] = None 81 | -------------------------------------------------------------------------------- /python/interface/ltp/module.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | 3 | import torch 4 | from torch.nn import Module 5 | 6 | 7 | class BaseModule(Module): 8 | __jit_unused_properties__ = ["device", "dtype"] 9 | 10 | def __init__(self): 11 | super().__init__() 12 | self._dtype = torch.get_default_dtype() 13 | self._device = torch.device("cpu") 14 | 15 | @property 16 | def dtype(self) -> Union[str, torch.dtype]: 17 | return self._dtype 18 | 19 | @dtype.setter 20 | def dtype(self, new_dtype: Union[str, torch.dtype]): 21 | # necessary to avoid infinite recursion 22 | raise RuntimeError("Cannot set the dtype explicitly. Please use module.to(new_dtype).") 23 | 24 | @property 25 | def device(self) -> Union[str, torch.device]: 26 | return self._device 27 | 28 | @device.setter 29 | def device(self, new_device: Union[str, torch.device]): 30 | raise RuntimeError("Cannot set the device explicitly. 
Please use module.to(new_device).") 31 | 32 | def to(self, *args, **kwargs) -> Module: 33 | out = torch._C._nn._parse_to(*args, **kwargs) 34 | self.__update_properties(device=out[0], dtype=out[1]) 35 | return super().to(*args, **kwargs) 36 | 37 | def cuda(self, device: Optional[int] = None) -> Module: 38 | self.__update_properties(device=torch.device("cuda", index=device)) 39 | return super().cuda(device=device) 40 | 41 | def cpu(self) -> Module: 42 | self.__update_properties(device=torch.device("cpu")) 43 | return super().cpu() 44 | 45 | def type(self, dst_type: Union[str, torch.dtype]) -> Module: 46 | self.__update_properties(dtype=dst_type) 47 | return super().type(dst_type=dst_type) 48 | 49 | def float(self) -> Module: 50 | self.__update_properties(dtype=torch.float) 51 | return super().float() 52 | 53 | def double(self) -> Module: 54 | self.__update_properties(dtype=torch.double) 55 | return super().double() 56 | 57 | def half(self) -> Module: 58 | self.__update_properties(dtype=torch.half) 59 | return super().half() 60 | 61 | def __update_properties(self, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None): 62 | def apply_fn(module): 63 | if not isinstance(module, BaseModule): 64 | return 65 | if device is not None: 66 | module._device = device 67 | if dtype is not None: 68 | module._dtype = dtype 69 | 70 | self.apply(apply_fn) 71 | -------------------------------------------------------------------------------- /python/interface/ltp/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def get_pylogger(name=__name__) -> logging.Logger: 5 | logger = logging.getLogger(name) 6 | return logger 7 | -------------------------------------------------------------------------------- /python/interface/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pytest.ini_options] 2 | addopts = [ 3 | "--color=yes", 4 | "--durations=0", 5 | "--strict-markers", 6 | "--doctest-modules", 7 | ] 8 | filterwarnings = [ 9 | "ignore::DeprecationWarning", 10 | "ignore::UserWarning", 11 | ] 12 | log_cli = "True" 13 | markers = [ 14 | "slow: slow tests", 15 | ] 16 | minversion = "6.0" 17 | testpaths = "tests/" 18 | 19 | [tool.coverage.report] 20 | exclude_lines = [ 21 | "pragma: nocover", 22 | "raise NotImplementedError", 23 | "raise NotImplementedError()", 24 | "if __name__ == .__main__.:", 25 | ] 26 | 27 | [tool.ruff] 28 | # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. 29 | select = ["E", "F"] 30 | ignore = [] 31 | 32 | # Allow autofix for all enabled rules (when `--fix`) is provided. 33 | fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] 34 | unfixable = [] 35 | 36 | # Exclude a variety of commonly ignored directories. 37 | exclude = [ 38 | ".bzr", 39 | ".direnv", 40 | ".eggs", 41 | ".git", 42 | ".git-rewrite", 43 | ".hg", 44 | ".mypy_cache", 45 | ".nox", 46 | ".pants.d", 47 | ".pytype", 48 | ".ruff_cache", 49 | ".svn", 50 | ".tox", 51 | ".venv", 52 | "__pypackages__", 53 | "_build", 54 | "buck-out", 55 | "build", 56 | "dist", 57 | "node_modules", 58 | "venv", 59 | "docs", 60 | "examples" 61 | ] 62 | per-file-ignores = { } 63 | 64 | # Same as Black. 
65 | line-length = 120 66 | 67 | # Allow unused variables when underscore-prefixed. 68 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 69 | 70 | # Assume Python 3.10. 71 | target-version = "py310" 72 | 73 | [tool.ruff.mccabe] 74 | # Unlike Flake8, default to a complexity level of 10. 75 | max-complexity = 10 -------------------------------------------------------------------------------- /python/interface/requirements.txt: -------------------------------------------------------------------------------- 1 | ltp_core>=0.1.0 2 | ltp_extension>=0.1.0 3 | -------------------------------------------------------------------------------- /python/interface/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | project_dir, _ = os.path.split(__file__) 6 | 7 | with open(os.path.join(project_dir, "README.md"), encoding="utf-8") as fh: 8 | long_description = fh.read() 9 | 10 | setup( 11 | name="ltp", 12 | version="4.2.14", 13 | author="Yunlong Feng", 14 | author_email="ylfeng@ir.hit.edu.cn", 15 | url="https://github.com/HIT-SCIR/ltp", 16 | description="Language Technology Platform", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | install_requires=[ 20 | "ltp_core>=0.1.3", 21 | "ltp_extension>=0.1.9", 22 | "huggingface_hub>=0.8.0", 23 | ], 24 | classifiers=[ 25 | "Development Status :: 1 - Planning", 26 | "Operating System :: OS Independent", 27 | "Intended Audience :: Developers", 28 | "Programming Language :: Python :: 3.6", 29 | "Programming Language :: Python :: 3.7", 30 | "Programming Language :: Python :: 3.8", 31 | "Programming Language :: Python :: 3.9", 32 | "Programming Language :: Python :: 3.10", 33 | "Programming Language :: Python :: 3.11", 34 | "Topic :: Software Development :: Libraries", 35 | ], 36 | packages=find_packages(), 37 | python_requires=">=3.6, <4", 38 | zip_safe=True, 39 | ) 40 | -------------------------------------------------------------------------------- /python/interface/utils/upload_models.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | from huggingface_hub import CommitOperationAdd, HfApi 4 | 5 | 6 | def upload_model(model_dir, repo_id): 7 | api = HfApi() 8 | operations = [ 9 | CommitOperationAdd( 10 | path_in_repo="config.json", 11 | path_or_fileobj=os.path.join(model_dir, "config.json"), 12 | ), 13 | CommitOperationAdd( 14 | path_in_repo="pytorch_model.bin", 15 | path_or_fileobj=os.path.join(model_dir, "pytorch_model.bin"), 16 | ), 17 | CommitOperationAdd( 18 | path_in_repo="vocab.txt", 19 | path_or_fileobj=os.path.join(model_dir, "vocab.txt"), 20 | ), 21 | ] 22 | 23 | api.create_commit( 24 | repo_id=repo_id, 25 | operations=operations, 26 | commit_message="Uploaded model", 27 | ) 28 | 29 | 30 | def upload_tokenizer(model_dir, repo_id): 31 | api = HfApi() 32 | operations = [ 33 | CommitOperationAdd( 34 | path_in_repo="added_tokens.json", 35 | path_or_fileobj=os.path.join(model_dir, "added_tokens.json"), 36 | ), 37 | CommitOperationAdd( 38 | path_in_repo="special_tokens_map.json", 39 | path_or_fileobj=os.path.join(model_dir, "special_tokens_map.json"), 40 | ), 41 | CommitOperationAdd( 42 | path_in_repo="tokenizer.json", 43 | path_or_fileobj=os.path.join(model_dir, "tokenizer.json"), 44 | ), 45 | CommitOperationAdd( 46 | path_in_repo="tokenizer_config.json", 47 | path_or_fileobj=os.path.join(model_dir, "tokenizer_config.json"), 48 | 
), 49 | ] 50 | 51 | api.create_commit( 52 | repo_id=repo_id, 53 | operations=operations, 54 | commit_message="Uploaded tokenizer", 55 | ) 56 | 57 | 58 | def upload_readme(model_dir, repo_id): 59 | api = HfApi() 60 | operations = [ 61 | CommitOperationAdd( 62 | path_in_repo="README.md", 63 | path_or_fileobj="README.md", 64 | ), 65 | ] 66 | 67 | api.create_commit( 68 | repo_id=repo_id, 69 | operations=operations, 70 | commit_message="Uploaded model", 71 | ) 72 | 73 | 74 | def main(): 75 | for model in ["legacy", "tiny", "small", "base", "base1", "base2"]: 76 | upload_readme(None, repo_id=f"LTP/{model}") 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /rust/ltp-cffi/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ltp-cffi" 3 | version = "0.1.0" 4 | edition = "2021" 5 | authors = ["ylfeng "] 6 | description = "The C bindings for LTP." 7 | homepage = "https://github.com/HIT-SCIR/ltp" 8 | repository = "https://github.com/HIT-SCIR/ltp" 9 | keywords = ["ltp", "nlp"] 10 | exclude = [".github"] 11 | readme = "README.md" 12 | license-file = "LICENSE" 13 | 14 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 15 | 16 | [lib] 17 | name = "ltp" 18 | path = "src/lib.rs" 19 | crate-type = ["cdylib", "staticlib"] 20 | 21 | [dependencies] 22 | rayon = { version = "1.5" } 23 | ltp = { version = "*", path = "../ltp", features = ["serialization", "parallel"] } 24 | mimalloc = { version = "0.1", default-features = false, optional = true } 25 | 26 | [features] 27 | malloc = ["mimalloc"] 28 | secure = ["mimalloc/secure"] 29 | -------------------------------------------------------------------------------- /rust/ltp-cffi/LICENSE: -------------------------------------------------------------------------------- 1 | 1. 语言技术平台面向国内外大学、中科院各研究所以及个人研究者免费开放源代码,但如上述机构和个人将该平台用于商业目的(如企业合作项目等)则需要付费。 2 | 2. 除上述机构以外的企事业单位,如申请使用该平台,需付费。 3 | 3. 凡涉及付费问题,请发邮件到 car@ir.hit.edu.cn 洽商。 4 | 4. 如果您在 LTP 基础上发表论文或取得科研成果,请您在发表论文和申报成果时声明“使用了哈工大社会计算与信息检索研究中心研制的语言技术平台(LTP)”. 5 | 同时,发信给car@ir.hit.edu.cn,说明发表论文或申报成果的题目、出处等。 6 | -------------------------------------------------------------------------------- /rust/ltp-cffi/README.md: -------------------------------------------------------------------------------- 1 | # LTP CFFI 2 | 3 | The C bindings for `LTP for Rust`. 4 | -------------------------------------------------------------------------------- /rust/ltp-cffi/cbindgen.toml: -------------------------------------------------------------------------------- 1 | # This is a template cbindgen.toml file with all of the default values. 2 | # Some values are commented out because their absence is the real default. 3 | # 4 | # See https://github.com/eqrion/cbindgen/blob/master/docs.md#cbindgentoml 5 | # for detailed documentation of every option here. 6 | language = "C" 7 | 8 | ############## Options for Wrapping the Contents of the Header ################# 9 | 10 | # header = "/* Text to put at the beginning of the generated file. Probably a license. */" 11 | # trailer = "/* Text to put at the end of the generated file */" 12 | include_guard = "LTP_BINDINGS_DEFINE" 13 | pragma_once = true 14 | autogen_warning = "/* Warning, this file is autogenerated by cbindgen. Don't modify this manually. 
*/" 15 | include_version = false 16 | # namespace = "ltp" 17 | namespaces = [] 18 | using_namespaces = [] 19 | sys_includes = [] 20 | includes = [] 21 | no_includes = false 22 | cpp_compat = true 23 | after_includes = "" 24 | 25 | ############################ Code Style Options ################################ 26 | 27 | braces = "SameLine" 28 | line_length = 100 29 | tab_width = 2 30 | documentation = true 31 | documentation_style = "auto" 32 | documentation_length = "full" 33 | line_endings = "LF" # also "CR", "CRLF", "Native" 34 | 35 | ############################# Codegen Options ################################## 36 | 37 | style = "both" 38 | sort_by = "Name" # default for `fn.sort_by` and `const.sort_by` 39 | usize_is_size_t = true 40 | 41 | [defines] 42 | # "target_os = freebsd" = "DEFINE_FREEBSD" 43 | # "feature = serde" = "DEFINE_SERDE" 44 | 45 | [export] 46 | include = [] 47 | exclude = [] 48 | # prefix = "CAPI_" 49 | item_types = [] 50 | renaming_overrides_prefixing = false 51 | 52 | [export.rename] 53 | 54 | [export.body] 55 | 56 | [export.mangle] 57 | 58 | [fn] 59 | rename_args = "None" 60 | # must_use = "MUST_USE_FUNC" 61 | # no_return = "NO_RETURN" 62 | # prefix = "START_FUNC" 63 | # postfix = "END_FUNC" 64 | args = "auto" 65 | sort_by = "Name" 66 | 67 | [struct] 68 | rename_fields = "None" 69 | # must_use = "MUST_USE_STRUCT" 70 | derive_constructor = false 71 | derive_eq = false 72 | derive_neq = false 73 | derive_lt = false 74 | derive_lte = false 75 | derive_gt = false 76 | derive_gte = false 77 | 78 | [enum] 79 | rename_variants = "None" 80 | # must_use = "MUST_USE_ENUM" 81 | add_sentinel = false 82 | prefix_with_name = false 83 | derive_helper_methods = false 84 | derive_const_casts = false 85 | derive_mut_casts = false 86 | # cast_assert_name = "ASSERT" 87 | derive_tagged_enum_destructor = false 88 | derive_tagged_enum_copy_constructor = false 89 | enum_class = true 90 | private_default_tagged_enum_constructor = false 91 | 92 | [const] 93 | allow_static_const = true 94 | allow_constexpr = false 95 | sort_by = "Name" 96 | 97 | [macro_expansion] 98 | bitflags = false 99 | 100 | ############## Options for How Your Rust library Should Be Parsed ############## 101 | 102 | [parse] 103 | parse_deps = false 104 | # include = [] 105 | exclude = [] 106 | clean = false 107 | extra_bindings = [] 108 | 109 | [parse.expand] 110 | crates = [] 111 | all_features = false 112 | default_features = true 113 | features = [] 114 | -------------------------------------------------------------------------------- /rust/ltp-cffi/examples/example.c: -------------------------------------------------------------------------------- 1 | // 2 | // Created by 冯云龙 on 2022/8/12. 
3 | // 4 | #include <assert.h> 5 | #include <stdio.h> 6 | #include <stdlib.h> 7 | #include <string.h> 8 | #include "ltp.h" 9 | 10 | #define MAX_WORD_LEN (10) 11 | 12 | struct State { 13 | char **results; 14 | size_t *lengths; 15 | }; 16 | 17 | void store_results(struct State *state, const uint8_t *word, size_t word_len, size_t idx, size_t length) { 18 | state->results[idx] = malloc(word_len + 1); 19 | state->lengths[idx] = word_len; 20 | 21 | strncpy(state->results[idx], (const char *) word, word_len); 22 | state->results[idx][word_len] = '\0'; 23 | 24 | if (idx < length - 1) { 25 | printf("%s ", state->results[idx]); 26 | } else { 27 | printf("%s\n", state->results[idx]); 28 | } 29 | 30 | } 31 | int main() { 32 | const char *cws_model_path = "data/legacy-models/cws_model.bin"; 33 | const char *pos_model_path = "data/legacy-models/pos_model.bin"; 34 | const char *ner_model_path = "data/legacy-models/ner_model.bin"; 35 | Model *cws_model = NULL; 36 | cws_model = model_load(cws_model_path); 37 | Model *pos_model = NULL; 38 | pos_model = model_load(pos_model_path); 39 | Model *ner_model = NULL; 40 | ner_model = model_load(ner_model_path); 41 | 42 | const char *sentence = "他叫汤姆去拿外衣"; 43 | size_t word_length[MAX_WORD_LEN] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 44 | size_t pos_length[MAX_WORD_LEN] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 45 | size_t ner_length[MAX_WORD_LEN] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 46 | char *words[MAX_WORD_LEN] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}; 47 | char *pos[MAX_WORD_LEN] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}; 48 | char *ner[MAX_WORD_LEN] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}; 49 | 50 | struct State word_state = {words, word_length}; 51 | struct State pos_state = {pos, pos_length}; 52 | struct State ner_state = {ner, ner_length}; 53 | 54 | Callback cws_callback = {&word_state, store_results}; 55 | size_t length = model_cws_predict(cws_model, sentence, strlen(sentence), cws_callback); 56 | 57 | Callback pos_callback = {&pos_state, store_results}; 58 | model_pos_predict(pos_model, words, word_length, length, pos_callback); 59 | 60 | Callback ner_callback = {&ner_state, store_results}; 61 | model_ner_predict(ner_model, words, word_length, pos, pos_length, length, ner_callback); 62 | 63 | for (size_t i = 0; i < MAX_WORD_LEN; i++) { 64 | if (words[i] != NULL) { free(words[i]); words[i]=NULL;} 65 | if (pos[i] != NULL) { free(pos[i]); pos[i]=NULL;} 66 | if (ner[i] != NULL) { free(ner[i]); ner[i]=NULL;} 67 | } 68 | 69 | model_release(&cws_model); 70 | model_release(&pos_model); 71 | model_release(&ner_model); 72 | 73 | assert(cws_model == NULL); 74 | assert(pos_model == NULL); 75 | assert(ner_model == NULL); 76 | 77 | return 0; 78 | } 79 |
-------------------------------------------------------------------------------- /rust/ltp-cffi/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "mimalloc")] 2 | use mimalloc::MiMalloc; 3 | 4 | #[cfg(feature = "mimalloc")] 5 | #[global_allocator] 6 | static GLOBAL: MiMalloc = MiMalloc; 7 | 8 | use std::ffi::c_void; 9 | 10 | pub mod model; 11 | pub mod stnsplit; 12 | 13 | /// The LTP CFFI API.
14 | /// the call args: 15 | /// state: your own state pointer, passed back unchanged on every call 16 | /// tag: the predicted tag 17 | /// tag_len: the length of tag 18 | /// tag_index: the index of the current prediction 19 | /// tag_total: the total number of predictions 20 | #[repr(C)] 21 | pub struct Callback { 22 | pub state: *mut c_void, 23 | // state, char*, char_len, current idx, max_num 24 | pub call: extern "C" fn(*mut c_void, *const u8, usize, usize, usize), 25 | } 26 | 27 | /// The LTP CFFI API. 28 | /// the call args: 29 | /// state: your own state pointer, passed back unchanged on every call 30 | /// tag: the predicted tag 31 | /// tag_len: the length of tag 32 | /// tag_index: the index of the current prediction 33 | /// tag_total: the total number of predictions 34 | /// batch_index: the index of the current item in the batch 35 | /// batch_total: the number of items in the batch 36 | #[repr(C)] 37 | pub struct BatchCallback { 38 | pub state: *mut c_void, 39 | // state, char*, char_len, tag idx, tag num, batch index, batch num 40 | pub call: extern "C" fn(*mut c_void, *const u8, usize, usize, usize, usize, usize), 41 | } 42 |
-------------------------------------------------------------------------------- /rust/ltp-cffi/src/stnsplit.rs: -------------------------------------------------------------------------------- 1 | use crate::Callback; 2 | use ltp::utils::stnsplit::{ 3 | stn_split as r_stn_split, stn_split_with_options as r_stn_split_with_options, SplitOptions, 4 | }; 5 | use std::slice; 6 | 7 | #[no_mangle] 8 | #[allow(clippy::not_unsafe_ptr_arg_deref)] 9 | pub extern "C" fn stn_split(text: *const u8, text_len: usize, callback: Callback) -> usize { 10 | let text = unsafe { std::str::from_utf8_unchecked(slice::from_raw_parts(text, text_len)) }; 11 | let sentences = r_stn_split(text); 12 | for (idx, sentence) in sentences.iter().enumerate() { 13 | (callback.call)( 14 | callback.state, 15 | sentence.as_ptr(), // the sentence slice, not the whole input text 16 | sentence.len(), 17 | idx, 18 | sentences.len(), // total number of sentences 19 | ); 20 | } 21 | sentences.len() 22 | } 23 | 24 | #[no_mangle] 25 | #[allow(clippy::not_unsafe_ptr_arg_deref)] 26 | pub extern "C" fn stn_split_with_options( 27 | text: *const u8, 28 | text_len: usize, 29 | callback: Callback, 30 | use_zh: bool, 31 | use_en: bool, 32 | bracket_as_entity: bool, 33 | zh_quote_as_entity: bool, 34 | en_quote_as_entity: bool, 35 | ) -> usize { 36 | let text = unsafe { std::str::from_utf8_unchecked(slice::from_raw_parts(text, text_len)) }; 37 | let options = SplitOptions { 38 | use_zh, 39 | use_en, 40 | bracket_as_entity, 41 | zh_quote_as_entity, 42 | en_quote_as_entity, 43 | }; 44 | 45 | let sentences = r_stn_split_with_options(text, &options); 46 | for (idx, sentence) in sentences.iter().enumerate() { 47 | (callback.call)( 48 | callback.state, 49 | sentence.as_ptr(), 50 | sentence.len(), 51 | idx, 52 | sentences.len(), // total number of sentences 53 | ); 54 | } 55 | sentences.len() 56 | } 57 |
-------------------------------------------------------------------------------- /rust/ltp/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ltp" 3 | version = "0.1.9" 4 | edition = "2021" 5 | authors = ["ylfeng "] 6 | description = "Language Technology Platform For Rust."
7 | homepage = "https://github.com/HIT-SCIR/ltp" 8 | repository = "https://github.com/HIT-SCIR/ltp" 9 | keywords = ["ltp", "nlp"] 10 | exclude = [".github"] 11 | readme = "README.md" 12 | license-file = "LICENSE" 13 | 14 | [[example]] 15 | name = "cws" 16 | path = "examples/cws.rs" 17 | required-features = ["serialization", "parallel"] 18 | 19 | [[example]] 20 | name = "pos" 21 | path = "examples/pos.rs" 22 | required-features = ["serialization", "parallel"] 23 | 24 | [[example]] 25 | name = "ner" 26 | path = "examples/ner.rs" 27 | required-features = ["serialization", "parallel"] 28 | 29 | [[example]] 30 | name = "simple" 31 | path = "examples/simple.rs" 32 | required-features = ["serialization", "parallel"] 33 | 34 | [dependencies] 35 | anyhow = "1" 36 | num-traits = "0.2" 37 | itertools = "0.14" 38 | 39 | cedarwood = "0.4" 40 | 41 | # 断句避免过多内存申请 42 | smallvec = { version = "1" } 43 | # 数据集 shuffle 44 | rand = { version = "0.9" } 45 | # 特征裁剪 46 | binary-heap-plus = { version = "0.5" } 47 | 48 | # 并行 49 | rayon = { version = "1.5", optional = true } 50 | 51 | # 序列化 52 | serde = { version = "1.0", features = ["derive"], optional = true } 53 | serde_json = { version = "1.0", optional = true } 54 | apache-avro = { version = "0.18.0", optional = true } 55 | 56 | # Todo: Nocopy Serialize 更快地加载速度 57 | compact_str = { version = "0.9", optional = true } 58 | rkyv = { version = "0.8", optional = true } 59 | 60 | [features] 61 | default = [] 62 | char-type = [] 63 | cross-char = [] 64 | near-char-type = [] 65 | parallel = ["rayon"] 66 | serialization = ["serde", "serde_json", "apache-avro"] 67 | 68 | [dev-dependencies] 69 | clap = { version = "4", features = ["derive"] } 70 | 71 | ndarray = "0.16" 72 | ndarray-npy = { version = "0.9", features = ["npz"] } 73 | -------------------------------------------------------------------------------- /rust/ltp/LICENSE: -------------------------------------------------------------------------------- 1 | 1. 语言技术平台面向国内外大学、中科院各研究所以及个人研究者免费开放源代码,但如上述机构和个人将该平台用于商业目的(如企业合作项目等)则需要付费。 2 | 2. 除上述机构以外的企事业单位,如申请使用该平台,需付费。 3 | 3. 凡涉及付费问题,请发邮件到 car@ir.hit.edu.cn 洽商。 4 | 4. 如果您在 LTP 基础上发表论文或取得科研成果,请您在发表论文和申报成果时声明“使用了哈工大社会计算与信息检索研究中心研制的语言技术平台(LTP)”. 
5 | 同时,发信给car@ir.hit.edu.cn,说明发表论文或申报成果的题目、出处等。 6 |
-------------------------------------------------------------------------------- /rust/ltp/examples/simple.rs: -------------------------------------------------------------------------------- 1 | use itertools::multizip; 2 | use ltp::{CWSModel, Codec, Format, ModelSerde, NERModel, POSModel}; 3 | use std::fs::File; 4 | 5 | fn main() -> Result<(), Box<dyn std::error::Error>> { 6 | let file = File::open("data/legacy-models/cws_model.bin")?; 7 | let cws: CWSModel = ModelSerde::load(file, Format::AVRO(Codec::Deflate))?; 8 | let file = File::open("data/legacy-models/pos_model.bin")?; 9 | let pos: POSModel = ModelSerde::load(file, Format::AVRO(Codec::Deflate))?; 10 | let file = File::open("data/legacy-models/ner_model.bin")?; 11 | let ner: NERModel = ModelSerde::load(file, Format::AVRO(Codec::Deflate))?; 12 | 13 | let words = cws.predict("他叫汤姆去拿外衣。")?; 14 | let pos = pos.predict(&words)?; 15 | let ner = ner.predict((&words, &pos))?; 16 | 17 | for (w, p, n) in multizip((words, pos, ner)) { 18 | println!("{}/{}/{}", w, p, n); 19 | } 20 | 21 | Ok(()) 22 | } 23 |
-------------------------------------------------------------------------------- /rust/ltp/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod perceptron; 2 | pub mod utils; 3 | 4 | pub use perceptron::{ 5 | Algorithm, CWSDefinition, NERDefinition, POSDefinition, PaMode, Perceptron, Trainer, 6 | }; 7 | #[cfg(feature = "serialization")] 8 | pub use perceptron::{Codec, Format, ModelSerde, Reader, SerdeModel, SerdeCWSModel, SerdePOSModel, SerdeNERModel}; 9 | 10 | #[cfg(feature = "serialization")] 11 | pub type CWSModel = SerdeCWSModel<f64>; 12 | #[cfg(feature = "serialization")] 13 | pub type POSModel = SerdePOSModel<f64>; 14 | #[cfg(feature = "serialization")] 15 | pub type NERModel = SerdeNERModel<f64>; 16 | 17 | 18 |
-------------------------------------------------------------------------------- /rust/ltp/src/perceptron/definition/mod.rs: -------------------------------------------------------------------------------- 1 | mod cws; 2 | mod ner; 3 | mod pos; 4 | 5 | use anyhow::Result; 6 | use std::collections::HashSet; 7 | use std::fmt::Debug; 8 | use std::io::Read; 9 | 10 | use crate::utils::get_entities; 11 | use crate::perceptron::Sample; 12 | pub use cws::CWSDefinition; 13 | pub use ner::NERDefinition; 14 | pub use pos::POSDefinition; 15 | 16 | #[macro_export]
17 | macro_rules! buf_feature { 18 | ($dst:expr, $feat:tt, $($arg:tt)*) => { 19 | write!($dst, $($arg)*)?; 20 | $feat.push($dst.len()); 21 | }; 22 | } 23 | 24 | pub trait CommonDefinePredict {} 25 | 26 | impl CommonDefinePredict for POSDefinition {} 27 | 28 | impl CommonDefinePredict for NERDefinition {} 29 | 30 | pub trait GenericItem<'a> { 31 | type Item; 32 | } 33 | 34 | pub trait Definition: Default + Debug + Clone { 35 | type Fragment: ?Sized + for<'any> GenericItem<'any>; 36 | type Prediction: ?Sized + for<'any> GenericItem<'any>; 37 | type RawFeature: ?Sized + for<'any> GenericItem<'any>; 38 | 39 | fn use_viterbi(&self) -> bool { 40 | false 41 | } 42 | 43 | fn labels(&self) -> Vec<String>; 44 | 45 | fn label_num(&self) -> usize; 46 | 47 | fn label_to(&self, label: &str) -> usize; 48 | 49 | fn to_label(&self, index: usize) -> &str; 50 | 51 | #[allow(clippy::type_complexity)] 52 | fn parse_features( 53 | &self, 54 | raw: &<Self::RawFeature as GenericItem>::Item, 55 | ) -> Result<(<Self::Fragment as GenericItem>::Item, Vec<Vec<usize>>)>; 56 | 57 | #[allow(clippy::type_complexity)] 58 | fn parse_features_with_buffer<'a>( 59 | &self, 60 | raw: &<Self::RawFeature as GenericItem>::Item, 61 | buf: &'a mut Vec<u8>, 62 | ) -> Result<(<Self::Fragment as GenericItem<'a>>::Item, Vec<Vec<usize>>)>; 63 | 64 | fn parse_gold_features<R: Read>(&self, reader: R) -> Result<Vec<Sample>>; 65 | 66 | fn to_labels(&self, index: &[usize]) -> Vec<&str> { 67 | index.iter().map(|&p| self.to_label(p)).collect() 68 | } 69 | 70 | fn predict( 71 | &self, 72 | raw: &<Self::RawFeature as GenericItem>::Item, 73 | fragments: &<Self::Fragment as GenericItem>::Item, 74 | preds: &[usize], 75 | ) -> <Self::Prediction as GenericItem>::Item; 76 | 77 | fn evaluate(&self, predicts: &[usize], labels: &[usize]) -> (usize, usize, usize); 78 | 79 | fn evaluate_tags(&self, predicts: &[usize], labels: &[usize]) -> (usize, usize, usize) { 80 | ( 81 | predicts 82 | .iter() 83 | .zip(labels.iter()) 84 | .map(|(p, l)| if p == l { 1usize } else { 0usize }) 85 | .sum::<usize>(), 86 | predicts.len(), 87 | labels.len(), 88 | ) 89 | } 90 | 91 | fn evaluate_entities(&self, predicts: &[usize], labels: &[usize]) -> (usize, usize, usize) { 92 | let predicts = self.to_labels(predicts); 93 | let labels = self.to_labels(labels); 94 | 95 | let predicts: HashSet<_> = get_entities(&predicts).into_iter().collect(); 96 | let labels: HashSet<_> = get_entities(&labels).into_iter().collect(); 97 | 98 | let correct = predicts.intersection(&labels).count(); 99 | (correct, predicts.len(), labels.len()) 100 | } 101 | } 102 | --------------------------------------------------------------------------------
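An illustrative Python sketch (not crate code) of what `evaluate_entities` above computes: both label sequences are decoded into entity spans, and the spans are compared as sets to get the (correct, predicted, gold) counts used for precision and recall. `to_spans` is a hypothetical stand-in for the crate's `get_entities`:

```python
def to_spans(tags):
    """Hypothetical BIO decoder standing in for get_entities."""
    spans, start = [], None
    for i, tag in enumerate(tags + ["O"]):  # sentinel flushes the last span
        if tag.startswith("B-") or tag == "O":
            if start is not None:
                spans.append((tags[start][2:], start, i - 1))
                start = None
            if tag.startswith("B-"):
                start = i
    return set(spans)

pred = to_spans(["B-Nh", "I-Nh", "O"])
gold = to_spans(["B-Nh", "O", "O"])
correct = len(pred & gold)  # set intersection, as in evaluate_entities
print(correct, len(pred), len(gold))  # -> 0 1 1 (spans must match exactly)
```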
/rust/ltp/src/perceptron/feature.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::ops::Deref; 3 | 4 | pub trait TraitFeature { 5 | fn get_with_key(&self, key: &str) -> Option<usize>; 6 | fn get_vector_str(&self, key: &[&str]) -> Vec<usize> { 7 | key.iter() 8 | .map(|k| self.get_with_key(k)) 9 | .into_iter() 10 | .flatten() 11 | .collect() 12 | } 13 | fn get_vector_string(&self, key: &[String]) -> Vec<usize> { 14 | key.iter() 15 | .map(|k| self.get_with_key(k)) 16 | .into_iter() 17 | .flatten() 18 | .collect() 19 | } 20 | } 21 | pub trait TraitFeatureCompressUtils: Default + IntoIterator<Item = (String, usize)> { 22 | fn features(self) -> Vec<(String, usize)>; 23 | } 24 | 25 | impl<T> TraitFeatureCompressUtils for T 26 | where 27 | T: Default + IntoIterator<Item = (String, usize)>, 28 | { 29 | fn features(self) -> Vec<(String, usize)> { 30 | self.into_iter().collect() 31 | } 32 | } 33 | 34 | pub trait TraitFeaturesTrainUtils: Clone { 35 | fn feature_num(&self) -> usize; 36 | fn insert_feature(&mut self, key: String, value: usize); 37 | fn remove_feature(&mut self, key: &str) -> Option<usize>; 38 | fn put_feature(&mut self, key: String, value: usize); 39 | fn del_feature(&mut self, key: &str) -> Option<usize>; 40 | } 41 | 42 | impl<T> TraitFeature for &T 43 | where 44 | T: TraitFeature, 45 | { 46 | fn get_with_key(&self, key: &str) -> Option<usize> { 47 | self.deref().get_with_key(key) 48 | } 49 | } 50 | 51 | impl<T> TraitFeaturesTrainUtils for &T 52 | where 53 | T: TraitFeaturesTrainUtils, 54 | { 55 | fn feature_num(&self) -> usize { 56 | self.deref().feature_num() 57 | } 58 | 59 | fn insert_feature(&mut self, key: String, value: usize) { 60 | self.deref().put_feature(key, value) 61 | } 62 | 63 | fn remove_feature(&mut self, key: &str) -> Option<usize> { 64 | self.deref().del_feature(key) 65 | } 66 | 67 | fn put_feature(&mut self, key: String, value: usize) { 68 | self.deref().insert_feature(key, value) 69 | } 70 | 71 | fn del_feature(&mut self, key: &str) -> Option<usize> { 72 | self.deref().remove_feature(key) 73 | } 74 | } 75 | 76 | // HashMap 77 | 78 | impl TraitFeature for HashMap<String, usize> { 79 | fn get_with_key(&self, key: &str) -> Option<usize> { 80 | self.get(key).copied() 81 | } 82 | } 83 | 84 | impl TraitFeaturesTrainUtils for HashMap<String, usize> { 85 | fn feature_num(&self) -> usize { 86 | self.len() 87 | } 88 | 89 | fn insert_feature(&mut self, key: String, value: usize) { 90 | self.insert(key, value); 91 | } 92 | 93 | fn remove_feature(&mut self, key: &str) -> Option<usize> { 94 | self.remove(key) 95 | } 96 | 97 | fn put_feature(&mut self, key: String, value: usize) { 98 | self.insert(key, value); 99 | } 100 | 101 | fn del_feature(&mut self, key: &str) -> Option<usize> { 102 | self.remove(key) 103 | } 104 | } 105 |
-------------------------------------------------------------------------------- /rust/ltp/src/perceptron/mod.rs: -------------------------------------------------------------------------------- 1 | mod definition; 2 | mod feature; 3 | mod model; 4 | mod parameter; 5 | #[cfg(feature = "serialization")] 6 | mod serialization; 7 | mod trainer; 8 | 9 | pub use definition::{CWSDefinition, Definition, GenericItem, NERDefinition, POSDefinition}; 10 | pub use feature::{TraitFeature, TraitFeatureCompressUtils, TraitFeaturesTrainUtils}; 11 | pub use model::{PaMode, Perceptron}; 12 | pub use parameter::{ 13 | TraitParameter, TraitParameterStorage, TraitParameterStorageCompressUtils, 14 | TraitParameterStorageTrainUtils, TraitParameterStorageUtils, 15 | }; 16 | #[cfg(feature = "serialization")] 17 | pub use serialization::{ 18 | schema, Codec, Format, ModelSerde, Reader, Schema, SerdeCWSModel, SerdeModel, SerdeNERModel, 19 | SerdePOSModel, 20 | }; 21 | pub use trainer::{Algorithm, Trainer}; 22 | pub type Sample = (Vec<Vec<usize>>, Vec<usize>);
-------------------------------------------------------------------------------- /rust/ltp/src/perceptron/parameter.rs: -------------------------------------------------------------------------------- 1 | use num_traits::{Float, Num, NumAssignOps}; 2 | use std::ops::{Deref, Index, IndexMut}; 3 | 4 | pub trait TraitParameter: Float + NumAssignOps + Default {} 5 | 6 | impl<T> TraitParameter for T where T: Float + NumAssignOps + Default {} 7 | 8 | pub trait TraitParameterStorageUtils { 9 | fn len(&self) -> usize; 10 | fn is_empty(&self) -> bool { 11 | self.len() == 0 12 | } 13 | } 14 | 15 | impl<T> TraitParameterStorageUtils for &T 16 | where 17 | T: TraitParameterStorageUtils, 18 | { 19 | fn len(&self) -> usize { 20 | self.deref().len() 21 | } 22 | fn is_empty(&self) -> bool { 23 | self.deref().is_empty() 24 | } 25 | } 26 | 27 | pub trait TraitParameterStorage<Param>: 28 | Index<usize, Output = Param> + TraitParameterStorageUtils 29 | where 30 | Param: TraitParameter, 31 | { 32 | } 33 | 34 | impl<T, Param> TraitParameterStorage<Param> for T 35 | where 36 | T: Index<usize, Output = Param> + TraitParameterStorageUtils, 37 | Param: TraitParameter, 38 | { 39 | } 40 | 41 | impl<T> TraitParameterStorageUtils for Vec<T> { 42 | fn len(&self) -> usize { 43 | self.len() 44 | } 45 | fn is_empty(&self) -> bool { 46 | self.is_empty() 47 | } 48 | } 49 | 50 | // 模型训练需要实现的接口 51 | pub trait TraitParameterStorageTrainUtilsInit<Param>: Default { 52 | fn init(value: Param, size: usize) -> Self; 53 | } 54 | impl<Param: Clone> TraitParameterStorageTrainUtilsInit<Param> for Vec<Param> { 55 | fn init(value: Param, size: usize) -> Self { 56 | vec![value; size] 57 | } 58 | } 59 | pub trait TraitParameterStorageTrainUtils<Param>: 60 | Clone 61 | + Index<usize, Output = Param> 62 | + IndexMut<usize, Output = Param> 63 | + TraitParameterStorageTrainUtilsInit<Param> 64 | { 65 | } 66 | impl<T, Param> TraitParameterStorageTrainUtils<Param> for T where 67 | T: Clone 68 | + Index<usize, Output = Param> 69 | + IndexMut<usize, Output = Param> 70 | + TraitParameterStorageTrainUtilsInit<Param> 71 | { 72 | } 73 | 74 | // 模型压缩需要实现的接口 75 | pub trait TraitParameterStorageCompressUtils<Param> { 76 | fn with_capacity(capacity: usize) -> Self; 77 | fn push(&mut self, value: Param); 78 | } 79 | 80 | impl<T> TraitParameterStorageCompressUtils<T> for Vec<T> { 81 | fn with_capacity(capacity: usize) -> Self { 82 | Self::with_capacity(capacity) 83 | } 84 | 85 | fn push(&mut self, value: T) { 86 | self.push(value); 87 | } 88 | } 89 |
-------------------------------------------------------------------------------- /rust/ltp/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod eisner; 2 | pub mod entities; 3 | pub mod hook; 4 | pub mod stnsplit; 5 | pub mod viterbi; 6 | 7 | pub use eisner::eisner; 8 | pub use entities::{drop_get_entities, get_entities}; 9 | pub use stnsplit::{stn_split, stn_split_with_options, SplitOptions}; 10 | pub use viterbi::viterbi_decode_postprocessing; -------------------------------------------------------------------------------- /rust/ltp/src/utils/viterbi.rs: -------------------------------------------------------------------------------- 1 | use num_traits::PrimInt; 2 | 3 | pub fn viterbi_decode_postprocessing<T>( 4 | history: &[T], 5 | last_tags: &[T], 6 | stn_lengths: &[usize], 7 | labels_num: usize, 8 | ) -> Vec<Vec<T>> 9 | where 10 | T: PrimInt, 11 | { 12 | // history 13 | // max_stn_len * stn_num * labels_num 14 | let stn_num: usize = stn_lengths.iter().sum(); 15 | let b_bias = stn_num * labels_num; 16 | let i_bias = labels_num; 17 | 18 | let mut result: Vec<Vec<T>> = Vec::new(); 19 | let mut stn_idx = 0; 20 | for &stn_len in stn_lengths { 21 | for _search_idx in 0..stn_len { 22 | let best_last_tag = last_tags[stn_idx]; 23 | let mut best_tags = vec![best_last_tag]; 24 | 25 | // history 26 | // stn_len * stn_num * labels_num 27 | for search_end in 1..(stn_len) { 28 | // last one has been used 29 | let search_end = (stn_len - 1) - search_end; 30 | let forward_best = *best_tags.last().unwrap(); 31 | let index = 32 | search_end * b_bias + stn_idx * i_bias + forward_best.to_usize().unwrap(); 33 | let last_best = history[index]; 34 | best_tags.push(last_best); 35 | } 36 | best_tags.reverse(); 37 | result.push(best_tags); 38 | stn_idx += 1; 39 | } 40 | } 41 | result 42 | } 43 |
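A minimal Python sketch (editorial illustration, not crate code) of the backtracking implemented above, with `history` flattened exactly as the comments describe — `index = step * (stn_num * labels_num) + stn_idx * labels_num + previous_best_tag`:

```python
def backtrack(history, last_tag, stn_idx, stn_len, stn_num, labels_num):
    b_bias, i_bias = stn_num * labels_num, labels_num
    tags = [last_tag]  # the final tag comes from last_tags
    for step in range(stn_len - 2, -1, -1):  # walk the history backwards
        tags.append(history[step * b_bias + stn_idx * i_bias + tags[-1]])
    return list(reversed(tags))

# toy case: one sequence of length 3, two labels, two recorded steps
history = [0, 0, 1, 0]
print(backtrack(history, last_tag=1, stn_idx=0, stn_len=3, stn_num=1, labels_num=2))
# -> [0, 0, 1]
```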
44 | #[cfg(test)] 45 | mod tests { 46 | use super::viterbi_decode_postprocessing; 47 | use ndarray::{Array1, Array3}; 48 | use ndarray_npy::{NpzReader, ReadNpzError}; 49 | use std::fs::File; 50 | 51 | #[test] 52 | fn test_viterbi() -> Result<(), ReadNpzError> { 53 | let mut npz = NpzReader::new(File::open("test/viterbi.npz").unwrap())?; 54 | let srl_history: Array3<i64> = npz.by_name("srl_history.npy")?; 55 | let srl_last_tags: Array1<i64> = npz.by_name("srl_last_tags.npy")?; 56 | let word_nums: Array1<i64> = npz.by_name("word_nums.npy")?; 57 | let correct: Array1<i64> = npz.by_name("correct.npy")?; 58 | 59 | let label_num = srl_history.dim().2; 60 | let word_nums: Vec<usize> = word_nums.iter().map(|&x| x as usize).collect(); 61 | 62 | let output = viterbi_decode_postprocessing( 63 | srl_history.as_slice().unwrap(), 64 | srl_last_tags.as_slice().unwrap(), 65 | word_nums.as_slice(), 66 | label_num, 67 | ); 68 | 69 | let correct: Vec<i64> = correct.iter().map(|&x| x).collect(); 70 | let output: Vec<i64> = output.iter().flatten().map(|&x| x).collect(); 71 | 72 | assert_eq!(correct, output); 73 | 74 | Ok(()) 75 | } 76 | } 77 |
-------------------------------------------------------------------------------- /rust/ltp/test/eisner.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/rust/ltp/test/eisner.npz -------------------------------------------------------------------------------- /rust/ltp/test/viterbi.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/rust/ltp/test/viterbi.npz
-------------------------------------------------------------------------------- /rust/ltp/vendor/schema/cws.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "cws", 4 | "fields": [ 5 | { 6 | "name": "definition", 7 | "type": "record", 8 | "fields": [] 9 | }, 10 | { 11 | "name": "features", 12 | "type": "map", 13 | "values": "long", 14 | "default": {} 15 | }, 16 | { 17 | "name": "parameters", 18 | "type": "array", 19 | "items": "double", 20 | "default": [] 21 | } 22 | ] 23 | } 24 |
-------------------------------------------------------------------------------- /rust/ltp/vendor/schema/ner.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "ner", 4 | "fields": [ 5 | { 6 | "name": "definition", 7 | "type": "record", 8 | "fields": [ 9 | { 10 | "name": "to_labels", 11 | "type": "array", 12 | "items": "string", 13 | "default": [] 14 | }, 15 | { 16 | "name": "labels_to", 17 | "type": "map", 18 | "values": "long", 19 | "default": {} 20 | } 21 | ] 22 | }, 23 | { 24 | "name": "features", 25 | "type": "map", 26 | "values": "long", 27 | "default": {} 28 | }, 29 | { 30 | "name": "parameters", 31 | "type": "array", 32 | "items": "double", 33 | "default": [] 34 | } 35 | ] 36 | } 37 |
-------------------------------------------------------------------------------- /rust/ltp/vendor/schema/pos.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "pos", 4 | "fields": [ 5 | { 6 | "name": "definition", 7 | "type": "record", 8 | "fields": [ 9 | { 10 | "name": "to_labels", 11 | "type": "array", 12 | "items": "string", 13 | "default": [] 14 | }, 15 | { 16 | "name": "labels_to", 17 | "type": "map", 18 | "values": "long", 19 | "default": {} 20 | } 21 | ] 22 | }, 23 | { 24 | "name": "features", 25 | "type": "map", 26 | "values": "long", 27 | "default": {} 28 | }, 29 | { 30 | "name": "parameters", 31 | "type": "array", 32 | "items": "double", 33 | "default": [] 34 | } 35 | ] 36 | } 37 | --------------------------------------------------------------------------------
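Taken together, the three schemas above show that a serialized perceptron model carries just three things: a label inventory (`definition`), a feature-string-to-index map (`features`), and a flat array of `double` weights (`parameters`). A hedged sketch of inspecting such a file — this assumes the model file is a standard Avro object-container as written by the `apache-avro` crate, and uses the third-party `fastavro` package (not a project dependency) purely for illustration:

```python
from fastavro import reader  # third-party; illustration only

# model path borrowed from the examples above
with open("data/legacy-models/pos_model.bin", "rb") as f:
    for record in reader(f):  # one record per serialized model
        print(record["definition"]["to_labels"][:5])  # e.g. POS tag names
        print(len(record["features"]), "feature keys")
        print(len(record["parameters"]), "weights")
```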