├── .config
    └── config
├── .github
    ├── PULL_REQUEST_TEMPLATE.md
    ├── dependabot.yml
    └── workflows
    │   ├── ltp-core-publish.yml
    │   ├── ltp-extension-publish.yml
    │   ├── ltp-publish.yml
    │   ├── ruff.yaml
    │   └── test.yml
├── .gitignore
├── .ruff.toml
├── CITATION.cff
├── Cargo.toml
├── Makefile
├── README.md
├── appveyor.yml
├── data
    ├── .gitkeep
    └── examples
    │   ├── cws
    │   │   ├── raw.txt
    │   │   ├── test.txt
    │   │   ├── train.txt
    │   │   └── val.txt
    │   ├── ner
    │   │   ├── raw.txt
    │   │   ├── test.txt
    │   │   ├── train.txt
    │   │   ├── val.txt
    │   │   └── vocab.txt
    │   └── pos
    │   │   ├── raw.txt
    │   │   ├── test.txt
    │   │   ├── train.txt
    │   │   ├── val.txt
    │   │   └── vocab.txt
├── python
    ├── core
    │   ├── .env.example
    │   ├── LICENSE
    │   ├── MANIFEST.in
    │   ├── Makefile
    │   ├── README.md
    │   ├── bash
    │   │   ├── eval.sh
    │   │   └── train.sh
    │   ├── configs
    │   │   ├── callbacks
    │   │   │   ├── default.yaml
    │   │   │   ├── early_stopping.yaml
    │   │   │   ├── model_checkpoint.yaml
    │   │   │   ├── model_summary.yaml
    │   │   │   ├── none.yaml
    │   │   │   └── rich_progress_bar.yaml
    │   │   ├── datamodule
    │   │   │   ├── cls_datamodules.yaml
    │   │   │   ├── cws_datamodules.yaml
    │   │   │   ├── dep_datamodules.yaml
    │   │   │   ├── multi_datamodules.yaml
    │   │   │   ├── ner_datamodules.yaml
    │   │   │   ├── pos_datamodules.yaml
    │   │   │   ├── sdp_datamodules.yaml
    │   │   │   └── srl_datamodules.yaml
    │   │   ├── debug
    │   │   │   ├── default.yaml
    │   │   │   ├── fdr.yaml
    │   │   │   ├── limit.yaml
    │   │   │   ├── overfit.yaml
    │   │   │   └── profiler.yaml
    │   │   ├── eval.yaml
    │   │   ├── experiment
    │   │   │   ├── cls.yaml
    │   │   │   ├── cws.yaml
    │   │   │   ├── dep.yaml
    │   │   │   ├── example.yaml
    │   │   │   ├── multi.yaml
    │   │   │   ├── multi_bi.yaml
    │   │   │   ├── ner.yaml
    │   │   │   ├── pos.yaml
    │   │   │   ├── sdp.yaml
    │   │   │   └── srl.yaml
    │   │   ├── extras
    │   │   │   └── default.yaml
    │   │   ├── hparams_search
    │   │   │   └── ltp_optuna.yaml
    │   │   ├── hydra
    │   │   │   └── default.yaml
    │   │   ├── local
    │   │   │   └── .gitkeep
    │   │   ├── logger
    │   │   │   ├── comet.yaml
    │   │   │   ├── csv.yaml
    │   │   │   ├── many_loggers.yaml
    │   │   │   ├── mlflow.yaml
    │   │   │   ├── neptune.yaml
    │   │   │   ├── tensorboard.yaml
    │   │   │   └── wandb.yaml
    │   │   ├── model
    │   │   │   ├── cls_model.yaml
    │   │   │   ├── cws_model.yaml
    │   │   │   ├── dep_model.yaml
    │   │   │   ├── multi_model.yaml
    │   │   │   ├── ner_model.yaml
    │   │   │   ├── pos_model.yaml
    │   │   │   ├── sdp_model.yaml
    │   │   │   └── srl_model.yaml
    │   │   ├── paths
    │   │   │   └── default.yaml
    │   │   ├── train.yaml
    │   │   └── trainer
    │   │   │   ├── cpu.yaml
    │   │   │   ├── ddp.yaml
    │   │   │   ├── ddp_sim.yaml
    │   │   │   ├── default.yaml
    │   │   │   ├── gpu.yaml
    │   │   │   └── mps.yaml
    │   ├── data
    │   │   ├── .gitkeep
    │   │   ├── conllu
    │   │   │   ├── dev.conllu
    │   │   │   ├── test.conllu
    │   │   │   ├── train.conllu
    │   │   │   └── vocabs
    │   │   │   │   ├── deprel.txt
    │   │   │   │   ├── deps.txt
    │   │   │   │   ├── feats.txt
    │   │   │   │   ├── lemma.txt
    │   │   │   │   ├── upos.txt
    │   │   │   │   ├── word.txt
    │   │   │   │   ├── word_char.txt
    │   │   │   │   └── xpos.txt
    │   │   ├── ner
    │   │   │   ├── dev.bio
    │   │   │   ├── test.bio
    │   │   │   ├── train.bio
    │   │   │   └── vocabs
    │   │   │   │   └── bio.txt
    │   │   └── srl
    │   │   │   ├── dev.txt
    │   │   │   ├── test.txt
    │   │   │   ├── train.txt
    │   │   │   └── vocabs
    │   │   │   │   ├── arguments.txt
    │   │   │   │   └── predicate.txt
    │   ├── logs
    │   │   └── .gitkeep
    │   ├── ltp_core
    │   │   ├── __init__.py
    │   │   ├── algorithms
    │   │   │   ├── __init__.py
    │   │   │   ├── eisner.py
    │   │   │   └── get_entities.py
    │   │   ├── datamodules
    │   │   │   ├── __init__.py
    │   │   │   ├── adapters
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── dependency_parsing.py
    │   │   │   │   ├── named_entity_recognition.py
    │   │   │   │   ├── postagger.py
    │   │   │   │   ├── segmention.py
    │   │   │   │   ├── semantic_dependency_parsing.py
    │   │   │   │   ├── semantic_role_labeling.py
    │   │   │   │   └── sentence_classification.py
    │   │   │   ├── components
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── bio.py
    │   │   │   │   ├── conllu.py
    │   │   │   │   └── srl.py
    │   │   │   ├── multi_task_datamodule.py
    │   │   │   ├── task_datamodule.py
    │   │   │   └── utils
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── collate.py
    │   │   │   │   ├── datasets.py
    │   │   │   │   ├── iterator.py
    │   │   │   │   ├── multitask_dataloader.py
    │   │   │   │   └── vocab_helper.py
    │   │   ├── eval.py
    │   │   ├── models
    │   │   │   ├── __init__.py
    │   │   │   ├── components
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── graph.py
    │   │   │   │   ├── sent.py
    │   │   │   │   └── token.py
    │   │   │   ├── criterion
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── graph.py
    │   │   │   │   ├── sent.py
    │   │   │   │   └── token.py
    │   │   │   ├── functional
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── distill.py
    │   │   │   │   ├── eisner.py
    │   │   │   │   └── multilabel_categorical_crossentropy.py
    │   │   │   ├── lit_model.py
    │   │   │   ├── ltp_model.py
    │   │   │   ├── metrics
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── graph.py
    │   │   │   │   ├── sent.py
    │   │   │   │   └── token.py
    │   │   │   ├── nn
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── biaffine.py
    │   │   │   │   ├── crf.py
    │   │   │   │   ├── global_pointer.py
    │   │   │   │   ├── mlp.py
    │   │   │   │   └── relative_transformer.py
    │   │   │   ├── optimization
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── layer_lrs.py
    │   │   │   │   └── scheduler.py
    │   │   │   ├── processor
    │   │   │   │   └── __init__.py
    │   │   │   └── utils
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── instantiate.py
    │   │   │   │   └── transformer.py
    │   │   ├── train.py
    │   │   └── utils
    │   │   │   ├── __init__.py
    │   │   │   ├── pylogger.py
    │   │   │   ├── rich_utils.py
    │   │   │   └── utils.py
    │   ├── pyproject.toml
    │   ├── requirements.txt
    │   ├── setup.cfg
    │   ├── setup.py
    │   └── tests
    │   │   ├── __init__.py
    │   │   └── test_crf.py
    ├── extension
    │   ├── Cargo.toml
    │   ├── LICENSE
    │   ├── README.md
    │   ├── examples
    │   │   ├── benchmark.py
    │   │   ├── benchmark2.py
    │   │   └── legacy_train.py
    │   ├── ltp_extension
    │   │   ├── __init__.py
    │   │   ├── algorithms
    │   │   │   ├── __init__.py
    │   │   │   └── algorithms.pyi
    │   │   ├── ltp_extension.pyi
    │   │   └── perceptron
    │   │   │   ├── __init__.py
    │   │   │   └── perceptron.pyi
    │   ├── pyproject.toml
    │   ├── src
    │   │   ├── algorithms.rs
    │   │   ├── hook.rs
    │   │   ├── lib.rs
    │   │   ├── perceptron
    │   │   │   ├── alg.rs
    │   │   │   ├── com.rs
    │   │   │   ├── mod.rs
    │   │   │   ├── model.rs
    │   │   │   ├── specialization
    │   │   │   │   ├── cws.rs
    │   │   │   │   ├── mod.rs
    │   │   │   │   ├── ner.rs
    │   │   │   │   └── pos.rs
    │   │   │   └── trainer.rs
    │   │   ├── stnsplit.rs
    │   │   └── utils
    │   │   │   ├── mod.rs
    │   │   │   └── parallelism.rs
    │   └── utils
    │   │   └── stub.py
    └── interface
    │   ├── LICENSE
    │   ├── MANIFEST.in
    │   ├── Makefile
    │   ├── README.md
    │   ├── docs
    │   │   ├── README.md
    │   │   ├── api
    │   │   │   └── ltp.rst
    │   │   ├── appendix.rst
    │   │   ├── conf.py
    │   │   ├── index.rst
    │   │   ├── introduction.rst
    │   │   ├── performance.rst
    │   │   └── quickstart.rst
    │   ├── examples
    │   │   ├── conllu.py
    │   │   ├── issues.py
    │   │   ├── rules.py
    │   │   ├── server.py
    │   │   └── simple.py
    │   ├── ltp
    │   │   ├── __init__.py
    │   │   ├── generic.py
    │   │   ├── interface.py
    │   │   ├── legacy.py
    │   │   ├── mixin.py
    │   │   ├── module.py
    │   │   ├── nerual.py
    │   │   └── utils.py
    │   ├── pyproject.toml
    │   ├── requirements.txt
    │   ├── setup.py
    │   └── utils
    │   │   └── upload_models.py
└── rust
    ├── ltp-cffi
    │   ├── Cargo.toml
    │   ├── LICENSE
    │   ├── README.md
    │   ├── cbindgen.toml
    │   ├── examples
    │   │   └── example.c
    │   └── src
    │   │   ├── lib.rs
    │   │   ├── model.rs
    │   │   └── stnsplit.rs
    └── ltp
    │   ├── Cargo.toml
    │   ├── LICENSE
    │   ├── README.md
    │   ├── examples
    │   │   ├── cws.rs
    │   │   ├── ner.rs
    │   │   ├── pos.rs
    │   │   └── simple.rs
    │   ├── src
    │   │   ├── lib.rs
    │   │   ├── perceptron
    │   │   │   ├── definition
    │   │   │   │   ├── cws.rs
    │   │   │   │   ├── mod.rs
    │   │   │   │   ├── ner.rs
    │   │   │   │   └── pos.rs
    │   │   │   ├── feature.rs
    │   │   │   ├── mod.rs
    │   │   │   ├── model.rs
    │   │   │   ├── parameter.rs
    │   │   │   ├── serialization.rs
    │   │   │   └── trainer.rs
    │   │   └── utils
    │   │   │   ├── eisner.rs
    │   │   │   ├── entities.rs
    │   │   │   ├── hook.rs
    │   │   │   ├── mod.rs
    │   │   │   ├── stnsplit.rs
    │   │   │   └── viterbi.rs
    │   ├── test
    │   │   ├── eisner.npz
    │   │   └── viterbi.npz
    │   └── vendor
    │   │   └── schema
    │   │   │   ├── cws.avsc
    │   │   │   ├── ner.avsc
    │   │   │   └── pos.avsc

/.config/config:
--------------------------------------------------------------------------------
1 | [target.x86_64-apple-darwin]
2 | rustflags = [
3 |     "-C", "link-arg=-undefined",
4 |     "-C", "link-arg=dynamic_lookup",
5 | ]
6 | 
--------------------------------------------------------------------------------
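The two `rustflags` above are the usual macOS linker arrangement for Python extension modules: the dylib leaves the interpreter's symbols undefined and resolves them from the host process at import time. A hedged sketch for inspecting those deferred symbols with the system `nm` tool (the artifact path below is hypothetical):

```python
import subprocess

# List undefined symbols of a built extension dylib; with `-undefined
# dynamic_lookup` the `_Py*` entries stay unresolved until the Python
# interpreter loads the module. The path is a hypothetical build artifact.
out = subprocess.run(
    ["nm", "-u", "target/release/libltp_extension.dylib"],
    capture_output=True, text=True,
).stdout
print([sym for sym in out.split() if sym.startswith("_Py")])
```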
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## What does this PR do?
2 | 
3 | <!--
4 | Please include a summary of the change and which issue is fixed.
5 | Please also include relevant motivation and context.
6 | List any dependencies that are required for this change.
7 | List all the breaking changes introduced by this pull request.
8 | -->
9 | 
10 | Fixes #\<issue_number>
11 | 
12 | ## Before submitting
13 | 
14 | - [ ] Did you make sure **title is self-explanatory** and **the description concisely explains the PR**?
15 | - [ ] Did you make sure your **PR does only one thing**, instead of bundling different changes together?
16 | - [ ] Did you list all the **breaking changes** introduced by this pull request?
17 | - [ ] Did you **test your PR locally** with `pytest` command?
18 | - [ ] Did you **run pre-commit hooks** with `pre-commit run -a` command?
19 | 
20 | ## Did you have fun?
21 | 
22 | Make sure you had fun coding 🙃
23 | 
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 | 
6 | version: 2
7 | updates:
8 |   - package-ecosystem: "pip" # See documentation for possible values
9 |     directory: "/python/core" # Location of package manifests
10 |     schedule:
11 |       interval: "daily"
12 |     ignore:
13 |       - dependency-name: "pytorch-lightning"
14 |         update-types: ["version-update:semver-patch"]
15 |       - dependency-name: "torchmetrics"
16 |         update-types: ["version-update:semver-patch"]
17 |   - package-ecosystem: cargo
18 |     directory: "/"
19 |     schedule:
20 |       interval: monthly
21 |       time: "04:00"
22 |       timezone: Europe/Berlin
23 | 
--------------------------------------------------------------------------------
/.github/workflows/ltp-core-publish.yml:
--------------------------------------------------------------------------------
1 | name: Upload LTP Core Python Package
2 | 
3 | on:
4 |   workflow_dispatch:
5 | 
6 | jobs:
7 |   core:
8 |     runs-on: ubuntu-latest
9 |     strategy:
10 |       matrix:
11 |         target: [x86_64]
12 |     steps:
13 |       - uses: actions/checkout@v2
14 |       - uses: actions/setup-python@v2
15 |         with:
16 |           python-version: 3.9
17 |           architecture: x64
18 |       - name: Build Wheels
19 |         run: |
20 |           pip wheel --no-deps -w dist python/core
21 |       - name: Upload wheels
22 |         uses: actions/upload-artifact@v2
23 |         with:
24 |           name: wheels
25 |           path: dist
26 | 
27 |   release:
28 |     name: Release
29 |     runs-on: ubuntu-latest
30 |     needs: [core]
31 |     steps:
32 |       - uses: actions/download-artifact@v2
33 |         with:
34 |           name: wheels
35 |       - uses: actions/setup-python@v2
36 |         with:
37 |           python-version: 3.9
38 |       - name: Publish LTP Core to PyPi
39 |         env:
40 |           TWINE_USERNAME: __token__
41 |           TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD_CORE }}
42 |         run: |
43 |           pip install --upgrade twine
44 |           twine upload --skip-existing ltp_core-*
45 | 
--------------------------------------------------------------------------------
/.github/workflows/ltp-publish.yml:
--------------------------------------------------------------------------------
1 | name: Upload LTP Python Package
2 | 
3 | on:
4 |   workflow_dispatch:
5 |   release:
6 |     types: [created]
7 | 
8 | jobs:
9 |   interface:
10 |     runs-on: ubuntu-latest
11 |     strategy:
12 |       matrix:
13 |         target: [x86_64]
14 |     steps:
15 |       - uses: actions/checkout@v2
16 |       - uses: actions/setup-python@v2
17 |         with:
18 |           python-version: 3.9
19 |           architecture: x64
20 |       - name: Build Wheels
21 |         run: |
22 |           pip wheel --no-deps -w dist python/interface
23 |       - name: Upload wheels
24 |         uses: actions/upload-artifact@v2
25 |         with:
26 |           name: wheels
27 |           path: dist
28 | 
29 |   release:
30 |     name: Release
31 |     runs-on: ubuntu-latest
32 |     needs: [interface]
33 |     steps:
34 |       - uses: actions/download-artifact@v2
35 |         with:
36 |           name: wheels
37 |       - uses: actions/setup-python@v2
38 |         with:
39 |           python-version: 3.9
40 |       - name: Publish LTP to PyPi
41 |         env:
42 |           TWINE_USERNAME: __token__
43 |           TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
44 |         run: |
45 |           pip install --upgrade twine
46 |           twine upload --skip-existing ltp-*
47 | 
--------------------------------------------------------------------------------
/.github/workflows/ruff.yaml:
--------------------------------------------------------------------------------
1 | # Same as `code-quality-pr.yaml` but triggered on commit to main branch
2 | # and runs on all files (instead of only the changed ones)
3 | 
4 | name: Ruff
5 | on: [ push, pull_request ]
6 | jobs:
7 |   ruff:
8 |     runs-on: ubuntu-latest
9 |     steps:
10 |       - uses: actions/checkout@v3
11 |       - uses: chartboost/ruff-action@v1
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 | 
3 | #on:
4 | #  push:
5 | #    branches: [ main ]
6 | #  pull_request:
7 | #    branches: [ main, "release/*" ]
8 | 
9 | on:
10 |   workflow_dispatch:
11 | 
12 | jobs:
13 |   run_tests:
14 |     runs-on: ${{ matrix.os }}
15 | 
16 |     strategy:
17 |       fail-fast: false
18 |       matrix:
19 |         os: ["ubuntu-latest", "macos-latest", "windows-latest"]
20 |         python-version: ["3.7", "3.8", "3.9", "3.10"]
21 | 
22 |     timeout-minutes: 10
23 | 
24 |     steps:
25 |       - name: Checkout
26 |         uses: actions/checkout@v3
27 | 
28 |       - name: Set up Python ${{ matrix.python-version }}
29 |         uses: actions/setup-python@v3
30 |         with:
31 |           python-version: ${{ matrix.python-version }}
32 | 
33 |       - name: Install dependencies
34 |         run: |
35 |           python -m pip install --upgrade pip
36 |           pip install -r requirements.txt
37 |           pip install pytest
38 |           pip install protobuf==3.20.0
39 | 
40 |       - name: List dependencies
41 |         run: |
42 |           python -m pip list
43 | 
44 |       - name: Run pytest
45 |         run: |
46 |           pytest -v python/core
47 | 
48 |   # upload code coverage report
49 |   code-coverage:
50 |     runs-on: ubuntu-latest
51 | 
52 |     steps:
53 |       - name: Checkout
54 |         uses: actions/checkout@v2
55 | 
56 |       - name: Set up Python 3.10
57 |         uses: actions/setup-python@v2
58 |         with:
59 |           python-version: "3.10"
60 | 
61 |       - name: Install dependencies
62 |         run: |
63 |           python -m pip install --upgrade pip
64 |           pip install -r requirements.txt
65 |           pip install pytest
66 |           pip install pytest-cov[toml]
67 |           pip install protobuf==3.20.0
68 | 
69 |       - name: Run tests and collect coverage
70 |         run: pytest --cov python/core/ltp
71 | 
72 |       - name: Upload coverage to Codecov
73 |         uses: codecov/codecov-action@v3
74 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /data
2 | /target
3 | Cargo.lock
4 | 
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 | 
10 | # C extensions
11 | *.so
12 | 
13 | # Distribution / packaging
14 | .Python
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | pip-wheel-metadata/
28 | share/python-wheels/
29 | *.egg-info/
30 | .installed.cfg
31 | *.egg
32 | MANIFEST
33 | 
34 | # PyInstaller
35 | # Usually these files are written by a python script from a template
36 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
37 | *.manifest
38 | *.spec
39 | 
40 | # Installer logs
41 | pip-log.txt
42 | pip-delete-this-directory.txt
43 | 
44 | # Unit test / coverage reports
45 | htmlcov/
46 | .tox/
47 | .nox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *.cover
54 | *.py,cover
55 | .hypothesis/
56 | .pytest_cache/
57 | 
58 | # Translations
59 | *.mo
60 | *.pot
61 | 
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 | db.sqlite3-journal
67 | 
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 | 
72 | # Scrapy stuff:
73 | .scrapy
74 | 
75 | # Sphinx documentation
76 | docs/_build/
77 | 
78 | # PyBuilder
79 | target/
80 | 
81 | # Jupyter Notebook
82 | .ipynb_checkpoints
83 | 
84 | # IPython
85 | profile_default/
86 | ipython_config.py
87 | 
88 | # pyenv
89 | .python-version
90 | 
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 | 
98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
99 | __pypackages__/
100 | 
101 | # Celery stuff
102 | celerybeat-schedule
103 | celerybeat.pid
104 | 
105 | # SageMath parsed files
106 | *.sage.py
107 | 
108 | # Environments
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 
134 | ### VisualStudioCode
135 | .vscode/*
136 | !.vscode/settings.json
137 | !.vscode/tasks.json
138 | !.vscode/launch.json
139 | !.vscode/extensions.json
140 | *.code-workspace
141 | **/.vscode
142 | 
143 | # JetBrains
144 | .idea/
145 | 
146 | # Lightning-Hydra-Template
147 | python/core/configs/local/default.yaml
148 | python/core/data/
149 | python/core/logs/
150 | python/core/wandb/
151 | python/core/.env
152 | python/core/.autoenv
153 | 
154 | .DS_Store
155 | /bindings
156 | /python/interface/models
--------------------------------------------------------------------------------
/.ruff.toml:
--------------------------------------------------------------------------------
1 | # Enable flake8-bugbear (`B`) rules.
2 | select = ["E", "F", "B"]
3 | 
4 | # Never enforce `E501` (line length violations).
5 | ignore = ["E501"]
6 | 
7 | # Avoid trying to fix flake8-bugbear (`B`) violations.
8 | unfixable = ["B"]
9 | 
10 | # Ignore `E402` (import violations) in all `__init__.py` files, and in `path/to/file.py`.
11 | [per-file-ignores]
12 | "__init__.py" = ["E402"]
13 | "path/to/file.py" = ["E402"]
14 | 
--------------------------------------------------------------------------------
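The `per-file-ignores` table above disables `E402` where imports legitimately follow setup code, as is common in package `__init__.py` files. A minimal illustration of the pattern ruff would otherwise flag:

```python
# Typical __init__.py pattern: run setup code first ...
import os

os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# ... then import. Ruff reports E402 ("module level import not at top of
# file") here unless the per-file-ignores entry above suppresses it.
import json
```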
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [workspace]
2 | members = [
3 |     "rust/ltp",
4 |     "rust/ltp-cffi",
5 |     "python/extension",
6 | ]
7 | 
8 | [profile.release]
9 | lto = true
10 | codegen-units = 1
11 | panic = "abort"
12 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | 
2 | help: ## Show help
3 | 	@grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
4 | 
5 | sync: ## Merge changes from main branch to your current branch
6 | 	git fetch --all
7 | 	git merge main
8 | 
9 | bdist: ## build ltp and ltp_extension
10 | 	pip wheel --no-deps -w dist python/core
11 | 	pip wheel --no-deps -w dist python/interface
12 | 	maturin build --release -m python/extension/Cargo.toml --out dist
13 | 
14 | cbindgen_header:
15 | 	mkdir -p bindings/c
16 | 	cbindgen --config rust/ltp-cffi/cbindgen.toml --crate ltp-cffi --output bindings/c/ltp.h
17 | 
18 | cbindgen: cbindgen_header
19 | 	cargo build --release --package ltp-cffi
20 | 	cp target/release/libltp.* bindings/c
21 | 
22 | cbindgen_example: cbindgen
23 | 	gcc -L "$(pwd)bindings/c" -lltp -I "$(pwd)bindings/c" -o target/c_example rust/ltp-cffi/examples/example.c
24 | 	./target/c_example
25 | 
26 | train_legacy:
27 | 	cargo run --package ltp --release --example cws -- train --train data/examples/cws/train.txt --eval data/examples/cws/val.txt --model=data/cws_model.bin
28 | 	cargo run --package ltp --release --example cws -- eval --eval data/examples/cws/test.txt --model=data/cws_model.bin
29 | 	cargo run --package ltp --release --example cws -- predict --input data/examples/cws/raw.txt --output data/examples/cws/output.txt --model=data/cws_model.bin
30 | 
31 | 	cargo run --package ltp --release --example pos -- train --train data/examples/pos/train.txt --eval data/examples/pos/val.txt --model=data/pos_model.bin --vocab data/examples/pos/vocab.txt
32 | 	cargo run --package ltp --release --example pos -- eval --eval data/examples/pos/test.txt --model=data/pos_model.bin
33 | 	cargo run --package ltp --release --example pos -- predict --input data/examples/pos/raw.txt --output data/examples/pos/output.txt --model=data/pos_model.bin
34 | 
35 | 	cargo run --package ltp --release --example ner -- train --train data/examples/ner/train.txt --eval data/examples/ner/val.txt --model=data/ner_model.bin --vocab data/examples/ner/vocab.txt
36 | 	cargo run --package ltp --release --example ner -- eval --eval data/examples/ner/test.txt --model=data/ner_model.bin
37 | 	cargo run --package ltp --release --example ner -- predict --input data/examples/ner/raw.txt --output data/examples/ner/output.txt --model=data/ner_model.bin
38 | 
--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
1 | branches:
2 |   only:
3 |     - 3.X
4 | 
5 | environment:
6 |   P: "c:/projects/libs"
7 | 
8 | # clone directory
9 | clone_folder: c:\projects\ltp
10 | 
11 | os: Visual Studio 2015
12 | 
13 | platform:
14 |   - x86
15 |   - x64
16 | 
17 | configuration:
18 |   - Debug
19 |   - Release
20 | 
21 | install:
22 |   # by default, all script lines are interpreted as batch
23 | 
24 | build:
25 |   project: ALL_BUILD.vcxproj # path to Visual Studio solution or project
26 | 
27 | # scripts to run before build
28 | before_build:
29 |   - echo Running cmake...
30 |   - cd c:\projects\ltp
31 |   - cmake -G "Visual Studio 14 2015 Win64" -DCMAKE_INSTALL_PREFIX=%P%
32 | 
33 | after_build:
34 |   - cd c:\projects\ltp
35 |   - 7z a ltp-win-%PLATFORM%-%CONFIGURATION%.zip bin\examples\%CONFIGURATION%\*_cmdline.exe bin\%CONFIGURATION%\ltp_test.exe lib\%CONFIGURATION%\*.dll
36 | 
37 | artifacts:
38 |   - path: ltp-win-$(platform)-$(configuration).zip
39 |     name: ltp-win-$(platform)-$(configuration).zip
40 | 
--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/data/.gitkeep
--------------------------------------------------------------------------------
/data/examples/cws/raw.txt:
--------------------------------------------------------------------------------
1 | 在已恢复通车的铁轨上,百余名铁路工人正利用列车经过的间隙抓紧补修工作。1
2 | 
--------------------------------------------------------------------------------
/data/examples/cws/test.txt:
--------------------------------------------------------------------------------
1 | 在 已 恢复 通车 的 铁轨 上 , 百 余 名 铁路 工人 正 利用 列车 经过 的 间隙 抓紧 补修 工作 。
2 | 
--------------------------------------------------------------------------------
/data/examples/cws/train.txt:
--------------------------------------------------------------------------------
1 | 在 已 恢复 通车 的 铁轨 上 , 百 余 名 铁路 工人 正 利用 列车 经过 的 间隙 抓紧 补修 工作 。
2 | 
--------------------------------------------------------------------------------
/data/examples/cws/val.txt:
--------------------------------------------------------------------------------
1 | 在 已 恢复 通车 的 铁轨 上 , 百 余 名 铁路 工人 正 利用 列车 经过 的 间隙 抓紧 补修 工作 。
2 | 
--------------------------------------------------------------------------------
/data/examples/ner/raw.txt:
--------------------------------------------------------------------------------
1 | 台湾/ns 是/v 中国/ns 领土/n 不可分割/i 的/u 一/m 部分/n 。/wp
2 | 
--------------------------------------------------------------------------------
/data/examples/ner/test.txt:
--------------------------------------------------------------------------------
1 | 台湾/ns/S-Ns 是/v/O 中国/ns/S-Ns 领土/n/O 不可分割/i/O 的/u/O 一/m/O 部分/n/O 。/wp/O
2 | 
--------------------------------------------------------------------------------
/data/examples/ner/train.txt:
--------------------------------------------------------------------------------
1 | 台湾/ns/S-Ns 是/v/O 中国/ns/S-Ns 领土/n/O 不可分割/i/O 的/u/O 一/m/O 部分/n/O 。/wp/O
2 | 
--------------------------------------------------------------------------------
/data/examples/ner/val.txt:
--------------------------------------------------------------------------------
1 | 台湾/ns/S-Ns 是/v/O 中国/ns/S-Ns 领土/n/O 不可分割/i/O 的/u/O 一/m/O 部分/n/O 。/wp/O
2 | 
--------------------------------------------------------------------------------
/data/examples/ner/vocab.txt:
--------------------------------------------------------------------------------
1 | O
2 | B-Nh
3 | B-Ni
4 | B-Ns
5 | E-Nh
6 | E-Ni
7 | E-Ns
8 | I-Nh
9 | I-Ni
10 | I-Ns
11 | S-Nh
12 | S-Ni
13 | S-Ns
14 | 
--------------------------------------------------------------------------------
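The NER training files above encode one sentence per line as `token/pos/ne-tag` triples, with BIES/O-style entity tags drawn from `vocab.txt`. A minimal sketch of a parser for this layout, splitting from the right so that the two rightmost slashes always delimit the tags:

```python
# Parse one line of the `token/pos/ner` format used by data/examples/ner/*.txt.
line = "台湾/ns/S-Ns 是/v/O 中国/ns/S-Ns 领土/n/O 不可分割/i/O 的/u/O 一/m/O 部分/n/O 。/wp/O"

for item in line.split():
    token, pos, ner = item.rsplit("/", 2)  # rightmost two "/" separate the tags
    print(token, pos, ner)
```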
/data/examples/pos/raw.txt:
--------------------------------------------------------------------------------
1 | 中 葡 总理 对 两 国 关系 的 现状 给予 了 积极 的 评价 。
2 | 
--------------------------------------------------------------------------------
/data/examples/pos/test.txt:
--------------------------------------------------------------------------------
1 | 中/j 葡/j 总理/n 对/p 两/m 国/n 关系/n 的/u 现状/n 给予/v 了/u 积极/a 的/u 评价/v 。/wp
2 | 
--------------------------------------------------------------------------------
/data/examples/pos/train.txt:
--------------------------------------------------------------------------------
1 | 中/j 葡/j 总理/n 对/p 两/m 国/n 关系/n 的/u 现状/n 给予/v 了/u 积极/a 的/u 评价/v 。/wp
2 | 
--------------------------------------------------------------------------------
/data/examples/pos/val.txt:
--------------------------------------------------------------------------------
1 | 中/j 葡/j 总理/n 对/p 两/m 国/n 关系/n 的/u 现状/n 给予/v 了/u 积极/a 的/u 评价/v 。/wp
2 | 
--------------------------------------------------------------------------------
/data/examples/pos/vocab.txt:
--------------------------------------------------------------------------------
1 | a
2 | b
3 | c
4 | d
5 | e
6 | h
7 | i
8 | j
9 | k
10 | m
11 | n
12 | nd
13 | nh
14 | ni
15 | nl
16 | ns
17 | nt
18 | nz
19 | o
20 | p
21 | q
22 | r
23 | u
24 | v
25 | wp
26 | ws
27 | z
28 | 
--------------------------------------------------------------------------------
/python/core/.env.example:
--------------------------------------------------------------------------------
1 | # example of file for storing private and user specific environment variables, like keys or system paths
2 | # rename it to ".env" (excluded from version control by default)
3 | # .env is loaded by train.py automatically
4 | # hydra allows you to reference variables in .yaml configs with special syntax: ${oc.env:MY_VAR}
5 | 
6 | MY_VAR="/home/user/my/system/path"
7 | 
--------------------------------------------------------------------------------
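As the comments in `.env.example` note, values exported there can be referenced from the YAML configs via the `${oc.env:...}` resolver. A minimal sketch (assuming only `omegaconf` is installed) of how that resolution behaves:

```python
import os
from omegaconf import OmegaConf

os.environ["MY_VAR"] = "/home/user/my/system/path"

# ${oc.env:...} is resolved lazily, when the value is accessed.
cfg = OmegaConf.create({"system_path": "${oc.env:MY_VAR}"})
print(cfg.system_path)  # -> /home/user/my/system/path

# A default can follow a comma, as in the experiment configs' run names.
cfg2 = OmegaConf.create({"job_id": "${oc.env:SLURM_JOB_ID,localhost}"})
print(cfg2.job_id)  # -> "localhost" unless SLURM_JOB_ID is set
```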
/python/core/LICENSE:
--------------------------------------------------------------------------------
1 | 1. 语言技术平台面向国内外大学、中科院各研究所以及个人研究者免费开放源代码,但如上述机构和个人将该平台用于商业目的(如企业合作项目等)则需要付费。
2 | 2. 除上述机构以外的企事业单位,如申请使用该平台,需付费。
3 | 3. 凡涉及付费问题,请发邮件到 car@ir.hit.edu.cn 洽商。
4 | 4. 如果您在 LTP 基础上发表论文或取得科研成果,请您在发表论文和申报成果时声明"使用了哈工大社会计算与信息检索研究中心研制的语言技术平台(LTP)".
5 |    同时,发信给car@ir.hit.edu.cn,说明发表论文或申报成果的题目、出处等。
6 | 
--------------------------------------------------------------------------------
/python/core/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include requirements.txt
3 | 
4 | recursive-include ltp_core *
5 | 
6 | recursive-exclude * *.pyc
7 | recursive-exclude * .DS_Store
8 | recursive-exclude * __pycache__
9 | 
--------------------------------------------------------------------------------
/python/core/Makefile:
--------------------------------------------------------------------------------
1 | 
2 | help: ## Show help
3 | 	@grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
4 | 
5 | clean: ## Clean autogenerated files
6 | 	rm -rf dist
7 | 	find . -type f -name "*.DS_Store" -ls -delete
8 | 	find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf
9 | 	find . | grep -E ".pytest_cache" | xargs rm -rf
10 | 	find . | grep -E ".ipynb_checkpoints" | xargs rm -rf
11 | 	rm -f .coverage
12 | 
13 | clean-logs: ## Clean logs
14 | 	rm -rf logs/**
15 | 
16 | style: ## Run pre-commit hooks
17 | 	pre-commit run -a
18 | 
19 | sync: ## Merge changes from main branch to your current branch
20 | 	git fetch --all
21 | 	git merge main
22 | 
23 | test: ## Run not slow tests
24 | 	pytest -k "not slow"
25 | 
26 | test-full: ## Run all tests
27 | 	pytest
28 | 
29 | train: ## Train the model
30 | 	python ltp_core/train.py experiment=example
31 | 
32 | debug: ## Enter debugging mode with pdb
33 | 	#
34 | 	# tips:
35 | 	# - use "import pdb; pdb.set_trace()" to set breakpoint
36 | 	# - use "h" to print all commands
37 | 	# - use "n" to execute the next line
38 | 	# - use "c" to run until the breakpoint is hit
39 | 	# - use "l" to print src code around current line, "ll" for full function code
40 | 	# - docs: https://docs.python.org/3/library/pdb.html
41 | 	#
42 | 	python -m pdb ltp_core/train.py debug=default
43 | 
--------------------------------------------------------------------------------
/python/core/README.md:
--------------------------------------------------------------------------------
1 | | Language                             | version                                                                                                                                                                                                                                                                                                                 |
2 | | ------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
3 | | [Python](python/interface/README.md) | [![LTP](https://img.shields.io/pypi/v/ltp?label=LTP)](https://pypi.org/project/ltp) [![LTP-Core](https://img.shields.io/pypi/v/ltp-core?label=LTP-Core)](https://pypi.org/project/ltp-core) [![LTP-Extension](https://img.shields.io/pypi/v/ltp-extension?label=LTP-Extension)](https://pypi.org/project/ltp-extension) |
4 | | [Rust](rust/ltp/README.md)           | [![LTP](https://img.shields.io/crates/v/ltp?label=LTP)](https://crates.io/crates/ltp)                                                                                                                                                                                                                                   |
5 | 
6 | # LTP Core
7 | 
8 | 为 LTP 神经网络模型提供支持。
9 | 
--------------------------------------------------------------------------------
/python/core/bash/eval.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #SBATCH -N 1
3 | #SBATCH -t 7-00:00:00
4 | 
5 | export TOKENIZERS_PARALLELISM=false
6 | PYTHONPATH=. python ltp_core/eval.py "$@"
7 | 
--------------------------------------------------------------------------------
/python/core/bash/train.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #SBATCH -N 1
3 | #SBATCH -t 7-00:00:00
4 | 
5 | export TOKENIZERS_PARALLELISM=false
6 | PYTHONPATH=. python ltp_core/train.py "$@"
7 | 
--------------------------------------------------------------------------------
/python/core/configs/callbacks/default.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 |   - model_checkpoint.yaml
3 |   - early_stopping.yaml
4 |   - model_summary.yaml
5 |   - rich_progress_bar.yaml
6 |   - _self_
7 | 
8 | model_checkpoint:
9 |   dirpath: ${paths.output_dir}/checkpoints
10 |   filename: "epoch_{epoch:03d}"
11 |   monitor: "val/acc"
12 |   mode: "max"
13 |   save_last: True
14 |   auto_insert_metric_name: False
15 | 
16 | early_stopping:
17 |   monitor: "val/acc"
18 |   patience: 100
19 |   mode: "max"
20 | 
21 | model_summary:
22 |   max_depth: -1
23 | 
--------------------------------------------------------------------------------
/python/core/configs/callbacks/early_stopping.yaml:
--------------------------------------------------------------------------------
1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.EarlyStopping.html
2 | 
3 | # Monitor a metric and stop training when it stops improving.
4 | # Look at the above link for more detailed information.
5 | early_stopping:
6 |   _target_: pytorch_lightning.callbacks.EarlyStopping
7 |   monitor: ??? # quantity to be monitored, must be specified !!!
8 |   min_delta: 0. # minimum change in the monitored quantity to qualify as an improvement
9 |   patience: 3 # number of checks with no improvement after which training will be stopped
10 |   verbose: False # verbosity mode
11 |   mode: "min" # "max" means higher metric value is better, can be also "min"
12 |   strict: True # whether to crash the training if monitor is not found in the validation metrics
13 |   check_finite: True # when set True, stops training when the monitor becomes NaN or infinite
14 |   stopping_threshold: null # stop training immediately once the monitored quantity reaches this threshold
15 |   divergence_threshold: null # stop training as soon as the monitored quantity becomes worse than this threshold
16 |   check_on_train_epoch_end: null # whether to run early stopping at the end of the training epoch
17 |   # log_rank_zero_only: False # this keyword argument isn't available in stable version
18 | 
--------------------------------------------------------------------------------
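These callback configs are plain Hydra `_target_` blocks; a sketch (assuming `hydra-core` and `pytorch-lightning` are installed) of how such a block becomes a live callback object at runtime:

```python
from hydra.utils import instantiate
from omegaconf import OmegaConf

# Mirrors the early_stopping config above, with the mandatory ??? fields
# filled in the way the experiment configs override them.
cfg = OmegaConf.create(
    {
        "_target_": "pytorch_lightning.callbacks.EarlyStopping",
        "monitor": "val/acc",
        "patience": 3,
        "mode": "max",
    }
)
early_stopping = instantiate(cfg)  # imports the class and calls it with the kwargs
print(type(early_stopping).__name__)  # -> EarlyStopping
```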
/python/core/configs/callbacks/model_checkpoint.yaml:
--------------------------------------------------------------------------------
1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.ModelCheckpoint.html
2 | 
3 | # Save the model periodically by monitoring a quantity.
4 | # Look at the above link for more detailed information.
5 | model_checkpoint:
6 |   _target_: pytorch_lightning.callbacks.ModelCheckpoint
7 |   dirpath: null # directory to save the model file
8 |   filename: null # checkpoint filename
9 |   monitor: null # name of the logged metric which determines when model is improving
10 |   verbose: False # verbosity mode
11 |   save_last: null # additionally always save an exact copy of the last checkpoint to a file last.ckpt
12 |   save_top_k: 1 # save k best models (determined by above metric)
13 |   mode: "min" # "max" means higher metric value is better, can be also "min"
14 |   auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name
15 |   save_weights_only: False # if True, then only the model’s weights will be saved
16 |   every_n_train_steps: null # number of training steps between checkpoints
17 |   train_time_interval: null # checkpoints are monitored at the specified time interval
18 |   every_n_epochs: null # number of epochs between checkpoints
19 |   save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation
20 | 
--------------------------------------------------------------------------------
/python/core/configs/callbacks/model_summary.yaml:
--------------------------------------------------------------------------------
1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.RichModelSummary.html
2 | 
3 | # Generates a summary of all layers in a LightningModule with rich text formatting.
4 | # Look at the above link for more detailed information.
5 | model_summary:
6 |   _target_: pytorch_lightning.callbacks.RichModelSummary
7 |   max_depth: 1 # the maximum depth of layer nesting that the summary will include
8 | 
--------------------------------------------------------------------------------
/python/core/configs/callbacks/none.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/configs/callbacks/none.yaml
--------------------------------------------------------------------------------
/python/core/configs/callbacks/rich_progress_bar.yaml:
--------------------------------------------------------------------------------
1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.RichProgressBar.html
2 | 
3 | # Create a progress bar with rich text formatting.
4 | # Look at the above link for more detailed information.
5 | rich_progress_bar:
6 |   _target_: pytorch_lightning.callbacks.RichProgressBar
7 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/cls_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.TaskDataModule
2 | 
3 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
4 | 
5 | datamodules:
6 |   batch_size: 16
7 |   num_workers: 4
8 |   pin_memory: True
9 |   load:
10 |     _target_: ltp_core.datamodules.adapters.sentence_classification.build_dataset
11 |     _partial_: true
12 |     task_name: "cola"
13 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/cws_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.TaskDataModule
2 | 
3 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
4 | 
5 | datamodules:
6 |   batch_size: 16
7 |   num_workers: 4
8 |   pin_memory: True
9 |   load:
10 |     _target_: ltp_core.datamodules.adapters.segmention.build_dataset
11 |     _partial_: true
12 |     task_name: "cws"
13 |     data_dir: "data/conllu"
14 |     mode: "bmes"
15 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/dep_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.TaskDataModule
2 | 
3 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
4 | 
5 | datamodules:
6 |   batch_size: 16
7 |   num_workers: 4
8 |   pin_memory: True
9 |   load:
10 |     _target_: ltp_core.datamodules.adapters.dependency_parsing.build_dataset
11 |     _partial_: true
12 |     task_name: "dep"
13 |     data_dir: "data/conllu"
14 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/multi_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.MultiTaskDataModule
2 | 
3 | tau: 0.8
4 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
5 | 
6 | datamodules:
7 |   cws:
8 |     batch_size: 16
9 |     num_workers: 4
10 |     pin_memory: True
11 |     load:
12 |       _target_: ltp_core.datamodules.adapters.segmention.build_dataset
13 |       _partial_: true
14 |       task_name: "cws"
15 |       data_dir: "data/conllu"
16 |       mode: "bmes"
17 | 
18 |   pos:
19 |     batch_size: 16
20 |     num_workers: 4
21 |     pin_memory: True
22 |     load:
23 |       _target_: ltp_core.datamodules.adapters.postagger.build_dataset
24 |       _partial_: true
25 |       task_name: "pos"
26 |       data_dir: "data/conllu"
27 | 
28 |   ner:
29 |     batch_size: 16
30 |     num_workers: 4
31 |     pin_memory: True
32 |     load:
33 |       _target_: ltp_core.datamodules.adapters.named_entity_recognition.build_dataset
34 |       _partial_: true
35 |       task_name: "ner"
36 |       data_dir: "data/ner"
37 | 
38 |   srl:
39 |     batch_size: 16
40 |     num_workers: 4
41 |     pin_memory: True
42 |     load:
43 |       _target_: ltp_core.datamodules.adapters.semantic_role_labeling.build_dataset
44 |       _partial_: true
45 |       task_name: "srl"
46 |       data_dir: "data/srl"
47 | 
48 |   dep:
49 |     batch_size: 16
50 |     num_workers: 4
51 |     pin_memory: True
52 |     load:
53 |       _target_: ltp_core.datamodules.adapters.dependency_parsing.build_dataset
54 |       _partial_: true
55 |       task_name: "dep"
56 |       data_dir: "data/conllu"
57 | 
58 |   sdp:
59 |     batch_size: 16
60 |     num_workers: 4
61 |     pin_memory: True
62 |     load:
63 |       _target_: ltp_core.datamodules.adapters.semantic_dependency_parsing.build_dataset
64 |       _partial_: true
65 |       task_name: "sdp"
66 |       data_dir: "data/conllu"
67 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/ner_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.TaskDataModule
2 | 
3 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
4 | 
5 | datamodules:
6 |   batch_size: 16
7 |   num_workers: 4
8 |   pin_memory: True
9 |   load:
10 |     _target_: ltp_core.datamodules.adapters.named_entity_recognition.build_dataset
11 |     _partial_: true
12 |     task_name: "ner"
13 |     data_dir: "data/ner"
14 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/pos_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.TaskDataModule
2 | 
3 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
4 | 
5 | datamodules:
6 |   batch_size: 16
7 |   num_workers: 4
8 |   pin_memory: True
9 |   load:
10 |     _target_: ltp_core.datamodules.adapters.postagger.build_dataset
11 |     _partial_: true
12 |     task_name: "pos"
13 |     data_dir: "data/conllu"
14 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/sdp_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.TaskDataModule
2 | 
3 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
4 | 
5 | datamodules:
6 |   batch_size: 16
7 |   num_workers: 4
8 |   pin_memory: True
9 |   load:
10 |     _target_: ltp_core.datamodules.adapters.semantic_dependency_parsing.build_dataset
11 |     _partial_: true
12 |     task_name: "sdp"
13 |     data_dir: "data/conllu"
14 | 
--------------------------------------------------------------------------------
/python/core/configs/datamodule/srl_datamodules.yaml:
--------------------------------------------------------------------------------
1 | _target_: ltp_core.datamodules.TaskDataModule
2 | 
3 | tokenizer: ${model.model.backbone.pretrained_model_name_or_path}
4 | 
5 | datamodules:
6 |   batch_size: 16
7 |   num_workers: 4
8 |   pin_memory: True
9 |   load:
10 |     _target_: ltp_core.datamodules.adapters.semantic_role_labeling.build_dataset
11 |     _partial_: true
12 |     task_name: "srl"
13 |     data_dir: "data/srl"
14 | 
--------------------------------------------------------------------------------
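`_partial_: true` in the `load:` blocks above makes Hydra return a `functools.partial` instead of calling the adapter immediately, so the datamodule can supply the remaining arguments later. A sketch, assuming `hydra-core` is installed and `ltp_core` is importable; the later-call argument is hypothetical:

```python
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "_target_": "ltp_core.datamodules.adapters.postagger.build_dataset",
        "_partial_": True,
        "task_name": "pos",
        "data_dir": "data/conllu",
    }
)

# With _partial_, instantiate() does NOT call build_dataset; it returns
# functools.partial(build_dataset, task_name="pos", data_dir="data/conllu").
load = instantiate(cfg)
# dataset = load(...)  # the datamodule later fills in the missing arguments
```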
/python/core/configs/debug/default.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # default debugging setup, runs 1 full epoch
4 | # other debugging configs can inherit from this one
5 | 
6 | # overwrite task name so debugging logs are stored in separate folder
7 | task_name: "debug"
8 | 
9 | # disable callbacks and loggers during debugging
10 | callbacks: null
11 | logger: null
12 | 
13 | extras:
14 |   ignore_warnings: False
15 |   enforce_tags: False
16 | 
17 | # sets level of all command line loggers to 'DEBUG'
18 | # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/
19 | hydra:
20 |   job_logging:
21 |     root:
22 |       level: DEBUG
23 | 
24 | # use this to also set hydra loggers to 'DEBUG'
25 | # verbose: True
26 | 
27 | trainer:
28 |   max_epochs: 1
29 |   accelerator: cpu # debuggers don't like gpus
30 |   devices: 1 # debuggers don't like multiprocessing
31 |   detect_anomaly: true # raise exception if NaN or +/-inf is detected in any tensor
32 | 
33 | datamodule:
34 |   num_workers: 0 # debuggers don't like multiprocessing
35 |   pin_memory: False # disable gpu memory pin
36 | 
--------------------------------------------------------------------------------
/python/core/configs/debug/fdr.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # runs 1 train, 1 validation and 1 test step
4 | 
5 | defaults:
6 |   - default.yaml
7 | 
8 | trainer:
9 |   fast_dev_run: true
10 | 
--------------------------------------------------------------------------------
/python/core/configs/debug/limit.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # uses only 1% of the training data and 5% of validation/test data
4 | 
5 | defaults:
6 |   - default.yaml
7 | 
8 | trainer:
9 |   max_epochs: 3
10 |   limit_train_batches: 0.01
11 |   limit_val_batches: 0.05
12 |   limit_test_batches: 0.05
13 | 
--------------------------------------------------------------------------------
/python/core/configs/debug/overfit.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # overfits to 3 batches
4 | 
5 | defaults:
6 |   - default.yaml
7 | 
8 | trainer:
9 |   max_epochs: 20
10 |   overfit_batches: 3
11 | 
12 | # model ckpt and early stopping need to be disabled during overfitting
13 | callbacks: null
14 | 
--------------------------------------------------------------------------------
/python/core/configs/debug/profiler.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # runs with execution time profiling
4 | 
5 | defaults:
6 |   - default.yaml
7 | 
8 | trainer:
9 |   max_epochs: 1
10 |   profiler: "simple"
11 |   # profiler: "advanced"
12 |   # profiler: "pytorch"
13 | 
--------------------------------------------------------------------------------
/python/core/configs/eval.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | defaults:
4 |   - _self_
5 |   - datamodule: multi_datamodules.yaml
6 |   - model: multi_model.yaml
7 |   - logger: null
8 |   - trainer: default.yaml
9 |   - paths: default.yaml
10 |   - extras: default.yaml
11 |   - hydra: default.yaml
12 | 
13 | task_name: "eval"
14 | 
15 | tags: ["dev"]
16 | 
17 | # passing checkpoint path is necessary for evaluation
18 | ckpt_path: ???
19 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/cls.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: cls_datamodules.yaml
8 |   - override /model: cls_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["cls"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "sent-cls-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/cws.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: cws_datamodules.yaml
8 |   - override /model: cws_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["cws"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/dep.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: dep_datamodules.yaml
8 |   - override /model: dep_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["dep"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/example.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: multi_datamodules.yaml
8 |   - override /model: multi_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["ltp"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 3
22 |   gradient_clip_val: 1.0
23 | 
24 | model:
25 |   model:
26 |     backbone:
27 |       pretrained_model_name_or_path: hfl/chinese-electra-small-generator
28 |     heads:
29 |       cws:
30 |         input_size: 64
31 |         num_labels: 4
32 |       pos:
33 |         input_size: 64
34 |       ner:
35 |         input_size: 64
36 |       srl:
37 |         input_size: 64
38 |         hidden_size: 32
39 |       dep:
40 |         input_size: 64
41 |       sdp:
42 |         input_size: 64
43 | 
44 | logger:
45 |   wandb:
46 |     tags: "${tags}"
47 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
48 | 
49 | callbacks:
50 |   model_checkpoint:
51 |     monitor: "val/mean_metric"
52 |     mode: "max"
53 | 
54 |   early_stopping:
55 |     monitor: "val/mean_metric"
56 |     patience: 3
57 |     mode: "max"
58 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/multi.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: multi_datamodules.yaml
8 |   - override /model: multi_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["ltp"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/multi_bi.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: multi_datamodules.yaml
8 |   - override /model: multi_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["ltp"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
39 | datamodule:
40 |   datamodules:
41 |     cws:
42 |       load:
43 |         mode: "bi"
44 | 
45 | model:
46 |   metrics:
47 |     cws:
48 |       _ltp_target_: ltp_core.models.metrics.token.SeqEvalF1
49 |       tags_or_path: ["B", "I"]
50 |   model:
51 |     heads:
52 |       cws:
53 |         num_labels: 2
54 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/ner.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: ner_datamodules.yaml
8 |   - override /model: ner_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["ner"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/pos.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: pos_datamodules.yaml
8 |   - override /model: pos_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["pos"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/sdp.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: sdp_datamodules.yaml
8 |   - override /model: sdp_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["sdp"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/experiment/srl.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # to execute this experiment run:
4 | # python train.py experiment=example
5 | 
6 | defaults:
7 |   - override /datamodule: srl_datamodules.yaml
8 |   - override /model: srl_model.yaml
9 |   - override /callbacks: default.yaml
10 |   - override /trainer: gpu.yaml
11 | 
12 | # all parameters below will be merged with parameters from default configurations set above
13 | # this allows you to overwrite only specified parameters
14 | 
15 | tags: ["srl"]
16 | 
17 | seed: 12345
18 | 
19 | trainer:
20 |   min_epochs: 1
21 |   max_epochs: 10
22 |   gradient_clip_val: 1.0
23 | 
24 | logger:
25 |   wandb:
26 |     tags: "${tags}"
27 |     name: "ltp-${oc.env:SLURM_JOB_ID,localhost}-${now:%Y-%m-%d_%H:%M:%S.%f}"
28 | 
29 | callbacks:
30 |   model_checkpoint:
31 |     monitor: "val/mean_metric"
32 |     mode: "max"
33 | 
34 |   early_stopping:
35 |     monitor: "val/mean_metric"
36 |     patience: 3
37 |     mode: "max"
38 | 
--------------------------------------------------------------------------------
/python/core/configs/extras/default.yaml:
--------------------------------------------------------------------------------
1 | # disable python warnings if they annoy you
2 | ignore_warnings: False
3 | 
4 | # ask user for tags if none are provided in the config
5 | enforce_tags: True
6 | 
7 | # pretty print config tree at the start of the run using Rich library
8 | print_config: True
9 | 
--------------------------------------------------------------------------------
/python/core/configs/hparams_search/ltp_optuna.yaml:
--------------------------------------------------------------------------------
1 | # @package _global_
2 | 
3 | # example hyperparameter optimization of some experiment with Optuna:
4 | # python train.py -m hparams_search=mnist_optuna experiment=example
5 | 
6 | defaults:
7 |   - override /hydra/sweeper: optuna
8 | 
9 | # choose metric which will be optimized by Optuna
10 | # make sure this is the correct name of some metric logged in lightning module!
11 | optimized_metric: "val/mean_metric"
12 | 
13 | # here we define Optuna hyperparameter search
14 | # it optimizes for value returned from function with @hydra.main decorator
15 | # docs: https://hydra.cc/docs/next/plugins/optuna_sweeper
16 | hydra:
17 |   mode: "MULTIRUN" # set hydra to multirun by default if this config is attached
18 | 
19 |   sweeper:
20 |     _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper
21 | 
22 |     # storage URL to persist optimization results
23 |     # for example, you can use SQLite if you set 'sqlite:///example.db'
24 |     storage: null
25 | 
26 |     # name of the study to persist optimization results
27 |     study_name: null
28 | 
29 |     # number of parallel workers
30 |     n_jobs: 1
31 | 
32 |     # 'minimize' or 'maximize' the objective
33 |     direction: maximize
34 | 
35 |     # total number of runs that will be executed
36 |     n_trials: 20
37 | 
38 |     # choose Optuna hyperparameter sampler
39 |     # you can choose bayesian sampler (tpe), random search (without optimization), grid sampler, and others
40 |     # docs: https://optuna.readthedocs.io/en/stable/reference/samplers.html
41 |     sampler:
42 |       _target_: optuna.samplers.TPESampler
43 |       seed: 1234
44 |       n_startup_trials: 10 # number of random sampling runs before optimization starts
45 | 
46 |     # define hyperparameter search space
47 |     params:
48 |       model.optimizer.lr: interval(0.0001, 0.1)
49 | 
--------------------------------------------------------------------------------
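The sweeper above delegates trial management to Optuna; a standalone sketch (assuming `optuna` is installed, with a stand-in objective in place of a real training run) of the equivalent study:

```python
import optuna

def objective(trial):
    # mirrors `model.optimizer.lr: interval(0.0001, 0.1)` in the params above
    lr = trial.suggest_float("model.optimizer.lr", 1e-4, 1e-1)
    return -abs(lr - 1e-2)  # stand-in for the logged val/mean_metric

study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=1234, n_startup_trials=10),
)
study.optimize(objective, n_trials=20)
print(study.best_params)
```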
11 | optimized_metric: "val/mean_metric" 12 | 13 | # here we define Optuna hyperparameter search 14 | # it optimizes for value returned from function with @hydra.main decorator 15 | # docs: https://hydra.cc/docs/next/plugins/optuna_sweeper 16 | hydra: 17 | mode: "MULTIRUN" # set hydra to multirun by default if this config is attached 18 | 19 | sweeper: 20 | _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper 21 | 22 | # storage URL to persist optimization results 23 | # for example, you can use SQLite if you set 'sqlite:///example.db' 24 | storage: null 25 | 26 | # name of the study to persist optimization results 27 | study_name: null 28 | 29 | # number of parallel workers 30 | n_jobs: 1 31 | 32 | # 'minimize' or 'maximize' the objective 33 | direction: maximize 34 | 35 | # total number of runs that will be executed 36 | n_trials: 20 37 | 38 | # choose Optuna hyperparameter sampler 39 | # you can choose bayesian sampler (tpe), random search (without optimization), grid sampler, and others 40 | # docs: https://optuna.readthedocs.io/en/stable/reference/samplers.html 41 | sampler: 42 | _target_: optuna.samplers.TPESampler 43 | seed: 1234 44 | n_startup_trials: 10 # number of random sampling runs before optimization starts 45 | 46 | # define hyperparameter search space 47 | params: 48 | model.optimizer.lr: interval(0.0001, 0.1) 49 | -------------------------------------------------------------------------------- /python/core/configs/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # https://hydra.cc/docs/configure_hydra/intro/ 2 | 3 | # enable color logging 4 | defaults: 5 | - override hydra_logging: colorlog 6 | - override job_logging: colorlog 7 | 8 | # output directory, generated dynamically on each run 9 | run: 10 | dir: ${paths.log_dir}/${task_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S} 11 | sweep: 12 | dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} 13 | subdir: ${hydra.job.num} 14 | -------------------------------------------------------------------------------- /python/core/configs/local/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/configs/local/.gitkeep -------------------------------------------------------------------------------- /python/core/configs/logger/comet.yaml: -------------------------------------------------------------------------------- 1 | # https://www.comet.ml 2 | 3 | comet: 4 | _target_: pytorch_lightning.loggers.comet.CometLogger 5 | api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable 6 | save_dir: "${paths.output_dir}" 7 | project_name: "ltp" 8 | rest_api_key: null 9 | # experiment_name: "" 10 | experiment_key: null # set to resume experiment 11 | offline: False 12 | prefix: "" 13 | -------------------------------------------------------------------------------- /python/core/configs/logger/csv.yaml: -------------------------------------------------------------------------------- 1 | # csv logger built in lightning 2 | 3 | csv: 4 | _target_: pytorch_lightning.loggers.csv_logs.CSVLogger 5 | save_dir: "${paths.output_dir}" 6 | name: "csv/" 7 | prefix: "" 8 | -------------------------------------------------------------------------------- /python/core/configs/logger/many_loggers.yaml: -------------------------------------------------------------------------------- 1 | # train with many 
loggers at once 2 | 3 | defaults: 4 | - wandb.yaml 5 | - tensorboard.yaml 6 | - csv.yaml 7 | # - comet.yaml 8 | # - mlflow.yaml 9 | # - neptune.yaml 10 | -------------------------------------------------------------------------------- /python/core/configs/logger/mlflow.yaml: -------------------------------------------------------------------------------- 1 | # https://mlflow.org 2 | 3 | mlflow: 4 | _target_: pytorch_lightning.loggers.mlflow.MLFlowLogger 5 | # experiment_name: "" 6 | # run_name: "" 7 | tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI 8 | tags: null 9 | # save_dir: "./mlruns" 10 | prefix: "" 11 | artifact_location: null 12 | # run_id: "" 13 | -------------------------------------------------------------------------------- /python/core/configs/logger/neptune.yaml: -------------------------------------------------------------------------------- 1 | # https://neptune.ai 2 | 3 | neptune: 4 | _target_: pytorch_lightning.loggers.neptune.NeptuneLogger 5 | api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable 6 | project: "ltp" 7 | # name: "" 8 | log_model_checkpoints: True 9 | prefix: "" 10 | -------------------------------------------------------------------------------- /python/core/configs/logger/tensorboard.yaml: -------------------------------------------------------------------------------- 1 | # https://www.tensorflow.org/tensorboard/ 2 | 3 | tensorboard: 4 | _target_: pytorch_lightning.loggers.tensorboard.TensorBoardLogger 5 | save_dir: "${paths.output_dir}/tensorboard/" 6 | name: null 7 | log_graph: False 8 | default_hp_metric: True 9 | prefix: "" 10 | # version: "" 11 | -------------------------------------------------------------------------------- /python/core/configs/logger/wandb.yaml: -------------------------------------------------------------------------------- 1 | # https://wandb.ai 2 | 3 | wandb: 4 | _target_: pytorch_lightning.loggers.wandb.WandbLogger 5 | # name: "" # name of the run (normally generated by wandb) 6 | save_dir: "${paths.output_dir}" 7 | offline: False 8 | id: null # pass correct id to resume experiment! 
9 | anonymous: null # enable anonymous logging 10 | project: "ltp" 11 | log_model: False # upload lightning ckpts 12 | prefix: "" # a string to put at the beginning of metric keys 13 | # entity: "" # set to name of your wandb team 14 | group: "" 15 | tags: [] 16 | job_type: "" 17 | -------------------------------------------------------------------------------- /python/core/configs/model/cls_model.yaml: -------------------------------------------------------------------------------- 1 | _target_: ltp_core.models.lit_model.LTPLitModule 2 | 3 | optimizer: 4 | _ltp_target_: torch.optim.AdamW 5 | _ltp_partial_: true 6 | lr: 2e-5 7 | weight_decay: 0.0 8 | 9 | layer_lrs: 10 | _ltp_target_: ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf 11 | _ltp_partial_: true 12 | transformer_prefix: backbone 13 | learning_rate: ${model.optimizer.lr} 14 | layer_decay: 0.8 # 0.8 for Base/Small, 0.9 for Large 15 | n_layers: 12 16 | crf_prefix: "crf" 17 | crf_ratio: 10.0 18 | 19 | scheduler: 20 | _ltp_target_: ltp_core.models.optimization.scheduler.compose_with_scheduler 21 | _ltp_partial_: true 22 | scheduler_type: "exponential" 23 | scheduler_args: null 24 | warmup_ratio: 0.1 25 | interval: "epoch" 26 | frequency: 3 27 | 28 | criterions: 29 | cls: 30 | _ltp_target_: ltp_core.models.criterion.sent.ClassificationLoss 31 | 32 | metrics: 33 | cls: 34 | _ltp_target_: ltp_core.models.metrics.sent.ClsAccuracy 35 | 36 | model: 37 | _ltp_target_: ltp_core.models.ltp_model.LTPModule 38 | backbone: 39 | _ltp_target_: transformers.AutoModel.from_pretrained 40 | pretrained_model_name_or_path: hfl/chinese-electra-180g-base-discriminator 41 | 42 | processor: 43 | cls: 44 | _ltp_target_: ltp_core.models.processor.ClsOnly 45 | 46 | heads: 47 | cls: 48 | _ltp_target_: ltp_core.models.components.sent.MLPClassifier 49 | input_size: 768 50 | num_labels: 2 51 | dropout: 0.1 52 | -------------------------------------------------------------------------------- /python/core/configs/model/cws_model.yaml: -------------------------------------------------------------------------------- 1 | _target_: ltp_core.models.lit_model.LTPLitModule 2 | 3 | optimizer: 4 | _ltp_target_: torch.optim.AdamW 5 | _ltp_partial_: true 6 | lr: 2e-5 7 | weight_decay: 0.0 8 | 9 | layer_lrs: 10 | _ltp_target_: ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf 11 | _ltp_partial_: true 12 | transformer_prefix: backbone 13 | learning_rate: ${model.optimizer.lr} 14 | layer_decay: 0.8 # 0.8 for Base/Small, 0.9 for Large 15 | n_layers: 12 16 | crf_prefix: "crf" 17 | crf_ratio: 10.0 18 | 19 | scheduler: 20 | _ltp_target_: ltp_core.models.optimization.scheduler.compose_with_scheduler 21 | _ltp_partial_: true 22 | scheduler_type: "linear" 23 | scheduler_args: null 24 | warmup_ratio: 0.02 25 | interval: "step" 26 | frequency: 1 27 | 28 | criterions: 29 | cws: 30 | _ltp_target_: ltp_core.models.criterion.token.TokenLoss 31 | 32 | metrics: 33 | cws: 34 | _ltp_target_: ltp_core.models.metrics.token.SeqEvalF1 35 | tags_or_path: ["B", "M", "E", "S"] 36 | 37 | model: 38 | _ltp_target_: ltp_core.models.ltp_model.LTPModule 39 | backbone: 40 | _ltp_target_: transformers.AutoModel.from_pretrained 41 | pretrained_model_name_or_path: hfl/chinese-electra-180g-base-discriminator 42 | 43 | processor: 44 | cws: 45 | _ltp_target_: ltp_core.models.processor.TokenOnly 46 | 47 | heads: 48 | cws: 49 | _ltp_target_: ltp_core.models.components.token.MLPTokenClassifier 50 | input_size: 768 51 | num_labels: 4 52 | dropout: 0.1 53 | 
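The layer_lrs block shared by the model configs above and below implements layer-wise learning-rate decay: the top transformer layer trains at the full learning rate, each layer below it is scaled by layer_decay, and parameters matching crf_prefix are boosted by crf_ratio. A rough sketch of the arithmetic follows; the function name and loop are illustrative only, while the actual parameter-group construction lives in ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf:

# Sketch of the per-layer schedule implied by the configs above
# (lr=2e-5, layer_decay=0.8, n_layers=12, crf_ratio=10.0); illustrative only.
def layer_lr(base_lr, layer, n_layers=12, layer_decay=0.8):
    """LR for transformer layer `layer` (0 = embeddings, n_layers = top)."""
    return base_lr * layer_decay ** (n_layers - layer)

base_lr = 2e-5
for layer in (12, 6, 0):
    print(f"layer {layer:2d}: lr = {layer_lr(base_lr, layer):.3g}")
# top layer: 2e-5, layer 6: ~5.2e-6, embeddings: ~1.4e-6
print(f"crf:       lr = {base_lr * 10.0:.3g}")  # crf_ratio multiplies the base lr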
-------------------------------------------------------------------------------- /python/core/configs/model/dep_model.yaml: -------------------------------------------------------------------------------- 1 | _target_: ltp_core.models.lit_model.LTPLitModule 2 | 3 | optimizer: 4 | _ltp_target_: torch.optim.AdamW 5 | _ltp_partial_: true 6 | lr: 2e-5 7 | weight_decay: 0.0 8 | 9 | layer_lrs: 10 | _ltp_target_: ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf 11 | _ltp_partial_: true 12 | transformer_prefix: backbone 13 | learning_rate: ${model.optimizer.lr} 14 | layer_decay: 0.8 # 0.8 for Base/Small, 0.9 for Large 15 | n_layers: 12 16 | crf_prefix: "crf" 17 | crf_ratio: 10.0 18 | 19 | scheduler: 20 | _ltp_target_: ltp_core.models.optimization.scheduler.compose_with_scheduler 21 | _ltp_partial_: true 22 | scheduler_type: "linear" 23 | scheduler_args: null 24 | warmup_ratio: 0.02 25 | interval: "step" 26 | frequency: 1 27 | 28 | criterions: 29 | dep: 30 | _ltp_target_: ltp_core.models.criterion.graph.DEPLoss 31 | 32 | metrics: 33 | dep: 34 | _ltp_target_: ltp_core.models.metrics.graph.DEPLas 35 | 36 | model: 37 | _ltp_target_: ltp_core.models.ltp_model.LTPModule 38 | backbone: 39 | _ltp_target_: transformers.AutoModel.from_pretrained 40 | pretrained_model_name_or_path: hfl/chinese-electra-180g-base-discriminator 41 | 42 | processor: 43 | dep: 44 | _ltp_target_: ltp_core.models.processor.WordsWithHead 45 | 46 | heads: 47 | dep: 48 | _ltp_target_: ltp_core.models.components.graph.BiaffineClassifier 49 | input_size: 768 50 | num_labels: 14 51 | dropout: 0.1 52 | -------------------------------------------------------------------------------- /python/core/configs/model/ner_model.yaml: -------------------------------------------------------------------------------- 1 | _target_: ltp_core.models.lit_model.LTPLitModule 2 | 3 | optimizer: 4 | _ltp_target_: torch.optim.AdamW 5 | _ltp_partial_: true 6 | lr: 2e-5 7 | weight_decay: 0.0 8 | 9 | layer_lrs: 10 | _ltp_target_: ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf 11 | _ltp_partial_: true 12 | transformer_prefix: backbone 13 | learning_rate: ${model.optimizer.lr} 14 | layer_decay: 0.8 # 0.8 for Base/Small, 0.9 for Large 15 | n_layers: 12 16 | crf_prefix: "crf" 17 | crf_ratio: 10.0 18 | 19 | scheduler: 20 | _ltp_target_: ltp_core.models.optimization.scheduler.compose_with_scheduler 21 | _ltp_partial_: true 22 | scheduler_type: "linear" 23 | scheduler_args: null 24 | warmup_ratio: 0.02 25 | interval: "step" 26 | frequency: 1 27 | 28 | criterions: 29 | ner: 30 | _ltp_target_: ltp_core.models.criterion.token.TokenLoss 31 | 32 | metrics: 33 | ner: 34 | _ltp_target_: ltp_core.models.metrics.token.SeqEvalF1 35 | tags_or_path: ${datamodule.datamodules.load.data_dir}/vocabs/bio.txt 36 | 37 | model: 38 | _ltp_target_: ltp_core.models.ltp_model.LTPModule 39 | backbone: 40 | _ltp_target_: transformers.AutoModel.from_pretrained 41 | pretrained_model_name_or_path: hfl/chinese-electra-180g-base-discriminator 42 | 43 | processor: 44 | ner: 45 | _ltp_target_: ltp_core.models.processor.WordsOnly 46 | 47 | heads: 48 | ner: 49 | _ltp_target_: ltp_core.models.components.token.MLPTokenClassifier 50 | input_size: 768 51 | num_labels: 13 52 | dropout: 0.1 53 | -------------------------------------------------------------------------------- /python/core/configs/model/pos_model.yaml: -------------------------------------------------------------------------------- 1 | _target_: ltp_core.models.lit_model.LTPLitModule 2 | 3 | optimizer: 4 | 
_ltp_target_: torch.optim.AdamW 5 | _ltp_partial_: true 6 | lr: 2e-5 7 | weight_decay: 0.0 8 | 9 | layer_lrs: 10 | _ltp_target_: ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf 11 | _ltp_partial_: true 12 | transformer_prefix: backbone 13 | learning_rate: ${model.optimizer.lr} 14 | layer_decay: 0.8 # 0.8 for Base/Small, 0.9 for Large 15 | n_layers: 12 16 | crf_prefix: "crf" 17 | crf_ratio: 10.0 18 | 19 | scheduler: 20 | _ltp_target_: ltp_core.models.optimization.scheduler.compose_with_scheduler 21 | _ltp_partial_: true 22 | scheduler_type: "linear" 23 | scheduler_args: null 24 | warmup_ratio: 0.02 25 | interval: "step" 26 | frequency: 1 27 | 28 | criterions: 29 | pos: 30 | _ltp_target_: ltp_core.models.criterion.token.TokenLoss 31 | 32 | metrics: 33 | pos: 34 | _ltp_target_: ltp_core.models.metrics.token.TokenAccuracy 35 | 36 | model: 37 | _ltp_target_: ltp_core.models.ltp_model.LTPModule 38 | backbone: 39 | _ltp_target_: transformers.AutoModel.from_pretrained 40 | pretrained_model_name_or_path: hfl/chinese-electra-180g-base-discriminator 41 | 42 | processor: 43 | pos: 44 | _ltp_target_: ltp_core.models.processor.WordsOnly 45 | 46 | heads: 47 | pos: 48 | _ltp_target_: ltp_core.models.components.token.MLPTokenClassifier 49 | input_size: 768 50 | num_labels: 27 51 | dropout: 0.1 52 | -------------------------------------------------------------------------------- /python/core/configs/model/sdp_model.yaml: -------------------------------------------------------------------------------- 1 | _target_: ltp_core.models.lit_model.LTPLitModule 2 | 3 | optimizer: 4 | _ltp_target_: torch.optim.AdamW 5 | _ltp_partial_: true 6 | lr: 2e-5 7 | weight_decay: 0.0 8 | 9 | layer_lrs: 10 | _ltp_target_: ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf 11 | _ltp_partial_: true 12 | transformer_prefix: backbone 13 | learning_rate: ${model.optimizer.lr} 14 | layer_decay: 0.8 # 0.8 for Base/Small, 0.9 for Large 15 | n_layers: 12 16 | crf_prefix: "crf" 17 | crf_ratio: 10.0 18 | 19 | scheduler: 20 | _ltp_target_: ltp_core.models.optimization.scheduler.compose_with_scheduler 21 | _ltp_partial_: true 22 | scheduler_type: "linear" 23 | scheduler_args: null 24 | warmup_ratio: 0.02 25 | interval: "step" 26 | frequency: 1 27 | 28 | criterions: 29 | sdp: 30 | _ltp_target_: ltp_core.models.criterion.graph.SDPLoss 31 | 32 | metrics: 33 | sdp: 34 | _ltp_target_: ltp_core.models.metrics.graph.SDPLas 35 | 36 | model: 37 | _ltp_target_: ltp_core.models.ltp_model.LTPModule 38 | backbone: 39 | _ltp_target_: transformers.AutoModel.from_pretrained 40 | pretrained_model_name_or_path: hfl/chinese-electra-180g-base-discriminator 41 | 42 | processor: 43 | sdp: 44 | _ltp_target_: ltp_core.models.processor.WordsWithHead 45 | 46 | heads: 47 | sdp: 48 | _ltp_target_: ltp_core.models.components.graph.BiaffineClassifier 49 | input_size: 768 50 | num_labels: 56 51 | dropout: 0.1 52 | -------------------------------------------------------------------------------- /python/core/configs/model/srl_model.yaml: -------------------------------------------------------------------------------- 1 | _target_: ltp_core.models.lit_model.LTPLitModule 2 | 3 | optimizer: 4 | _ltp_target_: torch.optim.AdamW 5 | _ltp_partial_: true 6 | lr: 2e-5 7 | weight_decay: 0.0 8 | 9 | layer_lrs: 10 | _ltp_target_: ltp_core.models.optimization.layer_lrs.get_layer_lrs_with_crf 11 | _ltp_partial_: true 12 | transformer_prefix: backbone 13 | learning_rate: ${model.optimizer.lr} 14 | layer_decay: 0.8 # 0.8 for Base/Small, 0.9 for Large 15 | 
n_layers: 12 16 | crf_prefix: "crf" 17 | crf_ratio: 10.0 18 | 19 | scheduler: 20 | _ltp_target_: ltp_core.models.optimization.scheduler.compose_with_scheduler 21 | _ltp_partial_: true 22 | scheduler_type: "linear" 23 | scheduler_args: null 24 | warmup_ratio: 0.02 25 | interval: "step" 26 | frequency: 1 27 | 28 | criterions: 29 | srl: 30 | _ltp_target_: ltp_core.models.criterion.token.SRLLoss 31 | 32 | metrics: 33 | srl: 34 | _ltp_target_: ltp_core.models.metrics.token.SRLEvalF1 35 | tags_or_path: ${datamodule.datamodules.load.data_dir}/vocabs/arguments.txt 36 | 37 | model: 38 | _ltp_target_: ltp_core.models.ltp_model.LTPModule 39 | backbone: 40 | _ltp_target_: transformers.AutoModel.from_pretrained 41 | pretrained_model_name_or_path: hfl/chinese-electra-180g-base-discriminator 42 | 43 | processor: 44 | srl: 45 | _ltp_target_: ltp_core.models.processor.WordsOnly 46 | 47 | heads: 48 | srl: 49 | _ltp_target_: ltp_core.models.components.token.BiaffineTokenClassifier 50 | input_size: 768 51 | hidden_size: 300 52 | num_labels: 97 53 | dropout: 0.1 54 | use_crf: True 55 | -------------------------------------------------------------------------------- /python/core/configs/paths/default.yaml: -------------------------------------------------------------------------------- 1 | # path to root directory 2 | # this requires PROJECT_ROOT environment variable to exist 3 | # PROJECT_ROOT is inferred and set by pyrootutils package in `train.py` and `eval.py` 4 | root_dir: ${oc.env:PROJECT_ROOT} 5 | 6 | # path to data directory 7 | data_dir: ${paths.root_dir}/data/ 8 | 9 | # path to logging directory 10 | log_dir: ${paths.root_dir}/logs/ 11 | 12 | # path to output directory, created dynamically by hydra 13 | # path generation pattern is specified in `configs/hydra/default.yaml` 14 | # use it to store all files generated during the run, like ckpts and metrics 15 | output_dir: ${hydra:runtime.output_dir} 16 | 17 | # path to working directory 18 | work_dir: ${hydra:runtime.cwd} 19 | -------------------------------------------------------------------------------- /python/core/configs/train.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # specify here default configuration 4 | # order of defaults determines the order in which configs override each other 5 | defaults: 6 | - _self_ 7 | - datamodule: multi_datamodules.yaml 8 | - model: multi_model.yaml 9 | - callbacks: default.yaml 10 | - logger: null # set logger here or use command line (e.g. `python train.py logger=tensorboard`) 11 | - trainer: default.yaml 12 | - paths: default.yaml 13 | - extras: default.yaml 14 | - hydra: default.yaml 15 | 16 | # experiment configs allow for version control of specific hyperparameters 17 | # e.g. best hyperparameters for given model and datamodule 18 | - experiment: null 19 | 20 | # config for hyperparameter optimization 21 | - hparams_search: null 22 | 23 | # optional local config for machine/user specific settings 24 | # it's optional since it doesn't need to exist and is excluded from version control 25 | - optional local: default.yaml 26 | 27 | # debugging config (enable through command line, e.g. 
`python train.py debug=default`) 28 | - debug: null 29 | 30 | # task name, determines output directory path 31 | task_name: "train" 32 | 33 | # tags to help you identify your experiments 34 | # you can overwrite this in experiment configs 35 | # overwrite from command line with `python train.py tags="[first_tag, second_tag]"` 36 | # appending lists from command line is currently not supported :( 37 | # https://github.com/facebookresearch/hydra/issues/1547 38 | tags: ["dev"] 39 | 40 | # set False to skip model training 41 | train: True 42 | 43 | # evaluate on test set, using best model weights achieved during training 44 | # lightning chooses best weights based on the metric specified in checkpoint callback 45 | test: True 46 | 47 | # simply provide checkpoint path to resume training 48 | ckpt_path: null 49 | 50 | # seed for random number generators in pytorch, numpy and python.random 51 | seed: null 52 | -------------------------------------------------------------------------------- /python/core/configs/trainer/cpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | accelerator: cpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /python/core/configs/trainer/ddp.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | # use "ddp_spawn" instead of "ddp", 5 | # it's slower but normal "ddp" currently doesn't work ideally with hydra 6 | # https://github.com/facebookresearch/hydra/issues/2070 7 | # https://pytorch-lightning.readthedocs.io/en/latest/accelerators/gpu_intermediate.html#distributed-data-parallel-spawn 8 | strategy: ddp_spawn 9 | 10 | accelerator: gpu 11 | devices: 4 12 | num_nodes: 1 13 | sync_batchnorm: True 14 | -------------------------------------------------------------------------------- /python/core/configs/trainer/ddp_sim.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | # simulate DDP on CPU, useful for debugging 5 | accelerator: cpu 6 | devices: 2 7 | strategy: ddp_spawn 8 | -------------------------------------------------------------------------------- /python/core/configs/trainer/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.Trainer 2 | 3 | default_root_dir: ${paths.output_dir} 4 | 5 | min_epochs: 1 # prevents early stopping 6 | max_epochs: 10 7 | 8 | accelerator: cpu 9 | devices: 1 10 | 11 | # mixed precision for extra speed-up 12 | # precision: 16 13 | 14 | # set True to ensure deterministic results 15 | # makes training slower but gives more reproducibility than just setting seeds 16 | deterministic: False 17 | -------------------------------------------------------------------------------- /python/core/configs/trainer/gpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | accelerator: gpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /python/core/configs/trainer/mps.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | accelerator: mps 5 | devices: 1 6 | -------------------------------------------------------------------------------- /python/core/data/.gitkeep:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/data/.gitkeep -------------------------------------------------------------------------------- /python/core/data/conllu/dev.conllu: -------------------------------------------------------------------------------- 1 | 1 他 _ _ r _ 3 SBV 3:AGT _ 2 | 2 点头 _ _ v _ 3 ADV 3:MANN _ 3 | 3 表示 _ _ v _ 0 HED 0:Root _ 4 | 4 同意 _ _ v _ 3 VOB 3:dCONT _ 5 | 5 我 _ _ r _ 7 ATT 7:FEAT _ 6 | 6 的 _ _ u _ 5 RAD 5:mDEPD _ 7 | 7 意见 _ _ n _ 4 VOB 4:CONT _ 8 | 8 。 _ _ wp _ 3 WP 3:mPUNC _ 9 | -------------------------------------------------------------------------------- /python/core/data/conllu/test.conllu: -------------------------------------------------------------------------------- 1 | 1 我们 _ _ r _ 7 SBV 7:AGT _ 2 | 2 即将 _ _ d _ 7 ADV 7:mDEPD _ 3 | 3 以 _ _ p _ 7 ADV 6:mRELA _ 4 | 4 昂扬 _ _ a _ 6 ATT 6:FEAT _ 5 | 5 的 _ _ u _ 4 RAD 4:mDEPD _ 6 | 6 斗志 _ _ n _ 3 POB 7:DATV _ 7 | 7 迎来 _ _ v _ 0 HED 0:Root _ 8 | 8 新 _ _ a _ 11 ATT 11:FEAT _ 9 | 9 的 _ _ u _ 8 RAD 8:mDEPD _ 10 | 10 一 _ _ m _ 11 ATT 11:MEAS _ 11 | 11 年 _ _ q _ 7 VOB 7:TIME _ 12 | 12 。 _ _ wp _ 7 WP 7:mPUNC _ 13 | -------------------------------------------------------------------------------- /python/core/data/conllu/train.conllu: -------------------------------------------------------------------------------- 1 | 1 他 _ _ r _ 2 SBV 2:AGT _ 2 | 2 叫 _ _ v _ 0 HED 0:Root _ 3 | 3 汤姆 _ _ nh _ 2 DBL 2:DATV|4:AGT _ 4 | 4 去 _ _ v _ 2 VOB 2:eSUCC _ 5 | 5 拿 _ _ v _ 4 COO 2:eSUCC|4:eSUCC _ 6 | 6 外衣 _ _ n _ 5 VOB 5:PAT _ 7 | 7 。 _ _ wp _ 2 WP 2:mPUNC _ 8 | -------------------------------------------------------------------------------- /python/core/data/conllu/vocabs/deprel.txt: -------------------------------------------------------------------------------- 1 | ADV 2 | ATT 3 | CMP 4 | COO 5 | DBL 6 | FOB 7 | HED 8 | IOB 9 | LAD 10 | POB 11 | RAD 12 | SBV 13 | VOB 14 | WP 15 | -------------------------------------------------------------------------------- /python/core/data/conllu/vocabs/deps.txt: -------------------------------------------------------------------------------- 1 | AGT 2 | CONT 3 | DATV 4 | EXP 5 | FEAT 6 | LINK 7 | LOC 8 | MANN 9 | MATL 10 | MEAS 11 | PAT 12 | REAS 13 | Root 14 | SCO 15 | STAT 16 | TIME 17 | TOOL 18 | dAGT 19 | dCONT 20 | dDATV 21 | dEXP 22 | dFEAT 23 | dLINK 24 | dLOC 25 | dMANN 26 | dMATL 27 | dMEAS 28 | dPAT 29 | dREAS 30 | dSCO 31 | dSTAT 32 | dTIME 33 | dTOOL 34 | eCOO 35 | ePREC 36 | eSUCC 37 | mDEPD 38 | mNEG 39 | mPUNC 40 | mRELA 41 | rAGT 42 | rCONT 43 | rDATV 44 | rEXP 45 | rFEAT 46 | rLINK 47 | rLOC 48 | rMANN 49 | rMATL 50 | rMEAS 51 | rPAT 52 | rREAS 53 | rSCO 54 | rSTAT 55 | rTIME 56 | rTOOL 57 | -------------------------------------------------------------------------------- /python/core/data/conllu/vocabs/feats.txt: -------------------------------------------------------------------------------- 1 | _ 2 | -------------------------------------------------------------------------------- /python/core/data/conllu/vocabs/lemma.txt: -------------------------------------------------------------------------------- 1 | _ 2 | -------------------------------------------------------------------------------- /python/core/data/conllu/vocabs/upos.txt: -------------------------------------------------------------------------------- 1 | _ 2 | -------------------------------------------------------------------------------- 
/python/core/data/conllu/vocabs/word.txt: -------------------------------------------------------------------------------- 1 | _ 2 | -------------------------------------------------------------------------------- /python/core/data/conllu/vocabs/word_char.txt: -------------------------------------------------------------------------------- 1 | _ 2 | -------------------------------------------------------------------------------- /python/core/data/conllu/vocabs/xpos.txt: -------------------------------------------------------------------------------- 1 | a 2 | b 3 | c 4 | d 5 | e 6 | h 7 | i 8 | j 9 | k 10 | m 11 | n 12 | nd 13 | nh 14 | ni 15 | nl 16 | ns 17 | nt 18 | nz 19 | o 20 | p 21 | q 22 | r 23 | u 24 | v 25 | wp 26 | ws 27 | z 28 | -------------------------------------------------------------------------------- /python/core/data/ner/dev.bio: -------------------------------------------------------------------------------- 1 | 正在 O 2 | 执行 O 3 | 第十四 O 4 | 次 O 5 | 南极 S-Ns 6 | 考察 O 7 | 任务 O 8 | 的 O 9 | 中国 S-Ns 10 | 考察队员 O 11 | , O 12 | 目前 O 13 | 分别 O 14 | 在 O 15 | 长城站 O 16 | 、 O 17 | 中山站 O 18 | 和 O 19 | “ O 20 | 雪龙 O 21 | ” O 22 | 号 O 23 | 船上 O 24 | 。 O 25 | -------------------------------------------------------------------------------- /python/core/data/ner/test.bio: -------------------------------------------------------------------------------- 1 | 编者 O 2 | 的 O 3 | 话 O 4 | ∶ O 5 | 党中央 B-Ni 6 | 国务院 E-Ni 7 | 最近 O 8 | 召开 O 9 | 的 O 10 | 国有 O 11 | 企业 O 12 | 下岗 O 13 | 职工 O 14 | 基本 O 15 | 生活 O 16 | 保障 O 17 | 和 O 18 | 再 O 19 | 就业 O 20 | 工作 O 21 | 会议 O 22 | , O 23 | 提出 O 24 | 要 O 25 | 把 O 26 | 这项 O 27 | 工作 O 28 | 作为 O 29 | 当前 O 30 | 一个 O 31 | 头等 O 32 | 大事 O 33 | 来 O 34 | 抓 O 35 | , O 36 | 并 O 37 | 做 O 38 | 了 O 39 | 全面 O 40 | 的 O 41 | 动员 O 42 | 和 O 43 | 部署 O 44 | , O 45 | 为了 O 46 | 配合 O 47 | 会议 O 48 | 精神 O 49 | 的 O 50 | 贯彻 O 51 | 落实 O 52 | , O 53 | 我们 O 54 | 将 O 55 | 组织 O 56 | 一 O 57 | 系列 O 58 | 报道 O 59 | , O 60 | 多 O 61 | 层次 O 62 | 、 O 63 | 多 O 64 | 侧面 O 65 | 的 O 66 | 宣传 O 67 | 中央 O 68 | 精神 O 69 | , O 70 | 报道 O 71 | 各地 O 72 | 新 O 73 | 经验 O 74 | 、 O 75 | 新 O 76 | 做法 O 77 | 。 O 78 | -------------------------------------------------------------------------------- /python/core/data/ner/train.bio: -------------------------------------------------------------------------------- 1 | 台湾 S-Ns 2 | 是 O 3 | 中国 S-Ns 4 | 领土 O 5 | 不可分割 O 6 | 的 O 7 | 一 O 8 | 部分 O 9 | 。 O 10 | -------------------------------------------------------------------------------- /python/core/data/ner/vocabs/bio.txt: -------------------------------------------------------------------------------- 1 | O 2 | B-Nh 3 | B-Ni 4 | B-Ns 5 | E-Nh 6 | E-Ni 7 | E-Ns 8 | I-Nh 9 | I-Ni 10 | I-Ns 11 | S-Nh 12 | S-Ni 13 | S-Ns 14 | -------------------------------------------------------------------------------- /python/core/data/srl/dev.txt: -------------------------------------------------------------------------------- 1 | 请 Y O O O 2 | 守住 Y O O O 3 | 你 _ O B-ARG1 O 4 | 的 _ O I-ARG1 O 5 | 道德 _ O I-ARG1 O 6 | 底线 _ O I-ARG1 O 7 | , _ O O O 8 | 即使 _ O B-ARGM-ADV B-ARGM-DIS 9 | 你 _ O I-ARGM-ADV B-ARG0 10 | 没有 Y O I-ARGM-ADV O 11 | 一 _ O I-ARGM-ADV B-ARG1 12 | 个 _ O I-ARGM-ADV I-ARG1 13 | 十几 _ O I-ARGM-ADV I-ARG1 14 | 岁 _ O I-ARGM-ADV I-ARG1 15 | 的 _ O I-ARGM-ADV I-ARG1 16 | 女儿 _ O I-ARGM-ADV I-ARG1 17 | 。 _ O O O 18 | -------------------------------------------------------------------------------- /python/core/data/srl/test.txt: -------------------------------------------------------------------------------- 1 | 百团大战 _ B-ARG0 O O O 2 | 的 _ I-ARG0 O 
O O 3 | 战略 _ I-ARG0 O O O 4 | 目的 _ I-ARG0 O O O 5 | 是 Y O O O O 6 | 要 Y B-ARG1 O O O 7 | 打破 Y I-ARG1 O O O 8 | 敌人 _ I-ARG1 O B-ARG1 O 9 | 对 _ I-ARG1 O I-ARG1 O 10 | 根据地 _ I-ARG1 O I-ARG1 O 11 | 的 _ I-ARG1 O I-ARG1 O 12 | 封锁 _ I-ARG1 O I-ARG1 O 13 | , _ O O O O 14 | 因此 _ O O O B-ARGM-DIS 15 | 破路 _ O O O B-ARG0 16 | , _ O O O I-ARG0 17 | 拔 _ O O O I-ARG0 18 | 据点 _ O O O I-ARG0 19 | 十分 _ O O O B-ARGM-ADV 20 | 重要 Y O O O O 21 | 。 _ O O O O 22 | -------------------------------------------------------------------------------- /python/core/data/srl/train.txt: -------------------------------------------------------------------------------- 1 | 站 Y O O B-ARGM-TPC B-ARG0 2 | 在 _ B-ARGM-ADV O I-ARGM-TPC I-ARG0 3 | 楼下 _ I-ARGM-ADV O I-ARGM-TPC I-ARG0 4 | 的 _ O O I-ARGM-TPC I-ARG0 5 | 居民 _ O O I-ARGM-TPC I-ARG0 6 | 很多 _ O O B-ARG0 B-ARG0 7 | 人 _ O O I-ARG0 I-ARG0 8 | 都 _ O O B-ARGM-ADV B-ARGM-ADV 9 | 是 Y O O O O 10 | 捏 Y O O O O 11 | 着 _ O O O O 12 | 鼻子 _ O O B-ARG1 O 13 | 在 _ O O O B-ARGM-LOC 14 | 一旁 _ O O O I-ARGM-LOC 15 | 观看 Y O O O O 16 | 。 _ O O O O 17 | -------------------------------------------------------------------------------- /python/core/data/srl/vocabs/arguments.txt: -------------------------------------------------------------------------------- 1 | O 2 | B-ARG0 3 | B-ARG0-ADV 4 | B-ARG0-CND 5 | B-ARG0-CRD 6 | B-ARG0-MNR 7 | B-ARG0-PRD 8 | B-ARG0-PSE 9 | B-ARG0-PSR 10 | B-ARG0-QTY 11 | B-ARG1 12 | B-ARG1-CRD 13 | B-ARG1-DIS 14 | B-ARG1-FRQ 15 | B-ARG1-PRD 16 | B-ARG1-PSE 17 | B-ARG1-PSR 18 | B-ARG1-QTY 19 | B-ARG1-TPC 20 | B-ARG2 21 | B-ARG2-CRD 22 | B-ARG2-PRD 23 | B-ARG2-PSE 24 | B-ARG2-PSR 25 | B-ARG2-QTY 26 | B-ARG3 27 | B-ARG3-TMP 28 | B-ARG4 29 | B-ARGM-ADV 30 | B-ARGM-BNF 31 | B-ARGM-CND 32 | B-ARGM-CRD 33 | B-ARGM-DGR 34 | B-ARGM-DIR 35 | B-ARGM-DIS 36 | B-ARGM-EXT 37 | B-ARGM-FRQ 38 | B-ARGM-LOC 39 | B-ARGM-MNR 40 | B-ARGM-PRD 41 | B-ARGM-PRP 42 | B-ARGM-QTY 43 | B-ARGM-T 44 | B-ARGM-TMP 45 | B-ARGM-TPC 46 | B-rel-ADV 47 | B-rel-DIS 48 | B-rel-EXT 49 | B-rel-MNR 50 | I-ARG0 51 | I-ARG0-ADV 52 | I-ARG0-CND 53 | I-ARG0-CRD 54 | I-ARG0-MNR 55 | I-ARG0-PRD 56 | I-ARG0-PSE 57 | I-ARG0-PSR 58 | I-ARG0-QTY 59 | I-ARG1 60 | I-ARG1-CRD 61 | I-ARG1-DIS 62 | I-ARG1-FRQ 63 | I-ARG1-PRD 64 | I-ARG1-PSE 65 | I-ARG1-PSR 66 | I-ARG1-QTY 67 | I-ARG1-TPC 68 | I-ARG2 69 | I-ARG2-CRD 70 | I-ARG2-PRD 71 | I-ARG2-PSE 72 | I-ARG2-PSR 73 | I-ARG2-QTY 74 | I-ARG3 75 | I-ARG3-TMP 76 | I-ARG4 77 | I-ARGM-ADV 78 | I-ARGM-BNF 79 | I-ARGM-CND 80 | I-ARGM-CRD 81 | I-ARGM-DGR 82 | I-ARGM-DIR 83 | I-ARGM-DIS 84 | I-ARGM-EXT 85 | I-ARGM-FRQ 86 | I-ARGM-LOC 87 | I-ARGM-MNR 88 | I-ARGM-PRD 89 | I-ARGM-PRP 90 | I-ARGM-QTY 91 | I-ARGM-T 92 | I-ARGM-TMP 93 | I-ARGM-TPC 94 | I-rel-ADV 95 | I-rel-DIS 96 | I-rel-EXT 97 | I-rel-MNR 98 | -------------------------------------------------------------------------------- /python/core/data/srl/vocabs/predicate.txt: -------------------------------------------------------------------------------- 1 | _ 2 | Y 3 | -------------------------------------------------------------------------------- /python/core/logs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/logs/.gitkeep -------------------------------------------------------------------------------- /python/core/ltp_core/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.3" 2 | 
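The .bio files and the bio.txt vocabulary above use a BIOES-style scheme: B-/I-/E- mark the beginning, inside, and end of a multi-token entity, S- a single-token entity, and O everything else. A stripped-down sketch of decoding such tags into spans follows; the repo's real decoder is ltp_core/algorithms/get_entities.py, and this simplified version only illustrates the scheme:

# Simplified BIOES decoder for illustration; the repo's own implementation
# (ltp_core/algorithms/get_entities.py) handles more edge cases.
def decode_bioes(tags):
    """Return (type, start, end) entity spans, end index inclusive."""
    spans, start = [], None
    for i, tag in enumerate(tags):
        prefix, _, etype = tag.partition("-")
        if prefix == "S":
            spans.append((etype, i, i))
            start = None
        elif prefix == "B":
            start = i
        elif prefix == "E" and start is not None:
            spans.append((etype, start, i))
            start = None
        elif prefix != "I":
            start = None  # "O" or a malformed sequence resets the open span

    return spans

# Tag sequences taken from the train.bio and test.bio samples above:
print(decode_bioes(["S-Ns", "O", "S-Ns", "O"]))  # [('Ns', 0, 0), ('Ns', 2, 2)]
print(decode_bioes(["B-Ni", "E-Ni", "O"]))       # [('Ni', 0, 1)]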
-------------------------------------------------------------------------------- /python/core/ltp_core/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | from ltp_extension.algorithms import eisner as rust_eisner 4 | 5 | def eisner(scores, mask, remove_root=False): 6 | scores = scores.view(-1).cpu().numpy() 7 | length = torch.sum(mask, dim=1).cpu().numpy() 8 | 9 | result = torch.nn.utils.rnn.pad_sequence( 10 | [ 11 | torch.tensor(sequence, device=mask.device) 12 | for sequence in rust_eisner(scores.tolist(), length.tolist(), remove_root) 13 | ], 14 | batch_first=True, 15 | padding_value=0, 16 | ) 17 | 18 | return result 19 | 20 | except Exception: 21 | pass 22 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/datamodules/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/adapters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/datamodules/adapters/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/adapters/dependency_parsing.py: -------------------------------------------------------------------------------- 1 | from ltp_core.datamodules.components.conllu import Conllu 2 | from ltp_core.datamodules.utils.datasets import load_dataset 3 | 4 | 5 | def tokenize(examples, tokenizer, max_length): 6 | res = tokenizer( 7 | examples["form"], 8 | is_split_into_words=True, 9 | max_length=max_length, 10 | truncation=True, 11 | ) 12 | word_index = [] 13 | for encoding in res.encodings: 14 | word_index.append([]) 15 | 16 | last_word_idx = -1 17 | current_length = 0 18 | for word_idx in encoding.words[1:-1]: 19 | if word_idx != last_word_idx: 20 | word_index[-1].append(current_length) 21 | current_length += 1 22 | last_word_idx = word_idx 23 | 24 | result = res.data 25 | for ids in result["input_ids"]: 26 | ids[0] = tokenizer.cls_token_id 27 | ids[-1] = tokenizer.sep_token_id 28 | result["overflow"] = [len(encoding.overflowing) > 0 for encoding in res.encodings] 29 | result["word_index"] = word_index 30 | result["word_attention_mask"] = [[True] * len(index) for index in word_index] 31 | return result 32 | 33 | 34 | def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs): 35 | import os 36 | 37 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 38 | dataset = load_dataset(Conllu, data_dir=data_dir, cache_dir=data_dir) 39 | dataset = dataset.remove_columns(["id", "lemma", "upos", "xpos", "feats", "deps", "misc"]) 40 | dataset = dataset.rename_column("deprel", "labels") 41 | dataset = dataset.map(lambda examples: tokenize(examples, tokenizer, max_length), batched=True) 42 | dataset = dataset.filter(lambda x: not x["overflow"]) 43 | dataset.set_format( 44 | type="torch", 45 | columns=[ 46 | "input_ids", 47 | "token_type_ids", 48 | "attention_mask", 49 | "word_index", 50 | "word_attention_mask", 51 | "head", 52 | "labels", 53 | ], 54 | ) 55 | return dataset 56 | -------------------------------------------------------------------------------- 
/python/core/ltp_core/datamodules/adapters/named_entity_recognition.py: -------------------------------------------------------------------------------- 1 | from ltp_core.datamodules.adapters.postagger import tokenize 2 | from ltp_core.datamodules.components.bio import Bio 3 | from ltp_core.datamodules.utils.datasets import load_dataset 4 | 5 | 6 | def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs): 7 | import os 8 | 9 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 10 | dataset = load_dataset(Bio, data_dir=data_dir, cache_dir=data_dir) 11 | dataset = dataset.rename_column("bio", "labels") 12 | dataset = dataset.map(lambda examples: tokenize(examples, tokenizer, max_length), batched=True) 13 | dataset = dataset.filter(lambda x: not x["overflow"]) 14 | dataset.set_format( 15 | type="torch", 16 | columns=[ 17 | "input_ids", 18 | "token_type_ids", 19 | "attention_mask", 20 | "word_index", 21 | "word_attention_mask", 22 | "labels", 23 | ], 24 | ) 25 | return dataset 26 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/adapters/postagger.py: -------------------------------------------------------------------------------- 1 | from ltp_core.datamodules.components.conllu import Conllu 2 | from ltp_core.datamodules.utils.datasets import load_dataset 3 | 4 | 5 | def tokenize(examples, tokenizer, max_length, char_base=False): 6 | """Tokenize a batch of pre-split examples. 7 | 8 | Args: 9 | examples: batch with a "form" column of pre-split words 10 | tokenizer: huggingface tokenizer 11 | max_length: truncation length 12 | char_base: whether "form" (i.e. the words) in examples is already character-level 13 | 14 | Returns: 15 | encodings dict, with word_index/word_attention_mask added when not char_base 16 | """ 17 | res = tokenizer.batch_encode_plus( 18 | examples["form"], 19 | is_split_into_words=True, 20 | max_length=max_length, 21 | truncation=True, 22 | ) 23 | result = res.data 24 | for ids in result["input_ids"]: 25 | ids[0] = tokenizer.cls_token_id 26 | ids[-1] = tokenizer.sep_token_id 27 | result["overflow"] = [len(encoding.overflowing) > 0 for encoding in res.encodings] 28 | 29 | if not char_base: 30 | word_index = [] 31 | for encoding in res.encodings: 32 | word_index.append([]) 33 | 34 | last_word_idx = -1 35 | current_length = 0 36 | for word_idx in encoding.words[1:-1]: 37 | if word_idx != last_word_idx: 38 | word_index[-1].append(current_length) 39 | current_length += 1 40 | last_word_idx = word_idx 41 | result["word_index"] = word_index 42 | result["word_attention_mask"] = [[True] * len(index) for index in word_index] 43 | return result 44 | 45 | 46 | def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs): 47 | import os 48 | 49 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 50 | dataset = load_dataset(Conllu, data_dir=data_dir, cache_dir=data_dir) 51 | dataset = dataset.remove_columns(["id", "lemma", "upos", "feats", "head", "deprel", "deps", "misc"]) 52 | dataset = dataset.rename_column("xpos", "labels") 53 | dataset = dataset.map(lambda examples: tokenize(examples, tokenizer, max_length), batched=True) 54 | dataset = dataset.filter(lambda x: not x["overflow"]) 55 | dataset.set_format( 56 | type="torch", 57 | columns=[ 58 | "input_ids", 59 | "token_type_ids", 60 | "attention_mask", 61 | "word_index", 62 | "word_attention_mask", 63 | "labels", 64 | ], 65 | ) 66 | return dataset 67 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/adapters/segmention.py: -------------------------------------------------------------------------------- 1 | from ltp_core.datamodules.components.conllu import Conllu 2 | from ltp_core.datamodules.utils.datasets
import load_dataset 3 | 4 | PREFIX_B = 0 5 | PREFIX_I = 1 6 | PREFIX_M = 1 7 | PREFIX_E = 2 8 | PREFIX_S = 3 9 | 10 | 11 | def length2bi(length): 12 | if length == 0: 13 | return [] 14 | elif length == 1: 15 | return [PREFIX_B] 16 | elif length == 2: 17 | return [PREFIX_B, PREFIX_I] 18 | else: 19 | return [PREFIX_B] + [PREFIX_I] * (length - 1) 20 | 21 | 22 | def length2bmes(length): 23 | if length == 0: 24 | return [] 25 | elif length == 1: 26 | return [PREFIX_S] 27 | elif length == 2: 28 | return [PREFIX_B, PREFIX_E] 29 | elif length == 3: 30 | return [PREFIX_B, PREFIX_M, PREFIX_E] 31 | else: 32 | return [PREFIX_B] + [PREFIX_M] * (length - 2) + [PREFIX_E] 33 | 34 | 35 | def tokenize(examples, tokenizer, max_length, length2labels=length2bi): 36 | res = tokenizer( 37 | examples["form"], 38 | is_split_into_words=True, 39 | max_length=max_length, 40 | truncation=True, 41 | ) 42 | labels = [] 43 | for encoding in res.encodings: 44 | labels.append([]) 45 | last_word_idx = -1 46 | word_length = 0 47 | for word_idx in encoding.words[1:-1]: 48 | if word_idx == last_word_idx: 49 | word_length += 1 50 | else: 51 | labels[-1].extend(length2labels(word_length)) 52 | last_word_idx = word_idx 53 | word_length = 1 54 | labels[-1].extend(length2labels(word_length)) 55 | 56 | result = res.data 57 | for ids in res["input_ids"]: 58 | ids[0] = tokenizer.cls_token_id 59 | ids[-1] = tokenizer.sep_token_id 60 | result["overflow"] = [len(encoding.overflowing) > 0 for encoding in res.encodings] 61 | result["labels"] = labels 62 | return result 63 | 64 | 65 | def build_dataset(data_dir, task_name, tokenizer, max_length=512, mode="bmes", **kwargs): 66 | import os 67 | 68 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 69 | dataset = load_dataset(Conllu, data_dir=data_dir, cache_dir=data_dir) 70 | dataset = dataset.remove_columns(["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc"]) 71 | if mode == "bmes": 72 | dataset = dataset.map( 73 | lambda examples: tokenize(examples, tokenizer, max_length, length2bmes), 74 | batched=True, 75 | ) 76 | elif mode == "bi": 77 | dataset = dataset.map( 78 | lambda examples: tokenize(examples, tokenizer, max_length, length2bi), 79 | batched=True, 80 | ) 81 | else: 82 | raise NotImplementedError(f"not supported {mode} mode") 83 | dataset = dataset.filter(lambda x: not x["overflow"]) 84 | dataset.set_format( 85 | type="torch", 86 | columns=["input_ids", "token_type_ids", "attention_mask", "labels"], 87 | ) 88 | return dataset 89 | 90 | 91 | def main(): 92 | from transformers import AutoTokenizer 93 | 94 | tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base") 95 | dataset = build_dataset(data_dir="data/seg", task_name="seg", tokenizer=tokenizer, mode="bmes") 96 | print(dataset) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/adapters/semantic_dependency_parsing.py: -------------------------------------------------------------------------------- 1 | from ltp_core.datamodules.components.conllu import Conllu 2 | from ltp_core.datamodules.utils.datasets import load_dataset 3 | 4 | 5 | def tokenize(examples, tokenizer, max_length): 6 | res = tokenizer( 7 | examples["form"], 8 | is_split_into_words=True, 9 | max_length=max_length, 10 | truncation=True, 11 | ) 12 | word_index = [] 13 | for encoding in res.encodings: 14 | word_index.append([]) 15 | 16 | last_word_idx = -1 17 | current_length = 0 18 | for word_idx in 
encoding.words[1:-1]: 19 | if word_idx != last_word_idx: 20 | word_index[-1].append(current_length) 21 | current_length += 1 22 | last_word_idx = word_idx 23 | 24 | heads = [] 25 | labels = [] 26 | for forms, deps in zip(examples["form"], examples["deps"]): 27 | sentence_len = len(forms) 28 | heads.append([[0 for j in range(sentence_len + 1)] for i in range(sentence_len)]) 29 | labels.append([[0 for j in range(sentence_len + 1)] for i in range(sentence_len)]) 30 | for idx, head, rel in zip(deps["id"], deps["head"], deps["rel"]): 31 | heads[-1][idx][head] = 1 32 | labels[-1][idx][head] = rel 33 | 34 | result = res.data 35 | for ids in result["input_ids"]: 36 | ids[0] = tokenizer.cls_token_id 37 | ids[-1] = tokenizer.sep_token_id 38 | result["overflow"] = [len(encoding.overflowing) > 0 for encoding in res.encodings] 39 | result["word_index"] = word_index 40 | result["word_attention_mask"] = [[True] * len(index) for index in word_index] 41 | 42 | result["head"] = heads 43 | result["labels"] = labels 44 | for word_index, head in zip(result["word_index"], result["head"]): 45 | assert len(word_index) == len(head) 46 | return result 47 | 48 | 49 | def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs): 50 | import os 51 | 52 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 53 | dataset = load_dataset(Conllu, data_dir=data_dir, cache_dir=data_dir) 54 | dataset = dataset.remove_columns(["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "misc"]) 55 | dataset = dataset.map(lambda examples: tokenize(examples, tokenizer, max_length), batched=True) 56 | dataset = dataset.filter(lambda x: not x["overflow"]) 57 | dataset.set_format( 58 | type="torch", 59 | columns=[ 60 | "input_ids", 61 | "token_type_ids", 62 | "attention_mask", 63 | "word_index", 64 | "word_attention_mask", 65 | "head", 66 | "labels", 67 | ], 68 | ) 69 | return dataset 70 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/adapters/semantic_role_labeling.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | from ltp_core.datamodules.components.srl import Srl 4 | from ltp_core.datamodules.utils.datasets import load_dataset 5 | 6 | 7 | def tokenize(examples, tokenizer, max_length): 8 | res = tokenizer( 9 | examples["form"], 10 | is_split_into_words=True, 11 | max_length=max_length, 12 | truncation=True, 13 | ) 14 | word_index = [] 15 | for encoding in res.encodings: 16 | word_index.append([]) 17 | 18 | last_word_idx = -1 19 | current_length = 0 20 | for word_idx in encoding.words[1:-1]: 21 | if word_idx != last_word_idx: 22 | word_index[-1].append(current_length) 23 | current_length += 1 24 | last_word_idx = word_idx 25 | 26 | labels = [] 27 | for predicates, roles in zip(examples["predicate"], examples["arguments"]): 28 | sentence_len = len(predicates) 29 | labels.append(numpy.zeros((sentence_len, sentence_len), dtype=numpy.int64)) 30 | 31 | for idx, predicate in enumerate(predicates): 32 | if predicate == 1: 33 | srl = numpy.asarray(roles.pop(0), dtype=numpy.int64) 34 | labels[-1][idx, :] = srl 35 | 36 | result = res.data 37 | for ids in result["input_ids"]: 38 | ids[0] = tokenizer.cls_token_id 39 | ids[-1] = tokenizer.sep_token_id 40 | result["overflow"] = [len(encoding.overflowing) > 0 for encoding in res.encodings] 41 | result["word_index"] = word_index 42 | result["word_attention_mask"] = [[True] * len(index) for index in word_index] 43 | 44 | result["labels"] = labels 45 | return result 
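# Illustrative note (not part of the original file): the loop above turns the
# flat SRL annotation into a square label matrix. For a 4-token sentence with
# predicates = [0, 1, 0, 0] and a single argument row
# roles = [[0, 0, 7, 8]] (role ids, 0 = "O"), pop(0) consumes that row for the
# predicate at index 1, so labels becomes a 4x4 matrix with
# labels[1] = [0, 0, 7, 8] and every non-predicate row left all zeros; each
# predicate consumes the next row of `arguments` in order.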
46 | 47 | 48 | def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs): 49 | import os 50 | 51 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 52 | dataset = load_dataset(Srl, data_dir=data_dir, cache_dir=data_dir) 53 | dataset = dataset.map(lambda examples: tokenize(examples, tokenizer, max_length), batched=True) 54 | dataset = dataset.filter(lambda x: not x["overflow"]) 55 | dataset.set_format( 56 | type="torch", 57 | columns=[ 58 | "input_ids", 59 | "token_type_ids", 60 | "attention_mask", 61 | "word_index", 62 | "word_attention_mask", 63 | "labels", 64 | ], 65 | ) 66 | return dataset 67 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/adapters/sentence_classification.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | # todo: implement 5 | def build_dataset(task_name): 6 | import os 7 | 8 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 9 | load_dataset("glue", task_name) 10 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/datamodules/components/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # Author: Yunlong Feng 3 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/utils/collate.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # Author: Yunlong Feng 3 | 4 | import torch 5 | from torch._six import string_classes 6 | from torch.utils.data._utils.collate import ( 7 | default_collate_err_msg_format, 8 | np_str_obj_array_pattern, 9 | ) 10 | 11 | _TORCH_MAJOR, _TORCH_MINOR = map(int, torch.__version__.split(".")[0:2]) 12 | 13 | if _TORCH_MAJOR < 1 or (_TORCH_MAJOR == 1 and _TORCH_MINOR < 8): 14 | from torch._six import container_abcs, int_classes 15 | else: 16 | int_classes = int 17 | import collections.abc as container_abcs 18 | 19 | 20 | def collate(batch): 21 | r"""Puts each data field into a tensor with outer dimension batch size""" 22 | 23 | elem = batch[0] 24 | elem_type = type(elem) 25 | if isinstance(elem, torch.Tensor): 26 | try: 27 | out = None 28 | if torch.utils.data.get_worker_info() is not None: 29 | # If we're in a background process, concatenate directly into a 30 | # shared memory tensor to avoid an extra copy 31 | numel = sum(x.numel() for x in batch) 32 | storage = elem.storage()._new_shared(numel, device=elem.device) 33 | out = elem.new(storage).resize_(len(batch), *list(elem.size())) 34 | return torch.stack(batch, 0, out=out) 35 | except Exception: 36 | return torch.nn.utils.rnn.pad_sequence(batch, batch_first=True) 37 | elif elem_type.__module__ == "numpy" and elem_type.__name__ != "str_" and elem_type.__name__ != "string_": 38 | elem = batch[0] 39 | if elem_type.__name__ == "ndarray": 40 | # array of string classes and object 41 | if np_str_obj_array_pattern.search(elem.dtype.str) is not None: 42 | raise TypeError(default_collate_err_msg_format.format(elem.dtype)) 43 | 44 | return collate([torch.as_tensor(b) for b in batch]) 45 | elif elem.shape == (): # scalars 46 | return torch.as_tensor(batch) 47 | elif isinstance(elem, float): 48 | return torch.tensor(batch, dtype=torch.float64) 49 | elif isinstance(elem, int_classes): 50 | return torch.tensor(batch) 51 | elif isinstance(elem, string_classes): 52 | return batch 53 | elif isinstance(elem, container_abcs.Mapping): 54 | return {key: collate([d[key] for d in batch]) for key in elem} 55 | elif isinstance(elem, tuple) and hasattr(elem, "_fields"): # namedtuple 56 | return elem_type(*(collate(samples) for samples in zip(*batch))) 57 | elif isinstance(elem, container_abcs.Sequence): 58 | # check to make sure that the elements in batch have consistent size 59 | batch = [torch.stack(it) for it in batch] 60 | elem_sizes = [it.shape for it in batch] 61 | max_sizes = (max(sizes) for sizes in zip(*elem_sizes)) 62 | batched = torch.zeros(len(batch), *max_sizes, dtype=batch[0].dtype) 63 | for idx, (elem, elem_size) in enumerate(zip(batch, elem_sizes)): 64 | size_1, size_2 = elem_size 65 | batched[idx, :size_1, :size_2] = elem 66 | return batched 67 | 68 | raise TypeError(default_collate_err_msg_format.format(elem_type)) 69 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/utils/datasets.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping, Optional, Sequence, Union 2 | 3 | from datasets import Dataset, DatasetBuilder, DatasetDict, Features, Split 4 | 5 | 6 | def load_dataset( 7 | builder_cls: type, 8 | config_name: Optional[str] = None, 9 | data_dir: Optional[str] = None, 10 | data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None, 11 | split: Optional[Union[str, Split]] = None, 12 | cache_dir: Optional[str] = None, 13 | features: Optional[Features] = None, 14 | 
save_infos: bool = False, 15 | **config_kwargs, 16 | ) -> Union[DatasetDict, Dataset]: 17 | # Instantiate the dataset builder 18 | builder_instance: DatasetBuilder = builder_cls( 19 | cache_dir=cache_dir, 20 | config_name=config_name, 21 | data_dir=data_dir, 22 | data_files=data_files, 23 | hash=hash, 24 | features=features, 25 | **config_kwargs, 26 | ) 27 | 28 | # Download and prepare data 29 | builder_instance.download_and_prepare() 30 | 31 | # Build dataset for splits 32 | ds = builder_instance.as_dataset(split=split) 33 | if save_infos: 34 | builder_instance._save_infos() 35 | 36 | return ds 37 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/utils/iterator.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # Author: Yunlong Feng 3 | 4 | import codecs 5 | 6 | 7 | def iter_raw_lines(filename: str, strip=None, skip: str = None): 8 | line_num = 0 9 | with codecs.open(filename, encoding="utf-8") as file: 10 | while True: 11 | line = file.readline() 12 | line_num += 1 13 | if skip is not None and line.startswith(skip): 14 | continue 15 | if not line: # EOF 16 | yield line_num, "" # yield an empty line to simplify the caller's logic 17 | break 18 | line = line.strip(strip) 19 | yield line_num, line 20 | 21 | 22 | def iter_lines(filename: str, split=None, strip=None, skip: str = None): 23 | for line_num, raw_line in iter_raw_lines(filename=filename, strip=strip, skip=skip): 24 | if not raw_line: # end of a sentence 25 | yield line_num, [] # yield an empty line 26 | else: 27 | yield line_num, raw_line.split(split) 28 | 29 | 30 | def iter_blocks(filename: str, split=None, strip=None, skip="#"): 31 | rows = [] 32 | for line_num, line_features in iter_lines(filename, split=split, strip=strip, skip=skip): 33 | if len(line_features): 34 | rows.append(line_features) 35 | else: 36 | if len(rows): 37 | yield line_num, rows 38 | rows = [] 39 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/utils/multitask_dataloader.py: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/env python 2 | # Author: Yunlong Feng 3 | 4 | import numpy as np 5 | 6 | 7 | def cycle(iterable): 8 | while True: 9 | yield from iterable 10 | 11 | 12 | class MultiTaskDataloader: 13 | def __init__(self, tau=1.0, **dataloaders): 14 | self.dataloaders = dataloaders 15 | 16 | Z = sum(pow(v, tau) for v in self.dataloader_sizes.values()) 17 | self.tasknames, self.sampling_weights = zip(*((k, pow(v, tau) / Z) for k, v in self.dataloader_sizes.items())) 18 | self.dataiters = {k: cycle(v) for k, v in dataloaders.items()} 19 | 20 | @property 21 | def dataloader_sizes(self): 22 | if not hasattr(self, "_dataloader_sizes"): 23 | self._dataloader_sizes = {k: len(v) for k, v in self.dataloaders.items()} 24 | return self._dataloader_sizes 25 | 26 | def __len__(self): 27 | return sum(v for k, v in self.dataloader_sizes.items()) 28 | 29 | def __iter__(self): 30 | for i in range(len(self)): 31 | taskname = np.random.choice(self.tasknames, p=self.sampling_weights) 32 | dataiter = self.dataiters[taskname] 33 | batch = next(dataiter) 34 | 35 | batch["task_name"] = taskname 36 | 37 | yield batch 38 | -------------------------------------------------------------------------------- /python/core/ltp_core/datamodules/utils/vocab_helper.py: -------------------------------------------------------------------------------- 1 | def vocab_builder(func): 2 | from datasets import BuilderConfig 3 | 4 | def func_wrapper(config: BuilderConfig, **kwargs): 5 | """We handle string, list and dicts in datafiles.""" 6 | if not config.data_files: 7 | raise ValueError(f"At least one data file must be specified, but got data_files={config.data_files}") 8 | data_files = config.data_files 9 | if isinstance(data_files, (str, list, tuple)): 10 | files = data_files 11 | if isinstance(files, str): 12 | files = [files] 13 | else: 14 | files = [] 15 | for file_list in data_files.values(): 16 | if isinstance(file_list, str): 17 | files.append(file_list) 18 | else: 19 | files.extend(file_list) 20 | res = func(config.data_dir, *files, **kwargs) 21 | return res 22 | 23 | return func_wrapper 24 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/components/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/components/graph.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # Author: Yunlong Feng 3 | from collections import namedtuple 4 | 5 | from torch import nn 6 | 7 | from ltp_core.models.nn.biaffine import Biaffine 8 | from ltp_core.models.nn.mlp import MLP 9 | 10 | GraphResult = namedtuple("GraphResult", ["arc_logits", "rel_logits", "attention_mask"]) 11 | 12 | 13 | class BiaffineClassifier(nn.Module): 14 | def __init__( 15 | self, 16 | input_size, 17 | num_labels, 18 | dropout=0.1, 19 | arc_hidden_size=500, 20 | rel_hidden_size=100, 21 | ): 22 | super().__init__() 23 | 24 | self.label_num = num_labels 25 | self.input_size = input_size 26 | self.arc_hidden_size = arc_hidden_size 27 | self.rel_hidden_size = rel_hidden_size 28 | 29 | self.mlp_arc = MLP( 30 | [input_size, arc_hidden_size * 2], 31 | output_dropout=dropout, 32 | output_activation=nn.ReLU, 33 | ) 34 | self.mlp_rel = MLP( 35 | [input_size, rel_hidden_size * 2], 36 | output_dropout=dropout, 37 | output_activation=nn.ReLU, 38 | ) 39 | 40 | self.arc_atten = Biaffine(arc_hidden_size, arc_hidden_size, 1, bias_x=True, bias_y=False) 41 | self.rel_atten = Biaffine(rel_hidden_size, rel_hidden_size, num_labels, bias_x=True, bias_y=True) 42 | 43 | def forward(self, hidden_states, attention_mask=None): 44 | bs, seqlen = hidden_states.shape[:2] 45 | 46 | arc = self.mlp_arc(hidden_states) 47 | arc = arc.view(bs, seqlen, 2, self.arc_hidden_size) 48 | arc_h, arc_d = arc.unbind(axis=-2) 49 | 50 | rel = self.mlp_rel(hidden_states) 51 | rel = rel.view(bs, seqlen, 2, self.rel_hidden_size) 52 | rel_h, rel_d = rel.unbind(axis=-2) 53 | 54 | s_arc = self.arc_atten(arc_d, arc_h).squeeze_(1) 55 | s_rel = self.rel_atten(rel_d, rel_h).permute(0, 2, 3, 1) 56 | 57 | return GraphResult(arc_logits=s_arc, rel_logits=s_rel, attention_mask=attention_mask) 58 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/components/sent.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | from torch import nn 4 | 5 | from ltp_core.models.nn.mlp import MLP 6 | 7 | SentClassifierResult = namedtuple("SentClassifierResult", ["logits"]) 8 | 9 | 10 | class MLPClassifier(nn.Module): 11 | def __init__( 12 | self, 13 | input_size, 14 | num_labels, 15 | dropout=0.1, 16 | hidden_sizes=None, 17 | ): 18 | super().__init__() 19 | if hidden_sizes is not None: 20 | self.classifier = MLP([input_size, *hidden_sizes, num_labels], dropout=dropout) 21 | else: 22 | self.classifier = MLP([input_size, num_labels], dropout=dropout) 23 | 24 | def forward(self, hidden_states, attention_mask=None) -> SentClassifierResult: 25 | logits = self.classifier(hidden_states) 26 | return SentClassifierResult(logits=logits) 27 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/criterion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/criterion/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/criterion/graph.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, Module 4 | 5 | from ltp_core.models.components.graph import GraphResult 6 | 7 | 8 | class DEPLoss(Module): 9 | def __init__(self, 
loss_interpolation=0.4): 10 | super().__init__() 11 | self.loss_interpolation = loss_interpolation 12 | 13 | def forward(self, result: GraphResult, head: Tensor, labels: Tensor, **kwargs): 14 | s_arc = result.arc_logits 15 | s_rel = result.rel_logits 16 | attention_mask = result.attention_mask 17 | 18 | arc_loss = CrossEntropyLoss() 19 | rel_loss = CrossEntropyLoss() 20 | 21 | # ignore the first token of each sentence 22 | s_arc = s_arc[:, 1:, :] 23 | s_rel = s_rel[:, 1:, :] 24 | 25 | # Only keep active parts of the loss 26 | active_heads = head[attention_mask] 27 | active_labels = labels[attention_mask] 28 | s_arc, s_rel = s_arc[attention_mask], s_rel[attention_mask] 29 | 30 | s_rel = s_rel[torch.arange(len(active_heads)), active_heads] 31 | 32 | arc_loss = arc_loss(s_arc, active_heads) 33 | rel_loss = rel_loss(s_rel, active_labels) 34 | loss = 2 * ((1 - self.loss_interpolation) * arc_loss + self.loss_interpolation * rel_loss) 35 | 36 | return loss 37 | 38 | 39 | class SDPLoss(Module): 40 | def __init__(self, loss_interpolation=0.4): 41 | super().__init__() 42 | self.loss_interpolation = loss_interpolation 43 | 44 | def forward(self, result: GraphResult, head: Tensor, labels: Tensor, **kwargs): 45 | s_arc = result.arc_logits 46 | s_rel = result.rel_logits 47 | attention_mask = result.attention_mask 48 | 49 | head_loss = BCEWithLogitsLoss() 50 | rel_loss = CrossEntropyLoss() 51 | 52 | # ignore the first token of each sentence 53 | s_arc = s_arc[:, 1:, :] 54 | s_rel = s_rel[:, 1:, :] 55 | 56 | # attention mask 57 | attention_mask = attention_mask.unsqueeze(-1).expand_as(s_arc) 58 | 59 | arc_loss = head_loss(s_arc[attention_mask], head[attention_mask].float()) 60 | rel_loss = rel_loss(s_rel[head > 0], labels[head > 0]) 61 | 62 | loss = 2 * ((1 - self.loss_interpolation) * arc_loss + self.loss_interpolation * rel_loss) 63 | 64 | return loss 65 | 66 | 67 | class DEPDistillLoss(DEPLoss): 68 | def forward(self, result: GraphResult, head: Tensor, labels: Tensor, **kwargs): 69 | return super().forward(result, head, labels, **kwargs) 70 | 71 | 72 | class SDPDistillLoss(SDPLoss): 73 | def forward(self, result: GraphResult, head: Tensor, labels: Tensor, **kwargs): 74 | return super().forward(result, head, labels, **kwargs) 75 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/criterion/sent.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch.nn import CrossEntropyLoss, Module 3 | 4 | from ltp_core.models.components.sent import SentClassifierResult 5 | 6 | 7 | class ClassificationLoss(Module): 8 | def forward(self, result: SentClassifierResult, labels: Tensor, **kwargs) -> Tensor: 9 | logits = result.logits 10 | num_tags = logits.shape[-1] 11 | 12 | loss_fct = CrossEntropyLoss() 13 | loss = loss_fct(logits.view(-1, num_tags), labels.view(-1)) 14 | return loss 15 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/functional/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/functional/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/functional/distill.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 
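# --- Editorial aside: a minimal usage sketch for the helpers defined below,
# not part of the original module. `student_logits` and `teacher_logits` are
# hypothetical tensors of shape (batch_size, length, num_labels); everything
# else is defined in this file:
#
#     scheduler = flsw_temperature_scheduler_builder(beta=1, gamma=2, base_temperature=8)
#     tau = scheduler(student_logits, teacher_logits)  # per-position temperatures, shape (batch_size, length)
#     loss = kd_ce_loss(student_logits, teacher_logits, temperature=tau)
#
# The FLSW scheduler raises the temperature where student and teacher already
# agree (cosine similarity near 1) and lowers it where they disagree, so the
# harder positions are trained against sharper teacher distributions.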
4 | 5 | def flsw_temperature_scheduler_builder(beta=1, gamma=2, base_temperature=8, eps=1e-4, *args): 6 | """adapted from arXiv:1911.07471.""" 7 | 8 | def flsw_temperature_scheduler(logits_S, logits_T): 9 | v = logits_S.detach() 10 | t = logits_T.detach() 11 | with torch.no_grad(): 12 | v = v / (torch.norm(v, dim=-1, keepdim=True) + eps) 13 | t = t / (torch.norm(t, dim=-1, keepdim=True) + eps) 14 | w = torch.pow((1 - (v * t).sum(dim=-1)), gamma) 15 | tau = base_temperature + (w.mean() - w) * beta 16 | return tau 17 | 18 | return flsw_temperature_scheduler 19 | 20 | 21 | def kd_ce_loss(logits_S, logits_T, temperature=1): 22 | """Calculate the cross entropy between logits_S and logits_T. 23 | 24 | :param logits_S: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels) 25 | :param logits_T: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels) 26 | :param temperature: A float or a tensor of shape (batch_size, length) or (batch_size,) 27 | """ 28 | if isinstance(temperature, torch.Tensor) and temperature.dim() > 0: 29 | temperature = temperature.unsqueeze(-1) 30 | beta_logits_T = logits_T / temperature 31 | beta_logits_S = logits_S / temperature 32 | p_T = F.softmax(beta_logits_T, dim=-1) 33 | loss = -(p_T * F.log_softmax(beta_logits_S, dim=-1)) 34 | return (temperature * temperature * loss).sum(dim=-1).mean() 35 | 36 | 37 | def kd_mse_loss(logits_S, logits_T, temperature=1): 38 | """Calculate the mse loss between logits_S and logits_T. 39 | 40 | :param logits_S: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels) 41 | :param logits_T: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels) 42 | :param temperature: A float or a tensor of shape (batch_size, length) or (batch_size,) 43 | """ 44 | if isinstance(temperature, torch.Tensor) and temperature.dim() > 0: 45 | temperature = temperature.unsqueeze(-1) 46 | beta_logits_T = logits_T / temperature 47 | beta_logits_S = logits_S / temperature 48 | loss = F.mse_loss(beta_logits_S, beta_logits_T, reduction="none") 49 | return (temperature * temperature * loss).mean() 50 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/functional/multilabel_categorical_crossentropy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | # ref: https://github.com/bojone/bert4keras/blob/master/bert4keras/backend.py 4 | 5 | INF = 1e4 6 | EPSILON = 1e-5 7 | 8 | 9 | def multilabel_categorical_crossentropy(y_true, y_pred, mask_zero=False): 10 | """多标签分类的交叉熵 11 | 说明: 12 | 1. y_true和y_pred的shape一致,y_true的元素非0即1, 13 | 1表示对应的类为目标类,0表示对应的类为非目标类; 14 | 2. 请保证y_pred的值域是全体实数,换言之一般情况下 15 | y_pred不用加激活函数,尤其是不能加sigmoid或者 16 | softmax; 17 | 3. 预测阶段则输出y_pred大于0的类; 18 | 4. 详情请看:https://kexue.fm/archives/7359 。 19 | """ 20 | y_pred = (1 - 2 * y_true) * y_pred 21 | y_neg = y_pred - y_true * INF 22 | y_pos = y_pred - (1 - y_true) * INF 23 | zeros = torch.zeros_like(y_pred[..., :1]) 24 | y_neg = torch.cat([y_neg, zeros], dim=-1) 25 | y_pos = torch.cat([y_pos, zeros], dim=-1) 26 | neg_loss = torch.logsumexp(y_neg, dim=-1) 27 | pos_loss = torch.logsumexp(y_pos, dim=-1) 28 | return pos_loss + neg_loss 29 | 30 | 31 | def sparse_multilabel_categorical_crossentropy(y_true, y_pred, mask_zero=False): 32 | """稀疏版多标签分类的交叉熵 33 | 说明: 34 | 1. y_true.shape=[..., num_positive], 35 | y_pred.shape=[..., num_classes]; 36 | 2. 
请保证y_pred的值域是全体实数,换言之一般情况下 37 | y_pred不用加激活函数,尤其是不能加sigmoid或者 38 | softmax; 39 | 3. 预测阶段则输出y_pred大于0的类; 40 | 4. 详情请看:https://kexue.fm/archives/7359 。 41 | """ 42 | zeros = torch.zeros_like(y_pred[..., :1]) 43 | y_pred = torch.cat([y_pred, zeros], dim=-1) 44 | 45 | if mask_zero: 46 | infs = zeros + INF 47 | y_pred = torch.cat([infs, y_pred[..., 1:]], dim=-1) 48 | 49 | y_pos_2 = torch.gather(y_pred, index=y_true, dim=-1) 50 | y_pos_1 = torch.cat([y_pos_2, zeros], dim=-1) 51 | 52 | if mask_zero: 53 | y_pred = torch.cat([-infs, y_pred[..., 1:]], dim=-1) 54 | y_pos_2 = torch.gather(y_pred, index=y_true, dim=-1) 55 | 56 | pos_loss = torch.logsumexp(-y_pos_1, dim=-1) 57 | all_loss = torch.logsumexp(y_pred, dim=-1) 58 | aux_loss = torch.logsumexp(y_pos_2, dim=-1) - all_loss 59 | aux_loss = torch.clamp(1 - torch.exp(aux_loss), min=EPSILON, max=1) 60 | neg_loss = all_loss + torch.log(aux_loss) 61 | return pos_loss + neg_loss 62 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/ltp_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import ModuleDict 6 | from transformers.modeling_outputs import BaseModelOutput 7 | 8 | 9 | class LTPModule(nn.Module): 10 | def __init__( 11 | self, 12 | backbone: nn.Module, 13 | heads: Dict[str, nn.Module], 14 | processor: Dict[str, nn.Module], 15 | ): 16 | super().__init__() 17 | self.backbone = backbone 18 | self.processor = ModuleDict(processor) 19 | self.task_heads = ModuleDict(heads) 20 | 21 | def forward( 22 | self, 23 | task_name: str, 24 | input_ids: torch.Tensor, 25 | attention_mask: torch.Tensor, 26 | token_type_ids: torch.Tensor = None, 27 | word_index: torch.Tensor = None, 28 | word_attention_mask: torch.Tensor = None, 29 | ): 30 | outputs: BaseModelOutput = self.backbone(input_ids, attention_mask, token_type_ids) 31 | hidden_state, attention_mask = self.processor[task_name]( 32 | outputs, attention_mask, word_index, word_attention_mask 33 | ) 34 | return self.task_heads[task_name](hidden_state, attention_mask) 35 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/metrics/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/metrics/sent.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from torch import Tensor 4 | from torchmetrics import Accuracy 5 | 6 | from ltp_core.models.components.sent import SentClassifierResult 7 | 8 | 9 | class ClsAccuracy(Accuracy): 10 | is_differentiable: bool = False 11 | higher_is_better: Optional[bool] = True 12 | full_state_update: bool = False 13 | 14 | def update(self, result: SentClassifierResult, labels: Tensor, **kwargs) -> None: 15 | preds = result.logits.argmax(dim=-1) 16 | super().update(preds, labels) 17 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/nn/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/nn/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/nn/biaffine.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # Author: Yunlong Feng 3 | 4 | import math 5 | 6 | import torch 7 | from torch import Tensor, nn 8 | 9 | 10 | class Biaffine(nn.Module): 11 | __constants__ = ["in1_features", "in2_features", "out_features", "bias_x", "bias_y"] 12 | 13 | def __init__(self, in1_features, in2_features, out_features, bias_x=True, bias_y=True): 14 | super().__init__() 15 | self.bias_x = bias_x 16 | self.bias_y = bias_y 17 | self.in1_features = in1_features 18 | self.in2_features = in2_features 19 | self.out_features = out_features 20 | self.weight = nn.Parameter( 21 | torch.zeros(out_features, in1_features + bias_x, in2_features + bias_y), 22 | requires_grad=True, 23 | ) 24 | self.reset_parameters() 25 | 26 | def reset_parameters(self): 27 | bound = 1 / math.sqrt(self.weight.size(1)) 28 | nn.init.uniform_(self.weight, -bound, bound) 29 | 30 | def onnx_forward(self, x1: Tensor, x2: Tensor): 31 | if self.bias_x: 32 | x1 = torch.cat((x1, torch.ones_like(x1[..., :1])), -1) 33 | if self.bias_y: 34 | x2 = torch.cat((x2, torch.ones_like(x2[..., :1])), -1) 35 | x1 = x1.unsqueeze(1) 36 | x2 = x2.unsqueeze(1) 37 | s: Tensor = x1 @ self.weight @ x2.transpose(-1, -2) 38 | if s.size(1) == 1: 39 | s = s.squeeze(1) 40 | return s 41 | 42 | def forward(self, x1: Tensor, x2: Tensor): 43 | if self.bias_x: 44 | # [batch_size, seq_len, in1_features] -> [batch_size, seq_len, in1_features + 1] 45 | x1 = torch.cat((x1, torch.ones_like(x1[..., :1])), -1) 46 | if self.bias_y: 47 | # [batch_size, seq_len, in2_features] -> [batch_size, seq_len, in2_features + 1] 48 | x2 = torch.cat((x2, torch.ones_like(x2[..., :1])), -1) 49 | # [batch_size, n_out, seq_len, seq_len] 50 | s = torch.einsum("bxi,oij,byj->boxy", x1, self.weight, x2) 51 | return s 52 | 53 | def extra_repr(self): 54 | return "in1_features={}, in2_features={}, out_features={}, bias_x={}, bias_y={}".format( 55 | self.in1_features, 56 | self.in2_features, 57 | self.out_features, 58 | self.bias_x, 59 | self.bias_y, 60 | ) 61 | 62 | 63 | def main(): 64 | biaffine = Biaffine(in1_features=128, in2_features=128, out_features=12) 65 | inputs = torch.randn(2, 512, 128) 66 | outputs = biaffine(inputs, inputs) 67 | print(outputs.shape) 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/nn/mlp.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # Author: Yunlong Feng 3 | from typing import Callable, Optional, Sequence, Union 4 | 5 | from torch import nn 6 | from transformers.activations import get_activation 7 | 8 | 9 | def MLP( 10 | layer_sizes: Sequence[int], 11 | dropout: Optional[float] = None, 12 | activation: Optional[Union[str, Callable]] = None, 13 | output_dropout: Optional[Union[float, bool]] = None, 14 | output_activation: Optional[Union[str, bool, Callable]] = None, 15 | ): 16 | layers = [] 17 | num_layers = len(layer_sizes) - 1 18 | for index in range(num_layers): 19 | if index < num_layers - 1: 20 | layers.append(nn.Linear(layer_sizes[index], layer_sizes[index + 1])) 21 | 22 | if isinstance(activation, str): 23 | layers.append(get_activation(activation)) 24 | elif isinstance(activation, Callable): 25 | layers.append(activation()) 26 | 27 | if isinstance(dropout, float): 28 | layers.append(nn.Dropout(dropout)) 29 | else: 30 | layers.append(nn.Linear(layer_sizes[index], layer_sizes[index + 1])) 31 | 32 | if isinstance(output_activation, str): 33 | layers.append(get_activation(output_activation)) 34 | elif isinstance(output_activation, Callable): 35 | layers.append(output_activation()) 36 | elif output_activation is True and activation is not None: 37 | if isinstance(activation, str): 38 | layers.append(get_activation(activation)) 39 | elif isinstance(activation, Callable): 40 | layers.append(activation()) 41 | 42 | if isinstance(output_dropout, float): 43 | layers.append(nn.Dropout(p=output_dropout)) 44 | elif output_dropout is True and isinstance(dropout, float): 45 | layers.append(nn.Dropout(dropout)) 46 | 47 | return nn.Sequential(*layers) 48 | 49 | 50 | def main(): 51 | mlp = MLP([768, 768, 128]) 52 | print(mlp) 53 | 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/optimization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/optimization/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/optimization/layer_lrs.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def get_layer_lrs_with_crf( 5 | named_parameters, 6 | transformer_prefix, 7 | learning_rate, 8 | layer_decay, 9 | n_layers, 10 | crf_prefix="crf", 11 | crf_ratio=10.0, 12 | ): 13 | groups = [] 14 | crf_groups = [] 15 | temp_groups = [None] * (n_layers + 3) 16 | temp_no_decay_groups = [None] * (n_layers + 3) 17 | regex = rf"^{transformer_prefix}\.(embeddings|encoder)\w*\.(layer.(\d+))?.+" 18 | regex = re.compile(regex) 19 | for name, parameters in named_parameters: 20 | m = regex.match(name) 21 | 22 | is_transformer = True 23 | if m is None: 24 | depth = n_layers + 2 25 | is_transformer = False 26 | elif m.group(1) == "embeddings": 27 | depth = 0 28 | elif m.group(1) == "encoder": 29 | depth = int(m.group(3)) + 1 30 | else: 31 | raise Exception("Not Recommend!!!") 32 | 33 | if is_transformer and any(x in name for x in ["bias", "LayerNorm.bias", "LayerNorm.weight"]): 34 | if temp_no_decay_groups[depth] is None: 35 | temp_no_decay_groups[depth] = [] 36 | temp_no_decay_groups[depth].append(parameters) 37 | elif not is_transformer and crf_prefix in name: 38 | crf_groups.append(parameters) 39 | else: 40 | if temp_groups[depth] is None: 41 | 
temp_groups[depth] = [] 42 | temp_groups[depth].append(parameters) 43 | 44 | for depth, parameters in enumerate(temp_no_decay_groups): 45 | if parameters: 46 | groups.append( 47 | { 48 | "params": parameters, 49 | "weight_decay": 0.0, 50 | "lr": learning_rate * (layer_decay ** (n_layers + 2 - depth)), 51 | } 52 | ) 53 | for depth, parameters in enumerate(temp_groups): 54 | if parameters: 55 | groups.append( 56 | { 57 | "params": parameters, 58 | "lr": learning_rate * (layer_decay ** (n_layers + 2 - depth)), 59 | } 60 | ) 61 | if crf_groups: 62 | groups.append({"params": crf_groups, "lr": learning_rate * crf_ratio}) 63 | 64 | return groups 65 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/processor/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class NOP(nn.Module): 6 | def __init__(self, dropout=0.1): 7 | super().__init__() 8 | self.dropout = nn.Dropout(dropout) 9 | 10 | def __call__(self, outputs, attention_mask=None, word_index=None, word_attention_mask=None): 11 | return self.dropout(outputs.last_hidden_state), attention_mask == 1 12 | 13 | 14 | class TokenOnly(nn.Module): 15 | def __init__(self, dropout=0.1): 16 | super().__init__() 17 | self.dropout = nn.Dropout(dropout) 18 | 19 | def __call__(self, outputs, attention_mask=None, word_index=None, word_attention_mask=None): 20 | return ( 21 | self.dropout(outputs.last_hidden_state[:, 1:-1]), 22 | attention_mask[:, 2:] == 1, 23 | ) 24 | 25 | 26 | class WordsOnly(nn.Module): 27 | def __init__(self, dropout=0.1): 28 | super().__init__() 29 | self.dropout = nn.Dropout(dropout) 30 | 31 | def __call__(self, outputs, attention_mask=None, word_index=None, word_attention_mask=None): 32 | hidden = outputs.last_hidden_state 33 | hidden = torch.gather( 34 | hidden[:, 1:-1, :], 35 | dim=1, 36 | index=word_index.unsqueeze(-1).expand(-1, -1, hidden.size(-1)), 37 | ) 38 | return self.dropout(hidden), word_attention_mask 39 | 40 | 41 | class ClsOnly(nn.Module): 42 | def __init__(self, dropout=0.1): 43 | super().__init__() 44 | self.dropout = nn.Dropout(dropout) 45 | 46 | def __call__(self, outputs, attention_mask=None, word_index=None, word_attention_mask=None): 47 | return self.dropout(outputs.last_hidden_state[:, 0]), None 48 | 49 | 50 | class WordsWithHead(nn.Module): 51 | def __init__(self, dropout=0.1): 52 | super().__init__() 53 | self.dropout = nn.Dropout(dropout) 54 | 55 | def __call__(self, outputs, attention_mask=None, word_index=None, word_attention_mask=None): 56 | hidden = outputs.last_hidden_state 57 | hidden = torch.cat( 58 | [ 59 | hidden[:, :1, :], 60 | torch.gather( 61 | hidden[:, 1:-1, :], 62 | dim=1, 63 | index=word_index.unsqueeze(-1).expand(-1, -1, hidden.size(-1)), 64 | ), 65 | ], 66 | dim=1, 67 | ) 68 | return self.dropout(hidden), word_attention_mask 69 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/models/utils/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/models/utils/instantiate.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import importlib 3 | from typing import 
Callable 4 | 5 | 6 | def find_callable(target: str) -> Callable: 7 | target_module_path, target_callable_path = target.rsplit(".", 1) 8 | target_callable_paths = [target_callable_path] 9 | 10 | target_module = None 11 | while len(target_module_path): 12 | try: 13 | target_module = importlib.import_module(target_module_path) 14 | break 15 | except Exception as e: 16 | target_module_path, target_callable_path = target_module_path.rsplit(".", 1) 17 | if len(target_module_path) == 0: 18 | raise e 19 | target_callable_paths.append(target_callable_path) 20 | target_callable = target_module 21 | for attr in reversed(target_callable_paths): 22 | target_callable = getattr(target_callable, attr) 23 | 24 | return target_callable 25 | 26 | 27 | def instantiate(config, target="_ltp_target_", partial="_ltp_partial_"): 28 | if isinstance(config, dict) and target in config: 29 | target_path = config.get(target) 30 | target_callable = find_callable(target_path) 31 | 32 | is_partial = config.get(partial, False) 33 | target_args = {key: instantiate(value) for key, value in config.items() if key not in [target, partial]} 34 | 35 | if is_partial: 36 | return functools.partial(target_callable, **target_args) 37 | else: 38 | return target_callable(**target_args) 39 | elif isinstance(config, dict): 40 | return {key: instantiate(value) for key, value in config.items()} 41 | else: 42 | return config 43 | 44 | 45 | def instantiate_omega(config, target="_ltp_target_", partial="_ltp_partial_"): 46 | from omegaconf import DictConfig 47 | 48 | if (isinstance(config, dict) or isinstance(config, DictConfig)) and target in config: 49 | target_path = config.get(target) 50 | target_callable = find_callable(target_path) 51 | 52 | is_partial = config.get(partial, False) 53 | target_args = {key: instantiate_omega(value) for key, value in config.items() if key not in [target, partial]} 54 | 55 | if is_partial: 56 | return functools.partial(target_callable, **target_args) 57 | else: 58 | return target_callable(**target_args) 59 | elif isinstance(config, dict) or isinstance(config, DictConfig): 60 | return {key: instantiate_omega(value) for key, value in config.items()} 61 | else: 62 | return config 63 | 64 | 65 | def main(): 66 | import yaml 67 | 68 | with open("configs/model/model.yaml") as stream: 69 | try: 70 | config = yaml.safe_load(stream) 71 | model_config = config["model"] 72 | except yaml.YAMLError as exc: 73 | print(exc) 74 | 75 | model = instantiate(model_config) 76 | print(model) 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /python/core/ltp_core/models/utils/transformer.py: -------------------------------------------------------------------------------- 1 | def load_transformers(config): 2 | from transformers import AutoConfig, AutoModel 3 | 4 | config = AutoConfig.for_model(**config) 5 | return AutoModel.from_config(config) 6 | -------------------------------------------------------------------------------- /python/core/ltp_core/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/ltp_core/utils/__init__.py -------------------------------------------------------------------------------- /python/core/ltp_core/utils/pylogger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from pytorch_lightning.utilities import 
rank_zero_only 4 | 5 | 6 | def get_pylogger(name=__name__) -> logging.Logger: 7 | """Initializes multi-GPU-friendly python command line logger.""" 8 | 9 | logger = logging.getLogger(name) 10 | 11 | # this ensures all logging levels get marked with the rank zero decorator 12 | # otherwise logs would get multiplied for each GPU process in multi-GPU setup 13 | logging_levels = ( 14 | "debug", 15 | "info", 16 | "warning", 17 | "error", 18 | "exception", 19 | "fatal", 20 | "critical", 21 | ) 22 | for level in logging_levels: 23 | setattr(logger, level, rank_zero_only(getattr(logger, level))) 24 | 25 | return logger 26 | -------------------------------------------------------------------------------- /python/core/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pytest.ini_options] 2 | addopts = [ 3 | "--color=yes", 4 | "--durations=0", 5 | "--strict-markers", 6 | "--doctest-modules", 7 | ] 8 | filterwarnings = [ 9 | "ignore::DeprecationWarning", 10 | "ignore::UserWarning", 11 | ] 12 | log_cli = "True" 13 | markers = [ 14 | "slow: slow tests", 15 | ] 16 | minversion = "6.0" 17 | testpaths = "tests/" 18 | 19 | [tool.coverage.report] 20 | exclude_lines = [ 21 | "pragma: nocover", 22 | "raise NotImplementedError", 23 | "raise NotImplementedError()", 24 | "if __name__ == .__main__.:", 25 | ] 26 | 27 | [tool.ruff] 28 | # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. 29 | select = ["E", "F"] 30 | ignore = [] 31 | 32 | # Allow autofix for all enabled rules (when `--fix`) is provided. 33 | fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] 34 | unfixable = [] 35 | 36 | # Exclude a variety of commonly ignored directories. 37 | exclude = [ 38 | ".bzr", 39 | ".direnv", 40 | ".eggs", 41 | ".git", 42 | ".git-rewrite", 43 | ".hg", 44 | ".mypy_cache", 45 | ".nox", 46 | ".pants.d", 47 | ".pytype", 48 | ".ruff_cache", 49 | ".svn", 50 | ".tox", 51 | ".venv", 52 | "__pypackages__", 53 | "_build", 54 | "buck-out", 55 | "build", 56 | "dist", 57 | "node_modules", 58 | "venv", 59 | "ltp_core/train.py", 60 | "ltp_core/eval.py", 61 | ] 62 | per-file-ignores = {} 63 | 64 | # Same as Black. 65 | line-length = 120 66 | 67 | # Allow unused variables when underscore-prefixed. 68 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 69 | 70 | # Assume Python 3.10. 71 | target-version = "py310" 72 | 73 | [tool.ruff.mccabe] 74 | # Unlike Flake8, default to a complexity level of 10. 
75 | max-complexity = 10 -------------------------------------------------------------------------------- /python/core/requirements.txt: -------------------------------------------------------------------------------- 1 | # --------- pytorch --------- # 2 | torch>=1.6.0 3 | 4 | # --------- transformers --------- # 5 | transformers>=4.0.0 6 | 7 | # --------- train --------- # 8 | pytorch-lightning>=1.5.10 9 | torchmetrics>=0.7.0 10 | datasets>=1.0.0 11 | 12 | # --------- hydra --------- # 13 | hydra-core>=1.1.0 14 | hydra-colorlog>=1.1.0 15 | 16 | # --------- loggers --------- # 17 | wandb 18 | # neptune-client 19 | # mlflow 20 | # comet-ml 21 | # tensorboard 22 | 23 | # --------- others --------- # 24 | pyrootutils # standardizing the project root setup 25 | pre-commit # hooks for applying linters on commit 26 | rich # beautiful text formatting in terminal 27 | pytest # tests 28 | sh # for running bash commands in some tests 29 | -------------------------------------------------------------------------------- /python/core/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length = 99 3 | profile = black 4 | filter_files = True 5 | 6 | 7 | [flake8] 8 | max_line_length = 99 9 | show_source = True 10 | format = pylint 11 | ignore = 12 | F401 # Module imported but unused 13 | W504 # Line break occurred after a binary operator 14 | F841 # Local variable name is assigned to but never used 15 | E501 # Line too long 16 | exclude = 17 | .git 18 | __pycache__ 19 | data/* 20 | tests/* 21 | notebooks/* 22 | logs/* 23 | 24 | 25 | [tool:pytest] 26 | testpaths = tests/ 27 | log_cli = True 28 | markers = 29 | slow 30 | addopts = 31 | --durations=0 32 | --strict-markers 33 | --doctest-modules 34 | filterwarnings = 35 | ignore::DeprecationWarning 36 | ignore::UserWarning 37 | -------------------------------------------------------------------------------- /python/core/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | project_dir, _ = os.path.split(__file__) 6 | 7 | with open(os.path.join(project_dir, "README.md"), encoding="utf-8") as fh: 8 | long_description = fh.read() 9 | 10 | setup( 11 | name="ltp_core", 12 | version="0.1.4", 13 | author="Yunlong Feng", 14 | author_email="ylfeng@ir.hit.edu.cn", 15 | url="https://github.com/HIT-SCIR/ltp", 16 | description="Language Technology Platform", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | install_requires=[ 20 | "torch>=1.6.0", 21 | "transformers>=4.0.0", 22 | ], 23 | extras_require={ 24 | "train": [ 25 | # pytorch-lightning 26 | "pytorch-lightning>=1.0.0", 27 | "torchmetrics>=0.7.0", 28 | # datasets 29 | "datasets>=1.0.0", 30 | # hydra 31 | "rich", 32 | "pyrootutils", 33 | "hydra-core>=1.1.0", 34 | "hydra-colorlog>=1.1.0", 35 | ] 36 | }, 37 | classifiers=[ 38 | "Development Status :: 1 - Planning", 39 | "Operating System :: OS Independent", 40 | "Intended Audience :: Developers", 41 | "Programming Language :: Python :: 3.6", 42 | "Programming Language :: Python :: 3.7", 43 | "Programming Language :: Python :: 3.8", 44 | "Programming Language :: Python :: 3.9", 45 | "Programming Language :: Python :: 3.10", 46 | "Programming Language :: Python :: 3.11", 47 | "Topic :: Software Development :: Libraries", 48 | ], 49 | packages=find_packages(), 50 | include_dirs=["ltp_core"], 51 | python_requires=">=3.6, <4", 52 | zip_safe=True, 53 | ) 54 | 
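A quick aside on the `_ltp_target_` / `_ltp_partial_` convention implemented in ltp_core/models/utils/instantiate.py above: the sketch below is editorial and not part of the repository. The config dict is hypothetical; only `instantiate` itself and `torch.nn.Linear` are assumed to exist as shown.

    from ltp_core.models.utils.instantiate import instantiate

    # A dict whose "_ltp_target_" key names a dotted path is resolved with
    # find_callable and called with the remaining keys as keyword arguments.
    config = {
        "_ltp_target_": "torch.nn.Linear",
        "in_features": 768,
        "out_features": 128,
    }
    layer = instantiate(config)  # equivalent to torch.nn.Linear(in_features=768, out_features=128)

    # "_ltp_partial_": True wraps the callable in functools.partial instead of
    # calling it, letting a config defer construction until runtime.
    make_layer = instantiate({**config, "_ltp_partial_": True})
    layer = make_layer()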
-------------------------------------------------------------------------------- /python/core/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/python/core/tests/__init__.py -------------------------------------------------------------------------------- /python/extension/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ltp-extension" 3 | version = "0.1.13" 4 | edition = "2021" 5 | authors = ["ylfeng "] 6 | description = "Rust Extension For Language Technology Platform(Python)." 7 | homepage = "https://github.com/HIT-SCIR/ltp" 8 | repository = "https://github.com/HIT-SCIR/ltp" 9 | keywords = ["ltp", "nlp"] 10 | exclude = [".github"] 11 | readme = "README.md" 12 | license-file = "LICENSE" 13 | 14 | [lib] 15 | name = "ltp_extension" 16 | crate-type = ["cdylib"] 17 | 18 | [dependencies] 19 | libc = { version = "0.2" } 20 | rayon = { version = "1.7" } 21 | rayon-cond = { version = "0.4" } 22 | anyhow = { version = "1.0" } 23 | serde = { version = "1.0", features = ["derive"] } 24 | pyo3 = { version = "0.24", features = ["extension-module", "anyhow", "serde"] } 25 | mimalloc = { version = "0.1", default-features = false, optional = true } 26 | 27 | [dependencies.ltp] 28 | version = "*" 29 | path = "../../rust/ltp" 30 | features = ["serialization", "parallel"] 31 | 32 | [features] 33 | default = ["abi3", "near-char-type"] 34 | malloc = ["mimalloc"] 35 | secure = ["mimalloc/secure"] 36 | char-type = ["ltp/char-type"] 37 | cross-char = ["ltp/cross-char"] 38 | near-char-type = ["ltp/near-char-type"] 39 | abi3 = ["pyo3/abi3", "pyo3/abi3-py37"] 40 | -------------------------------------------------------------------------------- /python/extension/LICENSE: -------------------------------------------------------------------------------- 1 | 1. 语言技术平台面向国内外大学、中科院各研究所以及个人研究者免费开放源代码,但如上述机构和个人将该平台用于商业目的(如企业合作项目等)则需要付费。 2 | 2. 除上述机构以外的企事业单位,如申请使用该平台,需付费。 3 | 3. 凡涉及付费问题,请发邮件到 car@ir.hit.edu.cn 洽商。 4 | 4. 如果您在 LTP 基础上发表论文或取得科研成果,请您在发表论文和申报成果时声明“使用了哈工大社会计算与信息检索研究中心研制的语言技术平台(LTP)”. 
5 | 同时,发信给car@ir.hit.edu.cn,说明发表论文或申报成果的题目、出处等。 6 | -------------------------------------------------------------------------------- /python/extension/examples/legacy_train.py: -------------------------------------------------------------------------------- 1 | from ltp_extension.perceptron import Algorithm, CWSModel, CWSTrainer, ModelType, Trainer 2 | 3 | 4 | def train_cws(): 5 | ap = Algorithm("AP") 6 | pa = Algorithm("Pa") 7 | pai = Algorithm("PaI", 0.5) 8 | paii = Algorithm("PaII", 0.5) 9 | 10 | trainer: CWSTrainer = CWSTrainer() 11 | trainer.epoch = 10 12 | trainer.load_train_data("data/cws/val.txt") 13 | trainer.load_eval_data("data/cws/test.txt") 14 | print(trainer) 15 | 16 | for algorithm in [ap, pa, pai, paii]: 17 | print(algorithm) 18 | trainer.algorithm = algorithm 19 | _: CWSModel = trainer.train() 20 | 21 | 22 | def train_auto(): 23 | ap = Algorithm("AP") 24 | pa = Algorithm("Pa") 25 | pai = Algorithm("PaI", 0.5) 26 | paii = Algorithm("PaII", 0.5) 27 | 28 | model_type = ModelType("cws") 29 | trainer: Trainer = Trainer(model_type) 30 | trainer.epoch = 10 31 | trainer.load_train_data("data/cws/val.txt") 32 | trainer.load_eval_data("data/cws/test.txt") 33 | print(trainer) 34 | 35 | for algorithm in [ap, pa, pai, paii]: 36 | print(algorithm) 37 | trainer.algorithm = algorithm 38 | _: CWSModel = trainer.train() 39 | 40 | 41 | def main(): 42 | # train_cws() 43 | train_auto() 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /python/extension/ltp_extension/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ltp_extension 2 | 3 | __version__ = ltp_extension.__version__ 4 | perceptron = ltp_extension.perceptron 5 | algorithms = ltp_extension.algorithms 6 | -------------------------------------------------------------------------------- /python/extension/ltp_extension/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | # Generated content DO NOT EDIT 2 | from .. import algorithms 3 | 4 | eisner = algorithms.eisner 5 | get_entities = algorithms.get_entities 6 | viterbi_decode_postprocess = algorithms.viterbi_decode_postprocess 7 | Hook = algorithms.Hook 8 | StnSplit = algorithms.StnSplit 9 | -------------------------------------------------------------------------------- /python/extension/ltp_extension/algorithms/algorithms.pyi: -------------------------------------------------------------------------------- 1 | # Generated content DO NOT EDIT 2 | def eisner(scores, stn_length, remove_root=False): 3 | """ 4 | Decode with Eisner's algorithm 5 | """ 6 | pass 7 | 8 | def get_entities(tags): 9 | """ 10 | Convert Tags to Entities 11 | """ 12 | pass 13 | 14 | def viterbi_decode_postprocess(history, last_tags, stn_length, labels_num): 15 | """ 16 | Viterbi Decode Postprocessing 17 | """ 18 | pass 19 | 20 | class Hook: 21 | def __init__(self): 22 | pass 23 | def add_word(self, word, freq=None): 24 | """ 25 | add words to the hook, the freq can be zero 26 | """ 27 | pass 28 | def hook(self, sentence, words): 29 | """ 30 | hook to the new words 31 | """ 32 | pass 33 | 34 | class StnSplit: 35 | def __init__(self): 36 | pass 37 | def batch_split(self, batch_text, threads=8): 38 | """ 39 | batch split to sentences 40 | """ 41 | pass 42 | @property 43 | def bracket_as_entity(self): 44 | """ 45 | Get the value of the bracket_as_entity option. 
46 | """ 47 | pass 48 | @property 49 | def en_quote_as_entity(self): 50 | """ 51 | Get the value of the en_quote_as_entity option. 52 | """ 53 | pass 54 | def split(self, text): 55 | """ 56 | split to sentences 57 | """ 58 | pass 59 | @property 60 | def use_en(self): 61 | """ 62 | Get the value of the use_en option. 63 | """ 64 | pass 65 | @property 66 | def use_zh(self): 67 | """ 68 | Get the value of the use_zh option. 69 | """ 70 | pass 71 | @property 72 | def zh_quote_as_entity(self): 73 | """ 74 | Get the value of the zh_quote_as_entity option. 75 | """ 76 | pass 77 | -------------------------------------------------------------------------------- /python/extension/ltp_extension/ltp_extension.pyi: -------------------------------------------------------------------------------- 1 | # Generated content DO NOT EDIT 2 | -------------------------------------------------------------------------------- /python/extension/ltp_extension/perceptron/__init__.py: -------------------------------------------------------------------------------- 1 | # Generated content DO NOT EDIT 2 | from .. import perceptron 3 | 4 | Algorithm = perceptron.Algorithm 5 | CWSModel = perceptron.CWSModel 6 | CWSTrainer = perceptron.CWSTrainer 7 | CharacterType = perceptron.CharacterType 8 | Model = perceptron.Model 9 | ModelType = perceptron.ModelType 10 | NERModel = perceptron.NERModel 11 | NERTrainer = perceptron.NERTrainer 12 | POSModel = perceptron.POSModel 13 | POSTrainer = perceptron.POSTrainer 14 | Trainer = perceptron.Trainer 15 | -------------------------------------------------------------------------------- /python/extension/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.0,<2.0"] 3 | build-backend = "maturin" 4 | 5 | [tool.ruff] 6 | # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. 7 | select = ["E", "F"] 8 | ignore = [] 9 | 10 | # Allow autofix for all enabled rules (when `--fix`) is provided. 11 | fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] 12 | unfixable = [] 13 | 14 | # Exclude a variety of commonly ignored directories. 15 | exclude = [ 16 | ".bzr", 17 | ".direnv", 18 | ".eggs", 19 | ".git", 20 | ".git-rewrite", 21 | ".hg", 22 | ".mypy_cache", 23 | ".nox", 24 | ".pants.d", 25 | ".pytype", 26 | ".ruff_cache", 27 | ".svn", 28 | ".tox", 29 | ".venv", 30 | "__pypackages__", 31 | "_build", 32 | "buck-out", 33 | "build", 34 | "dist", 35 | "node_modules", 36 | "venv", 37 | "*.pyi" 38 | ] 39 | per-file-ignores = {} 40 | 41 | # Same as Black. 42 | line-length = 120 43 | 44 | # Allow unused variables when underscore-prefixed. 45 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 46 | 47 | # Assume Python 3.10. 48 | target-version = "py310" 49 | 50 | [tool.ruff.mccabe] 51 | # Unlike Flake8, default to a complexity level of 10. 
52 | max-complexity = 10 -------------------------------------------------------------------------------- /python/extension/src/algorithms.rs: -------------------------------------------------------------------------------- 1 | use ltp::utils::{drop_get_entities, eisner, viterbi_decode_postprocessing}; 2 | use pyo3::prelude::*; 3 | 4 | /// Convert Tags to Entities 5 | #[pyfunction] 6 | #[pyo3(name = "get_entities", text_signature = "(tags)")] 7 | pub fn py_get_entities(tags: Vec<&str>) -> PyResult> { 8 | Ok(drop_get_entities(tags)) 9 | } 10 | 11 | /// Decode with Eisner's algorithm 12 | #[pyfunction] 13 | #[pyo3( 14 | name = "eisner", 15 | text_signature = "(scores, stn_length, remove_root=False)" 16 | )] 17 | pub fn py_eisner( 18 | scores: Vec, 19 | stn_length: Vec, 20 | remove_root: bool, 21 | ) -> PyResult>> { 22 | Ok(eisner(&scores, &stn_length, remove_root)) 23 | } 24 | 25 | /// Viterbi Decode Postprocessing 26 | #[pyfunction] 27 | #[pyo3( 28 | name = "viterbi_decode_postprocess", 29 | text_signature = "(history, last_tags, stn_length, labels_num)" 30 | )] 31 | pub fn py_viterbi_decode_postprocess( 32 | history: Vec, 33 | last_tags: Vec, 34 | stn_lengths: Vec, 35 | labels_num: usize, 36 | ) -> PyResult>> { 37 | Ok(viterbi_decode_postprocessing( 38 | &history, 39 | &last_tags, 40 | &stn_lengths, 41 | labels_num, 42 | )) 43 | } 44 | -------------------------------------------------------------------------------- /python/extension/src/hook.rs: -------------------------------------------------------------------------------- 1 | use ltp::utils::hook::Hook; 2 | use pyo3::prelude::*; 3 | 4 | #[pyclass(module = "ltp_extension.algorithms", name = "Hook", subclass)] 5 | #[derive(Clone, Debug)] 6 | pub struct PyHook { 7 | pub hook: Hook, 8 | } 9 | 10 | #[pymethods] 11 | impl PyHook { 12 | #[new] 13 | #[pyo3(text_signature = "(self)")] 14 | pub fn new() -> PyResult { 15 | Ok(Self { hook: Hook::new() }) 16 | } 17 | 18 | pub fn __len__(&self) -> usize { 19 | self.hook.total() 20 | } 21 | 22 | /// add words to the hook, the freq can be zero 23 | #[pyo3(text_signature = "(self, word, freq = None)")] 24 | pub fn add_word(&mut self, word: &str, freq: Option) -> usize { 25 | self.hook.add_word(word, freq) 26 | } 27 | 28 | /// hook to the new words 29 | #[pyo3(text_signature = "(self, sentence, words)")] 30 | pub fn hook<'a>(&self, sentence: &'a str, words: Vec<&str>) -> PyResult> { 31 | Ok(self.hook.hook(sentence, &words)?) 
32 | } 33 | } 34 | -------------------------------------------------------------------------------- /python/extension/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "mimalloc")] 2 | use mimalloc::MiMalloc; 3 | 4 | #[cfg(feature = "mimalloc")] 5 | #[global_allocator] 6 | static GLOBAL: MiMalloc = MiMalloc; 7 | 8 | mod algorithms; 9 | mod hook; 10 | mod perceptron; 11 | mod stnsplit; 12 | mod utils; 13 | 14 | use crate::perceptron::{ModelType, PyModel, PyTrainer}; 15 | pub use algorithms::{py_eisner, py_get_entities, py_viterbi_decode_postprocess}; 16 | use hook::PyHook; 17 | pub use perceptron::{ 18 | CharacterType, PyAlgorithm, PyCWSModel, PyCWSTrainer, PyNERModel, PyNERTrainer, PyPOSModel, 19 | PyPOSTrainer, 20 | }; 21 | use pyo3::prelude::*; 22 | use stnsplit::StnSplit; 23 | 24 | pub const VERSION: &str = env!("CARGO_PKG_VERSION"); 25 | 26 | // For users using multiprocessing in python, it is quite easy to fork the process running 27 | // tokenizers, ending up with a deadlock because we internally make use of multithreading. So 28 | // we register a callback to be called in the event of a fork so that we can warn the user. 29 | static mut REGISTERED_FORK_CALLBACK: bool = false; 30 | extern "C" fn child_after_fork() { 31 | use utils::parallelism::*; 32 | if has_parallelism_been_used() && !is_parallelism_configured() { 33 | println!( 34 | "LTP: The current process just got forked, after parallelism has \ 35 | already been used. Disabling parallelism to avoid deadlocks..." 36 | ); 37 | println!("To disable this warning, you can either:"); 38 | println!( 39 | "\t- Avoid using `LTP/legacy` model before the fork if possible\n\ 40 | \t- Explicitly set the environment variable {}=(true | false)", 41 | ENV_VARIABLE 42 | ); 43 | set_parallelism(false); 44 | } 45 | } 46 | 47 | /// LTP Module 48 | #[pymodule] 49 | fn ltp_extension(py: Python, m: &PyModule) -> PyResult<()> { 50 | // Register the fork callback 51 | #[cfg(target_family = "unix")] 52 | unsafe { 53 | if !REGISTERED_FORK_CALLBACK { 54 | libc::pthread_atfork(None, None, Some(child_after_fork)); 55 | REGISTERED_FORK_CALLBACK = true; 56 | } 57 | } 58 | 59 | m.add("__version__", env!("CARGO_PKG_VERSION"))?; 60 | 61 | // Algorithms Module 62 | let algorithms = PyModule::new(py, "algorithms")?; 63 | 64 | algorithms.add_class::<PyHook>()?; 65 | algorithms.add_class::<StnSplit>()?; 66 | algorithms.add_function(wrap_pyfunction!(py_eisner, m)?)?; 67 | algorithms.add_function(wrap_pyfunction!(py_get_entities, m)?)?; 68 | algorithms.add_function(wrap_pyfunction!(py_viterbi_decode_postprocess, m)?)?; 69 | 70 | // Perceptron Module 71 | let perceptron = PyModule::new(py, "perceptron")?; 72 | perceptron.add_class::<ModelType>()?; 73 | perceptron.add_class::<PyModel>()?; 74 | perceptron.add_class::<PyTrainer>()?; 75 | perceptron.add_class::<PyAlgorithm>()?; 76 | 77 | perceptron.add_class::<CharacterType>()?; 78 | perceptron.add_class::<PyCWSModel>()?; 79 | perceptron.add_class::<PyCWSTrainer>()?; 80 | 81 | perceptron.add_class::<PyNERModel>()?; 82 | perceptron.add_class::<PyNERTrainer>()?; 83 | 84 | perceptron.add_class::<PyPOSModel>()?; 85 | perceptron.add_class::<PyPOSTrainer>()?; 86 | 87 | m.add_submodule(algorithms)?; 88 | m.add_submodule(perceptron)?; 89 | Ok(()) 90 | } 91 | -------------------------------------------------------------------------------- /python/extension/src/perceptron/alg.rs: -------------------------------------------------------------------------------- 1 | use ltp::perceptron::{Algorithm, PaMode}; 2 | use pyo3::exceptions::PyValueError; 3 | use pyo3::prelude::*; 4 | use serde::{Deserialize, Serialize}; 5 | use 
std::fmt::{Display, Formatter}; 6 | 7 | /// The perceptron algorithm. 8 | /// Supported algorithms: "AP", "Pa", "PaI", "PaII" 9 | /// AP: averaged perceptron, param is the number of training threads 10 | /// Pa/PaI/PaII: the passive-aggressive algorithm and its two variants, param is c (the margin) 11 | #[pyclass(module = "ltp_extension.perceptron", name = "Algorithm", subclass)] 12 | #[derive(Clone, Serialize, Deserialize, Default, Debug, PartialEq)] 13 | pub struct PyAlgorithm { 14 | pub(crate) algorithm: Algorithm, 15 | } 16 | 17 | impl Display for PyAlgorithm { 18 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 19 | write!(f, "{}", self.algorithm) 20 | } 21 | } 22 | 23 | #[pymethods] 24 | impl PyAlgorithm { 25 | #[new] 26 | #[pyo3(text_signature = "(self, algorithm, param = None)")] 27 | pub fn new(py: Python, algorithm: &str, param: Option<PyObject>) -> PyResult<Self> { 28 | let algorithm: Algorithm = match algorithm { 29 | "AP" => { 30 | if let Some(param) = param { 31 | let param = param.extract::<usize>(py)?; 32 | Ok(Algorithm::AP(param)) 33 | } else { 34 | Ok(Algorithm::AP(1usize)) 35 | } 36 | } 37 | "Pa" => Ok(Algorithm::PA(PaMode::Pa)), 38 | "PaI" => { 39 | if let Some(c) = param { 40 | let c = c.extract::<f64>(py)?; 41 | Ok(Algorithm::PA(PaMode::PaI(c))) 42 | } else { 43 | Err(PyValueError::new_err("param is needed")) 44 | } 45 | } 46 | "PaII" => { 47 | if let Some(c) = param { 48 | let c = c.extract::<f64>(py)?; 49 | Ok(Algorithm::PA(PaMode::PaII(c))) 50 | } else { 51 | Err(PyValueError::new_err("param is needed")) 52 | } 53 | } 54 | _ => Err(PyValueError::new_err("algorithm is not supported"))?, 55 | }?; 56 | 57 | Ok(Self { algorithm }) 58 | } 59 | 60 | fn __repr__(&self) -> String { 61 | format!("{}", self.algorithm) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /python/extension/src/perceptron/com.rs: -------------------------------------------------------------------------------- 1 | #[macro_export] 2 | macro_rules! impl_model { 3 | ($name:ident) => { 4 | impl $name { 5 | fn inner_load(path: &str) -> anyhow::Result<Self> { 6 | use ltp::perceptron::ModelSerde; 7 | let file = std::fs::File::open(path)?; 8 | let model = if path.ends_with(".json") { 9 | ModelSerde::load(file, ltp::perceptron::Format::JSON)? 10 | } else { 11 | ModelSerde::load( 12 | file, 13 | ltp::perceptron::Format::AVRO(ltp::perceptron::Codec::Deflate), 14 | )? 
15 | }; 16 | Ok(Self { model }) 17 | } 18 | 19 | fn inner_save(&self, path: &str) -> anyhow::Result<()> { 20 | use ltp::perceptron::ModelSerde; 21 | let file = std::fs::File::create(path)?; 22 | if path.ends_with(".json") { 23 | self.model.save(file, ltp::perceptron::Format::JSON)?; 24 | } else { 25 | self.model.save( 26 | file, 27 | ltp::perceptron::Format::AVRO(ltp::perceptron::Codec::Deflate), 28 | )?; 29 | } 30 | Ok(()) 31 | } 32 | } 33 | }; 34 | () => {}; 35 | } 36 | -------------------------------------------------------------------------------- /python/extension/src/perceptron/mod.rs: -------------------------------------------------------------------------------- 1 | mod alg; 2 | mod com; 3 | mod model; 4 | mod specialization; 5 | mod trainer; 6 | 7 | pub type Perceptron = ltp::perceptron::SerdeModel; 8 | pub use alg::PyAlgorithm; 9 | pub use model::{EnumModel, ModelType, PyModel}; 10 | pub use specialization::{ 11 | CharacterType, PyCWSModel, PyCWSTrainer, PyNERModel, PyNERTrainer, PyPOSModel, PyPOSTrainer, 12 | }; 13 | pub use trainer::{EnumTrainer, PyTrainer}; 14 | -------------------------------------------------------------------------------- /python/extension/src/perceptron/specialization/mod.rs: -------------------------------------------------------------------------------- 1 | mod cws; 2 | mod ner; 3 | mod pos; 4 | 5 | pub use cws::{CharacterType, PyCWSModel, PyCWSTrainer}; 6 | pub use ner::{PyNERModel, PyNERTrainer}; 7 | pub use pos::{PyPOSModel, PyPOSTrainer}; 8 | -------------------------------------------------------------------------------- /python/extension/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod parallelism; 2 | -------------------------------------------------------------------------------- /python/interface/LICENSE: -------------------------------------------------------------------------------- 1 | 1. 语言技术平台面向国内外大学、中科院各研究所以及个人研究者免费开放源代码,但如上述机构和个人将该平台用于商业目的(如企业合作项目等)则需要付费。 2 | 2. 除上述机构以外的企事业单位,如申请使用该平台,需付费。 3 | 3. 凡涉及付费问题,请发邮件到 car@ir.hit.edu.cn 洽商。 4 | 4. 如果您在 LTP 基础上发表论文或取得科研成果,请您在发表论文和申报成果时声明“使用了哈工大社会计算与信息检索研究中心研制的语言技术平台(LTP)”. 5 | 同时,发信给car@ir.hit.edu.cn,说明发表论文或申报成果的题目、出处等。 6 | -------------------------------------------------------------------------------- /python/interface/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include requirements.txt 3 | 4 | recursive-include ltp * 5 | 6 | recursive-exclude * *.pyc 7 | recursive-exclude * .DS_Store 8 | recursive-exclude * __pycache__ 9 | -------------------------------------------------------------------------------- /python/interface/Makefile: -------------------------------------------------------------------------------- 1 | 2 | help: ## Show help 3 | @grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 4 | 5 | clean: ## Clean autogenerated files 6 | rm -rf dist 7 | find . -type f -name "*.DS_Store" -ls -delete 8 | find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf 9 | find . | grep -E ".pytest_cache" | xargs rm -rf 10 | find . 
| grep -E ".ipynb_checkpoints" | xargs rm -rf 11 | rm -f .coverage 12 | 13 | style: ## Run pre-commit hooks 14 | pre-commit run -a 15 | 16 | sync: ## Merge changes from main branch to your current branch 17 | git fetch --all 18 | git merge main 19 | 20 | test: ## Run not slow tests 21 | pytest -k "not slow" 22 | 23 | test-full: ## Run all tests 24 | pytest 25 | -------------------------------------------------------------------------------- /python/interface/docs/README.md: -------------------------------------------------------------------------------- 1 | # 文档生成 2 | 3 | ```shell script 4 | sphinx-build -b html docs build 5 | ``` 6 | -------------------------------------------------------------------------------- /python/interface/docs/api/ltp.rst: -------------------------------------------------------------------------------- 1 | LTP 文档 2 | =========== 3 | 4 | Submodules 5 | ---------- 6 | 7 | ltp.interface module 8 | -------------------- 9 | 10 | .. automodule:: ltp.interface 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | ltp.legacy module 16 | ----------------- 17 | 18 | .. automodule:: ltp.legacy 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | ltp.nerual module 24 | ----------------- 25 | 26 | .. automodule:: ltp.nerual 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: ltp 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /python/interface/docs/conf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.insert(0, os.path.abspath("../ltp")) 5 | 6 | project = "LTP4" 7 | copyright = "2020, Feng Yunlong" 8 | author = "Feng Yunlong" 9 | 10 | from ltp import __version__ as version 11 | 12 | release = version 13 | 14 | extensions = [ 15 | "sphinx.ext.autodoc", 16 | "sphinx.ext.coverage", 17 | "sphinx.ext.doctest", 18 | "sphinx.ext.intersphinx", 19 | "sphinx.ext.viewcode", 20 | "sphinx.ext.napoleon", 21 | ] 22 | 23 | autodoc_default_options = { 24 | "members": True, 25 | "show-inheritance": False, 26 | "member-order": "bysource", 27 | "exclude-members": "__weakref__", 28 | } 29 | 30 | autodoc_typehints = "none" 31 | add_module_names = False 32 | 33 | templates_path = ["templates"] 34 | language = "zh" 35 | exclude_patterns = [] 36 | html_theme = "sphinx_rtd_theme" 37 | html_static_path = ["static"] 38 | source_suffix = [".rst", ".md"] 39 | master_doc = "index" 40 | -------------------------------------------------------------------------------- /python/interface/docs/index.rst: -------------------------------------------------------------------------------- 1 | LTP4 文档 2 | ================================ 3 | 4 | 5 | .. include:: introduction.rst 6 | .. include:: quickstart.rst 7 | .. include:: performance.rst 8 | 9 | API文档 10 | ======== 11 | .. 
toctree:: 12 | :titlesonly: 13 | :glob: 14 | 15 | introduction 16 | quickstart 17 | performance 18 | 19 | api/* 20 | 21 | appendix 22 | 23 | 24 | 索引和图表 25 | ================== 26 | 27 | * :ref:`genindex` 28 | * :ref:`modindex` 29 | * :ref:`search` 30 |
-------------------------------------------------------------------------------- /python/interface/docs/introduction.rst: -------------------------------------------------------------------------------- 1 | 开始使用LTP 2 | ============= 3 | 4 | 如果你是第一次使用LTP,不妨花一些时间了解LTP能帮你做什么。 5 | 6 | LTP提供了一系列中文自然语言处理工具,用户可以使用这些工具对于中文文本进行分词、词性标注、句法分析等等工作。从应用角度来看,LTP为用户提供了下列组件: 7 | 8 | * 针对单一自然语言处理任务,生成统计机器学习模型的工具 9 | * 针对单一自然语言处理任务,调用模型进行分析的编程接口 10 | * 系统可调用的,用于中文语言处理的模型文件 11 | * 针对单一自然语言处理任务,基于云端的编程接口 12 | 13 | 如果你的公司需要一套高性能的中文语言分析工具以处理海量的文本,或者你的研究工作建立在一系列底层中文自然语言处理任务之上,或者你想将自己的科研成果与前沿先进工作进行对比,LTP都可能是你的选择。 14 |
-------------------------------------------------------------------------------- /python/interface/docs/performance.rst: -------------------------------------------------------------------------------- 1 | 性能 2 | =============== 3 | 4 | 分词模块 5 | --------- 6 | 7 | 基础模型在人民日报测试数据上的性能如下: 8 | 9 | 语料信息:人民日报1998年2月-6月(后10%数据作为开发集)作为训练数据,1月作为测试数据。 10 | 11 | 12 | +------+----------+ 13 | | | F1 | 14 | +======+==========+ 15 | |测试集| 98.5% | 16 | +------+----------+ 17 | 18 | 19 | 词性标注模块 20 | ------------ 21 | 22 | 基础模型在人民日报数据集上的性能如下: 23 | 24 | 语料信息:人民日报1998年2月-6月(后10%数据作为开发集)作为训练数据,1月作为测试数据。 25 | 26 | +------+----------+ 27 | | | ACC | 28 | +======+==========+ 29 | |测试集| 98.5% | 30 | +------+----------+ 31 | 32 | 命名实体识别模块 33 | ---------------- 34 | 35 | 基础模型在人民日报数据集上的性能如下: 36 | 37 | 语料信息:人民日报1998年1月做训练(后10%数据作为开发集),6月前10000句做测试。 38 | 39 | +------+------+ 40 | | | F1 | 41 | +======+======+ 42 | |测试集| 95.4 | 43 | +------+------+ 44 | 45 | 语义角色标注模块 46 | ----------------- 47 | 48 | 基础模型在CPB3.0上的性能如下: 49 | 50 | +------+----------+ 51 | | | F1 | 52 | +======+==========+ 53 | |测试集| 80.6% | 54 | +------+----------+ 55 | 56 | 依存句法分析模块 57 | ----------------- 58 | 59 | 在 `Chinese Dependency Treebank(CDT) `_ 数据集上的性能如下。 60 | 61 | +------+-------+ 62 | | | LAS | 63 | +======+=======+ 64 | |测试集| 89.5 | 65 | +------+-------+ 66 | 67 | 语义依存分析模块 68 | ----------------- 69 | 70 | 在 `CCL2020 `_ 数据集上的性能如下。 71 | 72 | +------+-------+ 73 | | | LAS | 74 | +======+=======+ 75 | |测试集| 75.2 | 76 | +------+-------+ 77 |
-------------------------------------------------------------------------------- /python/interface/docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | 快速上手 2 | ======== 3 | 4 | 快速安装 5 | ----------- 6 | 7 | 安装LTP是非常简单的,使用Pip安装只需要: 8 | 9 | .. code-block:: sh 10 | 11 | pip install ltp 12 | 13 | 载入模型 14 | -------------------------- 15 | 16 | .. code-block:: python 17 | 18 | from ltp import LTP 19 | ltp = LTP() # 默认加载 LTP/small 模型 20 | # ltp = LTP(path = "LTP/base|LTP/small|LTP/tiny") 21 | 22 | 分句 23 | -------------------------- 24 | 25 | 使用LTP分句只需要使用 StnSplit 26 | 27 | .. code-block:: python 28 | 29 | from ltp import StnSplit 30 | sents = StnSplit().split("汤姆生病了。他去了医院。") 31 | # [ 32 | # "汤姆生病了。", 33 | # "他去了医院。" 34 | # ] 35 | 36 | sents = StnSplit().batch_split(["他叫汤姆去拿外衣。", "汤姆生病了。他去了医院。"]) 37 | 38 | # [ 39 | # "他叫汤姆去拿外衣。", 40 | # "汤姆生病了。", 41 | # "他去了医院。" 42 | # ] 43 | 44 | 用户自定义词典 45 | ------------------- 46 | 47 | ..
code-block:: python 48 | 49 | from ltp import LTP 50 | ltp = LTP() 51 | # 也可以在代码中添加自定义的词语 52 | ltp.add_words(words="长江大桥", freq = 2) 53 | 54 | 55 | 分词 56 | ------------------ 57 | 58 | 使用LTP分词非常简单,下面是一个简短的例子: 59 | 60 | .. code-block:: python 61 | 62 | from ltp import LTP 63 | 64 | ltp = LTP() 65 | 66 | words = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks = ["cws"], return_dict = False) 67 | # [['他', '叫', '汤姆', '去', '拿', '外衣', '。']] 68 | 69 | 70 | 词性标注 71 | ------------------ 72 | 73 | .. code-block:: python 74 | 75 | from ltp import LTP 76 | 77 | ltp = LTP() 78 | 79 | result = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks = ["cws","pos"]) 80 | print(result.pos) 81 | # [['他', '叫', '汤姆', '去', '拿', '外衣', '。']] 82 | # [['r', 'v', 'nh', 'v', 'v', 'n', 'wp']] 83 | 84 | 命名实体识别 85 | ------------------ 86 | 87 | 88 | .. code-block:: python 89 | 90 | from ltp import LTP 91 | 92 | ltp = LTP() 93 | 94 | result = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks = ["cws","ner"]) 95 | print(result.ner) 96 | # [['他', '叫', '汤姆', '去', '拿', '外衣', '。']] 97 | 98 | 99 | 100 | 语义角色标注 101 | ------------------ 102 | 103 | .. code-block:: python 104 | 105 | from ltp import LTP 106 | 107 | ltp = LTP() 108 | 109 | result = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks = ["cws","srl"]) 110 | print(result.srl) 111 | 112 | 113 | 114 | 依存句法分析 115 | ------------------ 116 | 117 | 需要注意的是,在依存句法当中,虚节点ROOT占据了0位置,因此节点的下标从1开始。 118 | 119 | .. code-block:: python 120 | 121 | from ltp import LTP 122 | 123 | ltp = LTP() 124 | 125 | result = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks = ["cws","dep"]) 126 | print(result.dep) 127 | 128 | 129 | 130 | 语义依存分析(树) 131 | ------------------ 132 | 133 | 与依存句法类似的,这里的下标也是从1开始。 134 | 135 | .. code-block:: python 136 | 137 | from ltp import LTP 138 | 139 | ltp = LTP() 140 | 141 | result = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks = ["cws","sdp"]) 142 | print(result.sdp) 143 | 144 | 145 | 语义依存分析(图) 146 | ------------------ 147 | 148 | 与依存句法类似的,这里的下标也是从1开始。 149 | 150 | .. code-block:: python 151 | 152 | from ltp import LTP 153 | 154 | ltp = LTP() 155 | 156 | result = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks = ["cws","sdpg"]) 157 | print(result.sdpg) 158 | 159 | 160 | LTP Server 161 | ------------------------------ 162 | 163 | LTP Server 是对 LTP 的一个简单包装,依赖于 tornado,使用方式如下: 164 | 165 | .. code-block:: bash 166 | 167 | pip install ltp tornado 168 | python examples/server.py serve
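Every task above returns the same `LTPOutput` object; a short illustrative sketch (not part of the original quickstart, using only interfaces that already appear here and in examples/conllu.py below) of the two ways to consume it:

.. code-block:: python

    from ltp import LTP

    ltp = LTP()
    # attribute access on the LTPOutput result
    result = ltp.pipeline(["他叫汤姆去拿外衣。"], tasks=["cws", "pos"])
    print(result.cws, result.pos)
    # or unpack in task order via to_tuple()
    cws, pos = result.to_tuple()
    # or skip the wrapper entirely
    print(ltp.pipeline(["他叫汤姆去拿外衣。"], tasks=["cws"], return_dict=False))

169 | -------------------------------------------------------------------------------- /python/interface/examples/conllu.py: -------------------------------------------------------------------------------- 1 | #!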
/usr/bin/env python 2 | # Author: Yunlong Feng 3 | 4 | 5 | from ltp import LTP 6 | 7 | 8 | class Token: 9 | def __init__(self, id, form, lemma, upos, xpos, feats, head, deprel, deps, misc): 10 | self.id = id 11 | self.form = form 12 | self.lemma = lemma 13 | self.upos = upos 14 | self.xpos = xpos 15 | self.feats = feats 16 | self.head = head 17 | self.deprel = deprel 18 | self.deps = deps 19 | self.misc = misc 20 | 21 | def __str__(self): 22 | return "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format( 23 | self.id, 24 | self.form, 25 | self.lemma, 26 | self.upos, 27 | self.xpos, 28 | self.feats, 29 | self.head, 30 | self.deprel, 31 | self.deps, 32 | self.misc, 33 | ) 34 | 35 | def __repr__(self): 36 | return "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format( 37 | self.id, 38 | self.form, 39 | self.lemma, 40 | self.upos, 41 | self.xpos, 42 | self.feats, 43 | self.head, 44 | self.deprel, 45 | self.deps, 46 | self.misc, 47 | ) 48 | 49 | 50 | def main(): 51 | ltp = LTP("LTP/tiny") 52 | batched_cws, batched_pos, batched_dep, batched_sdpg = ltp.pipeline( 53 | ["他叫汤姆去拿外衣。", "他点头表示同意我的意见。", "我们即将以昂扬的斗志迎来新的一年。"], ["cws", "pos", "dep", "sdpg"] 54 | ).to_tuple() 55 | 56 | for cws, pos, dep, sdpg in zip(batched_cws, batched_pos, batched_dep, batched_sdpg): 57 | sentence = [] 58 | for idx, (form, xpos, head, deprel) in enumerate(zip(cws, pos, dep["head"], dep["label"])): 59 | sentence.append( 60 | Token( 61 | id=idx + 1, 62 | form=form, 63 | lemma="_", 64 | upos="_", 65 | xpos=xpos, 66 | feats="_", 67 | head=head, 68 | deprel=deprel, 69 | deps="", 70 | misc="_", 71 | ) 72 | ) 73 | 74 | for id, head, tag in sdpg: 75 | if sentence[id - 1].deps: 76 | sentence[id - 1].deps = sentence[id - 1].deps + f"|{head}:{tag}" 77 | else: 78 | sentence[id - 1].deps = f"{head}:{tag}" 79 | 80 | sentence = [str(token) for token in sentence] 81 | sentence = "\n".join(sentence) 82 | 83 | print(sentence) 84 | print("\n") 85 | 86 | 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /python/interface/examples/issues.py: -------------------------------------------------------------------------------- 1 | from python.interface.examples.simple import stn_split 2 | 3 | 4 | def issue590(): 5 | from ltp import LTP 6 | ltp = LTP("LTP/tiny") 7 | ltp.add_words(words=["[ENT]"]) 8 | print(ltp.pipeline(["[ENT] Info"], tasks=["cws"])) 9 | 10 | ltp.add_words(words=["[EOS]"]) 11 | print(ltp.pipeline(["[EOS] Info"], tasks=["cws"])) 12 | 13 | 14 | def issue592(): 15 | from ltp import LTP 16 | legacy_ltp = LTP("LTP/legacy") 17 | 18 | legacy_ltp.add_words(words=["SCSG", "IP地址"]) 19 | print(legacy_ltp.pipeline(["SCSGIP地址"], tasks=["cws"])) 20 | 21 | neural_ltp = LTP("LTP/tiny") 22 | 23 | # not bug, but not work because of the bert tokenizer 24 | neural_ltp.add_words(words=["SCSG", "IP地址"]) 25 | print(neural_ltp.pipeline(["SCSGIP地址"], tasks=["cws"])) 26 | 27 | 28 | def issue600(): 29 | from ltp import LTP 30 | legacy_ltp = LTP("LTP/legacy") 31 | print(legacy_ltp.pipeline("他叫汤姆去拿外衣。", tasks=["cws"], return_dict=False)) 32 | 33 | neural_ltp = LTP("LTP/tiny") 34 | print(neural_ltp.pipeline("他叫汤姆去拿外衣。", tasks=["cws"], return_dict=False)) 35 | 36 | 37 | def issue612(): 38 | from ltp import LTP 39 | legacy_ltp = LTP("LTP/legacy") 40 | legacy_ltp.add_words(words=["五星武器"]) 41 | print(legacy_ltp.pipeline("80 抽两五星武器给我吧哥", tasks=["cws"], return_dict=False)) 42 | 43 | neural_ltp = LTP("LTP/tiny") 44 | neural_ltp.add_words(words=["五星武器"]) 45 | print(neural_ltp.pipeline("80 抽两五星武器给我吧哥", 
tasks=["cws"], return_dict=False)) 46 | 47 | 48 | def issue613(): 49 | import cProfile 50 | from pstats import SortKey 51 | 52 | cProfile.run('from ltp import LTP;LTP("LTP/legacy", local_files_only=True)', sort=SortKey.CUMULATIVE) 53 | 54 | 55 | def issue623(): 56 | from ltp import LTP 57 | from matplotlib import pyplot as plt 58 | from tqdm import trange 59 | ltp = LTP("LTP/legacy") 60 | 61 | def get_current_memory() -> int: 62 | import os 63 | 64 | import psutil 65 | 66 | # 获取当前进程内存占用。 67 | pid = os.getpid() 68 | p = psutil.Process(pid) 69 | info = p.memory_full_info() 70 | return info.uss / 1024 / 1024 71 | 72 | memory = [get_current_memory()] 73 | 74 | for _ in trange(10000): 75 | # ltp.pipeline('他叫汤姆去拿外衣。') 76 | # ltp.pipeline('台湾是中国领土不可分割的一部分。') 77 | ltp.pipeline(["他叫汤姆去拿外衣。", "台湾是中国领土不可分割的一部分。"]) 78 | memory.append(get_current_memory()) 79 | 80 | memory.append(get_current_memory()) 81 | 82 | plt.plot(memory) 83 | plt.show() 84 | 85 | 86 | def issue686(): 87 | from ltp_extension.algorithms import Hook 88 | sentence = b'\xc2\x28'.decode('utf-8', 'replace') 89 | hook = Hook() 90 | hook.add_word(word="[FAKE]") 91 | try: 92 | hook.hook(sentence, ['a', 'b']) 93 | except Exception as e: 94 | print(e) 95 | 96 | 97 | def issue693(): 98 | from ltp import LTP 99 | ltp = LTP("LTP/tiny") 100 | print(ltp.pipeline( 101 | ["视觉Transformers通过将图像区域表示为转换后的tokens并通过注意力权重整合它们来提取视觉信息。"], 102 | tasks=["cws"]) 103 | ) 104 | 105 | 106 | def issue714(): 107 | from ltp import StnSplit 108 | 109 | spliter = StnSplit() 110 | spliter.use_en = False 111 | sents = spliter.split("1.联通华盛电商分公司办公室内的灯火彻夜不熄,这已经成为常态。") 112 | print(sents) 113 | 114 | 115 | def main(): 116 | issue714() 117 | 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /python/interface/examples/rules.py: -------------------------------------------------------------------------------- 1 | from ltp import LTP 2 | from ltp.legacy import CharacterType 3 | 4 | 5 | def rules(): 6 | ltp = LTP("LTP/legacy") 7 | result = ltp(["视频4k60fps无bg"], tasks=["cws"]) 8 | print(result.cws) 9 | ltp.enable_type_cut_d(CharacterType.Roman, CharacterType.Kanji) 10 | ltp.enable_type_concat(CharacterType.Digit, CharacterType.Roman) 11 | result = ltp(["视频4k60fps无bg"], tasks=["cws"]) 12 | print(result.cws) 13 | 14 | 15 | def main(): 16 | rules() 17 | 18 | 19 | if __name__ == "__main__": 20 | main() 21 | -------------------------------------------------------------------------------- /python/interface/examples/simple.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from ltp import LTP 3 | 4 | 5 | def stn_split(): 6 | from ltp import StnSplit 7 | 8 | spliter = StnSplit() 9 | spliter.use_en = False # 关闭英文断句 10 | 11 | sents = spliter.split("汤姆生病了。他去了医院。") 12 | print(sents) 13 | # [ 14 | # "汤姆生病了。", 15 | # "他去了医院。" 16 | # ] 17 | 18 | sents = StnSplit().batch_split(["他叫汤姆去拿外衣。", "汤姆生病了。他去了医院。"]) 19 | print(sents) 20 | # [ 21 | # "他叫汤姆去拿外衣。", 22 | # "汤姆生病了。", 23 | # "他去了医院。" 24 | # ] 25 | 26 | 27 | def legacy(): 28 | ltp = LTP("LTP/legacy") 29 | ltp.add_word("汤姆去") 30 | result = ltp( 31 | ["他叫汤姆去拿外衣。", "树上停着一些小鸟。先飞走了19只,又飞走了15只。两次共飞走了多少只小鸟?"], 32 | tasks=["cws", "pos", "ner"], 33 | ) 34 | print(result.cws) 35 | print(result.pos) 36 | print(result.ner) 37 | 38 | 39 | def neural(): 40 | ltp = LTP("LTP/tiny") 41 | 42 | if torch.cuda.is_available(): 43 | ltp = ltp.to("cuda") 44 | 45 | ltp.add_word("汤姆去") 46 | 47 | # 未分词的文本 48 | result = 
ltp.pipeline( 49 | ["他叫汤姆去拿外衣。", "韓語:한국의 단오", "树上停着一些小鸟。先飞走了19只,又飞走了15只。两次共飞走了多少只小鸟?"], 50 | tasks=["cws", "pos", "ner", "srl", "dep", "sdp"], 51 | ) 52 | print(result.cws) 53 | print(result.pos) 54 | print(result.ner) 55 | print(result.srl) 56 | print(result.dep) 57 | print(result.sdp) 58 | 59 | # 已经分词的文本 60 | result = ltp.pipeline( 61 | [["他", "叫", "汤姆", "去", "拿", "外衣", "。"], ["가을동", "叫", "1993", "年", "的", "Ameri", "·"]], 62 | # 注意这里移除了 "cws" 任务 63 | tasks=["pos", "ner", "srl", "dep", "sdp"], 64 | ) 65 | print(result.pos) 66 | print(result.ner) 67 | print(result.srl) 68 | print(result.dep) 69 | print(result.sdp) 70 | 71 | 72 | def main(): 73 | stn_split() 74 | legacy() 75 | neural() 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /python/interface/ltp/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "4.2.13" 2 | 3 | from ltp_extension.algorithms import StnSplit 4 | 5 | from .interface import LTP 6 | 7 | __all__ = [ 8 | "LTP", 9 | "StnSplit", 10 | "__version__", 11 | ] 12 | -------------------------------------------------------------------------------- /python/interface/ltp/generic.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # Author: Yunlong Feng 3 | 4 | from collections import OrderedDict 5 | from dataclasses import dataclass, fields 6 | from typing import Any, List, Optional, Tuple, Union 7 | 8 | 9 | class ModelOutput(OrderedDict): 10 | """Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by 11 | integer or slice (like a tuple) or strings (like a dictionary) that will ignore the `None` 12 | attributes. Otherwise behaves like a regular python dictionary. 13 | 14 | 15 | 16 | You can't unpack a `ModelOutput` directly. Use the [`~utils.ModelOutput.to_tuple`] method to convert it to a tuple 17 | before. 
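A minimal illustrative example (an editorial sketch using the `LTPOutput` subclass defined below; only `cws` and `pos` are set, so integer indexing and `to_tuple` skip the rest):

```python
out = LTPOutput(cws=[["他"]], pos=[["r"]])
assert out["cws"] is out.cws                 # key access and attribute access agree
assert out[0] == out["cws"]                  # integer indexing ignores unset (None) fields
assert out.to_tuple() == (out.cws, out.pos)  # ner/srl/dep/... are omitted
```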
18 | 19 | 20 | """ 21 | 22 | def __post_init__(self): 23 | class_fields = fields(self) 24 | 25 | # Safety and consistency checks 26 | if not len(class_fields): 27 | raise ValueError(f"{self.__class__.__name__} has no fields.") 28 | if not all(field.default is None for field in class_fields[1:]): 29 | raise ValueError(f"{self.__class__.__name__} should not have more than one required field.") 30 | 31 | for field in class_fields: 32 | v = getattr(self, field.name) 33 | if v is not None: 34 | self[field.name] = v 35 | 36 | def __delitem__(self, *args, **kwargs): 37 | raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") 38 | 39 | def setdefault(self, *args, **kwargs): 40 | raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") 41 | 42 | def pop(self, *args, **kwargs): 43 | raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") 44 | 45 | def update(self, *args, **kwargs): 46 | raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") 47 | 48 | def __getitem__(self, k): 49 | if isinstance(k, str): 50 | inner_dict = {k: v for (k, v) in self.items()} 51 | return inner_dict[k] 52 | else: 53 | return self.to_tuple()[k] 54 | 55 | def __setattr__(self, name, value): 56 | if name in self.keys() and value is not None: 57 | # Don't call self.__setitem__ to avoid recursion errors 58 | super().__setitem__(name, value) 59 | super().__setattr__(name, value) 60 | 61 | def __setitem__(self, key, value): 62 | # Will raise a KeyException if needed 63 | super().__setitem__(key, value) 64 | # Don't call self.__setattr__ to avoid recursion errors 65 | super().__setattr__(key, value) 66 | 67 | def to_tuple(self) -> Tuple[Any]: 68 | """Convert self to a tuple containing all the attributes/keys that are not `None`.""" 69 | return tuple(self[k] for k in self.keys()) 70 | 71 | 72 | @dataclass 73 | class LTPOutput(ModelOutput): 74 | cws: Optional[Union[List[str], List[List[str]]]] = None 75 | pos: Optional[Union[List[str], List[List[str]]]] = None 76 | ner: Optional[Union[List[str], List[List[str]]]] = None 77 | srl: Optional[Union[List[str], List[List[str]]]] = None 78 | dep: Optional[Union[List[str], List[List[str]]]] = None 79 | sdp: Optional[Union[List[str], List[List[str]]]] = None 80 | sdpg: Optional[Union[List[str], List[List[str]]]] = None 81 | -------------------------------------------------------------------------------- /python/interface/ltp/module.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | 3 | import torch 4 | from torch.nn import Module 5 | 6 | 7 | class BaseModule(Module): 8 | __jit_unused_properties__ = ["device", "dtype"] 9 | 10 | def __init__(self): 11 | super().__init__() 12 | self._dtype = torch.get_default_dtype() 13 | self._device = torch.device("cpu") 14 | 15 | @property 16 | def dtype(self) -> Union[str, torch.dtype]: 17 | return self._dtype 18 | 19 | @dtype.setter 20 | def dtype(self, new_dtype: Union[str, torch.dtype]): 21 | # necessary to avoid infinite recursion 22 | raise RuntimeError("Cannot set the dtype explicitly. Please use module.to(new_dtype).") 23 | 24 | @property 25 | def device(self) -> Union[str, torch.device]: 26 | return self._device 27 | 28 | @device.setter 29 | def device(self, new_device: Union[str, torch.device]): 30 | raise RuntimeError("Cannot set the device explicitly. 
Please use module.to(new_device).") 31 | 32 | def to(self, *args, **kwargs) -> Module: 33 | out = torch._C._nn._parse_to(*args, **kwargs) 34 | self.__update_properties(device=out[0], dtype=out[1]) 35 | return super().to(*args, **kwargs) 36 | 37 | def cuda(self, device: Optional[int] = None) -> Module: 38 | self.__update_properties(device=torch.device("cuda", index=device)) 39 | return super().cuda(device=device) 40 | 41 | def cpu(self) -> Module: 42 | self.__update_properties(device=torch.device("cpu")) 43 | return super().cpu() 44 | 45 | def type(self, dst_type: Union[str, torch.dtype]) -> Module: 46 | self.__update_properties(dtype=dst_type) 47 | return super().type(dst_type=dst_type) 48 | 49 | def float(self) -> Module: 50 | self.__update_properties(dtype=torch.float) 51 | return super().float() 52 | 53 | def double(self) -> Module: 54 | self.__update_properties(dtype=torch.double) 55 | return super().double() 56 | 57 | def half(self) -> Module: 58 | self.__update_properties(dtype=torch.half) 59 | return super().half() 60 | 61 | def __update_properties(self, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None): 62 | def apply_fn(module): 63 | if not isinstance(module, BaseModule): 64 | return 65 | if device is not None: 66 | module._device = device 67 | if dtype is not None: 68 | module._dtype = dtype 69 | 70 | self.apply(apply_fn) 71 | -------------------------------------------------------------------------------- /python/interface/ltp/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def get_pylogger(name=__name__) -> logging.Logger: 5 | logger = logging.getLogger(name) 6 | return logger 7 | -------------------------------------------------------------------------------- /python/interface/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pytest.ini_options] 2 | addopts = [ 3 | "--color=yes", 4 | "--durations=0", 5 | "--strict-markers", 6 | "--doctest-modules", 7 | ] 8 | filterwarnings = [ 9 | "ignore::DeprecationWarning", 10 | "ignore::UserWarning", 11 | ] 12 | log_cli = "True" 13 | markers = [ 14 | "slow: slow tests", 15 | ] 16 | minversion = "6.0" 17 | testpaths = "tests/" 18 | 19 | [tool.coverage.report] 20 | exclude_lines = [ 21 | "pragma: nocover", 22 | "raise NotImplementedError", 23 | "raise NotImplementedError()", 24 | "if __name__ == .__main__.:", 25 | ] 26 | 27 | [tool.ruff] 28 | # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. 29 | select = ["E", "F"] 30 | ignore = [] 31 | 32 | # Allow autofix for all enabled rules (when `--fix`) is provided. 33 | fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] 34 | unfixable = [] 35 | 36 | # Exclude a variety of commonly ignored directories. 37 | exclude = [ 38 | ".bzr", 39 | ".direnv", 40 | ".eggs", 41 | ".git", 42 | ".git-rewrite", 43 | ".hg", 44 | ".mypy_cache", 45 | ".nox", 46 | ".pants.d", 47 | ".pytype", 48 | ".ruff_cache", 49 | ".svn", 50 | ".tox", 51 | ".venv", 52 | "__pypackages__", 53 | "_build", 54 | "buck-out", 55 | "build", 56 | "dist", 57 | "node_modules", 58 | "venv", 59 | "docs", 60 | "examples" 61 | ] 62 | per-file-ignores = { } 63 | 64 | # Same as Black. 
65 | line-length = 120 66 | 67 | # Allow unused variables when underscore-prefixed. 68 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 69 | 70 | # Assume Python 3.10. 71 | target-version = "py310" 72 | 73 | [tool.ruff.mccabe] 74 | # Unlike Flake8, default to a complexity level of 10. 75 | max-complexity = 10 -------------------------------------------------------------------------------- /python/interface/requirements.txt: -------------------------------------------------------------------------------- 1 | ltp_core>=0.1.0 2 | ltp_extension>=0.1.0 3 | -------------------------------------------------------------------------------- /python/interface/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | project_dir, _ = os.path.split(__file__) 6 | 7 | with open(os.path.join(project_dir, "README.md"), encoding="utf-8") as fh: 8 | long_description = fh.read() 9 | 10 | setup( 11 | name="ltp", 12 | version="4.2.14", 13 | author="Yunlong Feng", 14 | author_email="ylfeng@ir.hit.edu.cn", 15 | url="https://github.com/HIT-SCIR/ltp", 16 | description="Language Technology Platform", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | install_requires=[ 20 | "ltp_core>=0.1.3", 21 | "ltp_extension>=0.1.9", 22 | "huggingface_hub>=0.8.0", 23 | ], 24 | classifiers=[ 25 | "Development Status :: 1 - Planning", 26 | "Operating System :: OS Independent", 27 | "Intended Audience :: Developers", 28 | "Programming Language :: Python :: 3.6", 29 | "Programming Language :: Python :: 3.7", 30 | "Programming Language :: Python :: 3.8", 31 | "Programming Language :: Python :: 3.9", 32 | "Programming Language :: Python :: 3.10", 33 | "Programming Language :: Python :: 3.11", 34 | "Topic :: Software Development :: Libraries", 35 | ], 36 | packages=find_packages(), 37 | python_requires=">=3.6, <4", 38 | zip_safe=True, 39 | ) 40 | -------------------------------------------------------------------------------- /python/interface/utils/upload_models.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | from huggingface_hub import CommitOperationAdd, HfApi 4 | 5 | 6 | def upload_model(model_dir, repo_id): 7 | api = HfApi() 8 | operations = [ 9 | CommitOperationAdd( 10 | path_in_repo="config.json", 11 | path_or_fileobj=os.path.join(model_dir, "config.json"), 12 | ), 13 | CommitOperationAdd( 14 | path_in_repo="pytorch_model.bin", 15 | path_or_fileobj=os.path.join(model_dir, "pytorch_model.bin"), 16 | ), 17 | CommitOperationAdd( 18 | path_in_repo="vocab.txt", 19 | path_or_fileobj=os.path.join(model_dir, "vocab.txt"), 20 | ), 21 | ] 22 | 23 | api.create_commit( 24 | repo_id=repo_id, 25 | operations=operations, 26 | commit_message="Uploaded model", 27 | ) 28 | 29 | 30 | def upload_tokenizer(model_dir, repo_id): 31 | api = HfApi() 32 | operations = [ 33 | CommitOperationAdd( 34 | path_in_repo="added_tokens.json", 35 | path_or_fileobj=os.path.join(model_dir, "added_tokens.json"), 36 | ), 37 | CommitOperationAdd( 38 | path_in_repo="special_tokens_map.json", 39 | path_or_fileobj=os.path.join(model_dir, "special_tokens_map.json"), 40 | ), 41 | CommitOperationAdd( 42 | path_in_repo="tokenizer.json", 43 | path_or_fileobj=os.path.join(model_dir, "tokenizer.json"), 44 | ), 45 | CommitOperationAdd( 46 | path_in_repo="tokenizer_config.json", 47 | path_or_fileobj=os.path.join(model_dir, "tokenizer_config.json"), 48 | 
), 49 | ] 50 | 51 | api.create_commit( 52 | repo_id=repo_id, 53 | operations=operations, 54 | commit_message="Uploaded tokenizer", 55 | ) 56 | 57 | 58 | def upload_readme(model_dir, repo_id): 59 | api = HfApi() 60 | operations = [ 61 | CommitOperationAdd( 62 | path_in_repo="README.md", 63 | path_or_fileobj="README.md", 64 | ), 65 | ] 66 | 67 | api.create_commit( 68 | repo_id=repo_id, 69 | operations=operations, 70 | commit_message="Uploaded model", 71 | ) 72 | 73 | 74 | def main(): 75 | for model in ["legacy", "tiny", "small", "base", "base1", "base2"]: 76 | upload_readme(None, repo_id=f"LTP/{model}") 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /rust/ltp-cffi/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ltp-cffi" 3 | version = "0.1.0" 4 | edition = "2021" 5 | authors = ["ylfeng "] 6 | description = "The C bindings for LTP." 7 | homepage = "https://github.com/HIT-SCIR/ltp" 8 | repository = "https://github.com/HIT-SCIR/ltp" 9 | keywords = ["ltp", "nlp"] 10 | exclude = [".github"] 11 | readme = "README.md" 12 | license-file = "LICENSE" 13 | 14 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 15 | 16 | [lib] 17 | name = "ltp" 18 | path = "src/lib.rs" 19 | crate-type = ["cdylib", "staticlib"] 20 | 21 | [dependencies] 22 | rayon = { version = "1.5" } 23 | ltp = { version = "*", path = "../ltp", features = ["serialization", "parallel"] } 24 | mimalloc = { version = "0.1", default-features = false, optional = true } 25 | 26 | [features] 27 | malloc = ["mimalloc"] 28 | secure = ["mimalloc/secure"] 29 | -------------------------------------------------------------------------------- /rust/ltp-cffi/LICENSE: -------------------------------------------------------------------------------- 1 | 1. 语言技术平台面向国内外大学、中科院各研究所以及个人研究者免费开放源代码,但如上述机构和个人将该平台用于商业目的(如企业合作项目等)则需要付费。 2 | 2. 除上述机构以外的企事业单位,如申请使用该平台,需付费。 3 | 3. 凡涉及付费问题,请发邮件到 car@ir.hit.edu.cn 洽商。 4 | 4. 如果您在 LTP 基础上发表论文或取得科研成果,请您在发表论文和申报成果时声明“使用了哈工大社会计算与信息检索研究中心研制的语言技术平台(LTP)”. 5 | 同时,发信给car@ir.hit.edu.cn,说明发表论文或申报成果的题目、出处等。 6 | -------------------------------------------------------------------------------- /rust/ltp-cffi/README.md: -------------------------------------------------------------------------------- 1 | # LTP CFFI 2 | 3 | The C bindings for `LTP for Rust`. 4 | -------------------------------------------------------------------------------- /rust/ltp-cffi/cbindgen.toml: -------------------------------------------------------------------------------- 1 | # This is a template cbindgen.toml file with all of the default values. 2 | # Some values are commented out because their absence is the real default. 3 | # 4 | # See https://github.com/eqrion/cbindgen/blob/master/docs.md#cbindgentoml 5 | # for detailed documentation of every option here. 6 | language = "C" 7 | 8 | ############## Options for Wrapping the Contents of the Header ################# 9 | 10 | # header = "/* Text to put at the beginning of the generated file. Probably a license. */" 11 | # trailer = "/* Text to put at the end of the generated file */" 12 | include_guard = "LTP_BINDINGS_DEFINE" 13 | pragma_once = true 14 | autogen_warning = "/* Warning, this file is autogenerated by cbindgen. Don't modify this manually. 
*/" 15 | include_version = false 16 | # namespace = "ltp" 17 | namespaces = [] 18 | using_namespaces = [] 19 | sys_includes = [] 20 | includes = [] 21 | no_includes = false 22 | cpp_compat = true 23 | after_includes = "" 24 | 25 | ############################ Code Style Options ################################ 26 | 27 | braces = "SameLine" 28 | line_length = 100 29 | tab_width = 2 30 | documentation = true 31 | documentation_style = "auto" 32 | documentation_length = "full" 33 | line_endings = "LF" # also "CR", "CRLF", "Native" 34 | 35 | ############################# Codegen Options ################################## 36 | 37 | style = "both" 38 | sort_by = "Name" # default for `fn.sort_by` and `const.sort_by` 39 | usize_is_size_t = true 40 | 41 | [defines] 42 | # "target_os = freebsd" = "DEFINE_FREEBSD" 43 | # "feature = serde" = "DEFINE_SERDE" 44 | 45 | [export] 46 | include = [] 47 | exclude = [] 48 | # prefix = "CAPI_" 49 | item_types = [] 50 | renaming_overrides_prefixing = false 51 | 52 | [export.rename] 53 | 54 | [export.body] 55 | 56 | [export.mangle] 57 | 58 | [fn] 59 | rename_args = "None" 60 | # must_use = "MUST_USE_FUNC" 61 | # no_return = "NO_RETURN" 62 | # prefix = "START_FUNC" 63 | # postfix = "END_FUNC" 64 | args = "auto" 65 | sort_by = "Name" 66 | 67 | [struct] 68 | rename_fields = "None" 69 | # must_use = "MUST_USE_STRUCT" 70 | derive_constructor = false 71 | derive_eq = false 72 | derive_neq = false 73 | derive_lt = false 74 | derive_lte = false 75 | derive_gt = false 76 | derive_gte = false 77 | 78 | [enum] 79 | rename_variants = "None" 80 | # must_use = "MUST_USE_ENUM" 81 | add_sentinel = false 82 | prefix_with_name = false 83 | derive_helper_methods = false 84 | derive_const_casts = false 85 | derive_mut_casts = false 86 | # cast_assert_name = "ASSERT" 87 | derive_tagged_enum_destructor = false 88 | derive_tagged_enum_copy_constructor = false 89 | enum_class = true 90 | private_default_tagged_enum_constructor = false 91 | 92 | [const] 93 | allow_static_const = true 94 | allow_constexpr = false 95 | sort_by = "Name" 96 | 97 | [macro_expansion] 98 | bitflags = false 99 | 100 | ############## Options for How Your Rust library Should Be Parsed ############## 101 | 102 | [parse] 103 | parse_deps = false 104 | # include = [] 105 | exclude = [] 106 | clean = false 107 | extra_bindings = [] 108 | 109 | [parse.expand] 110 | crates = [] 111 | all_features = false 112 | default_features = true 113 | features = [] 114 | -------------------------------------------------------------------------------- /rust/ltp-cffi/examples/example.c: -------------------------------------------------------------------------------- 1 | // 2 | // Created by 冯云龙 on 2022/8/12. 
3 | // 4 | #include <assert.h> 5 | #include <stdio.h> 6 | #include <stdlib.h> 7 | #include <string.h> 8 | #include "ltp.h" 9 | 10 | #define MAX_WORD_LEN (10) 11 | 12 | struct State { 13 | char **results; 14 | size_t *lengths; 15 | }; 16 | 17 | void store_results(struct State *state, const uint8_t *word, size_t word_len, size_t idx, size_t length) { 18 | state->results[idx] = malloc(word_len + 1); 19 | state->lengths[idx] = word_len; 20 | 21 | strncpy(state->results[idx], (const char *) word, word_len); 22 | state->results[idx][word_len] = '\0'; 23 | 24 | if (idx < length - 1) { 25 | printf("%s ", state->results[idx]); 26 | } else { 27 | printf("%s\n", state->results[idx]); 28 | } 29 | 30 | } 31 | int main() { 32 | const char *cws_model_path = "data/legacy-models/cws_model.bin"; 33 | const char *pos_model_path = "data/legacy-models/pos_model.bin"; 34 | const char *ner_model_path = "data/legacy-models/ner_model.bin"; 35 | Model *cws_model = NULL; 36 | cws_model = model_load(cws_model_path); 37 | Model *pos_model = NULL; 38 | pos_model = model_load(pos_model_path); 39 | Model *ner_model = NULL; 40 | ner_model = model_load(ner_model_path); 41 | 42 | const char *sentence = "他叫汤姆去拿外衣"; 43 | size_t word_length[MAX_WORD_LEN] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 44 | size_t pos_length[MAX_WORD_LEN] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 45 | size_t ner_length[MAX_WORD_LEN] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 46 | char *words[MAX_WORD_LEN] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}; 47 | char *pos[MAX_WORD_LEN] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}; 48 | char *ner[MAX_WORD_LEN] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}; 49 | 50 | struct State word_state = {words, word_length}; 51 | struct State pos_state = {pos, pos_length}; 52 | struct State ner_state = {ner, ner_length}; 53 | 54 | Callback cws_callback = {&word_state, store_results}; 55 | size_t length = model_cws_predict(cws_model, sentence, strlen(sentence), cws_callback); 56 | 57 | Callback pos_callback = {&pos_state, store_results}; 58 | model_pos_predict(pos_model, words, word_length, length, pos_callback); 59 | 60 | Callback ner_callback = {&ner_state, store_results}; 61 | model_ner_predict(ner_model, words, word_length, pos, pos_length, length, ner_callback); 62 | 63 | for (size_t i = 0; i < MAX_WORD_LEN; i++) { 64 | if (words[i] != NULL) { free(words[i]); words[i]=NULL;} 65 | if (pos[i] != NULL) { free(pos[i]); pos[i]=NULL;} 66 | if (ner[i] != NULL) { free(ner[i]); ner[i]=NULL;} 67 | } 68 | 69 | model_release(&cws_model); 70 | model_release(&pos_model); 71 | model_release(&ner_model); 72 | 73 | assert(cws_model == NULL); 74 | assert(pos_model == NULL); 75 | assert(ner_model == NULL); 76 | 77 | return 0; 78 | } 79 |
-------------------------------------------------------------------------------- /rust/ltp-cffi/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "mimalloc")] 2 | use mimalloc::MiMalloc; 3 | 4 | #[cfg(feature = "mimalloc")] 5 | #[global_allocator] 6 | static GLOBAL: MiMalloc = MiMalloc; 7 | 8 | use std::ffi::c_void; 9 | 10 | pub mod model; 11 | pub mod stnsplit; 12 | 13 | /// The LTP CFFI API.
14 | /// the call args: 15 | /// state: your own state pointer, passed back unchanged on every call 16 | /// tag: the predicted tag 17 | /// tag_len: the length of tag 18 | /// tag_index: the index of the current prediction 19 | /// tag_total: the total number of predictions 20 | #[repr(C)] 21 | pub struct Callback { 22 | pub state: *mut c_void, 23 | // state, char*, char_len, current idx, max_num 24 | pub call: extern "C" fn(*mut c_void, *const u8, usize, usize, usize), 25 | } 26 | 27 | /// The LTP CFFI API. 28 | /// the call args: 29 | /// state: your own state pointer, passed back unchanged on every call 30 | /// tag: the predicted tag 31 | /// tag_len: the length of tag 32 | /// tag_index: the index of the current prediction 33 | /// tag_total: the total number of predictions 34 | /// batch_index: the index of the current item in the batch 35 | /// batch_total: the number of items in the batch 36 | #[repr(C)] 37 | pub struct BatchCallback { 38 | pub state: *mut c_void, 39 | // state, char*, char_len, tag idx, tag num, batch index, batch num 40 | pub call: extern "C" fn(*mut c_void, *const u8, usize, usize, usize, usize, usize), 41 | } 42 |
-------------------------------------------------------------------------------- /rust/ltp-cffi/src/stnsplit.rs: -------------------------------------------------------------------------------- 1 | use crate::Callback; 2 | use ltp::utils::stnsplit::{ 3 | stn_split as r_stn_split, stn_split_with_options as r_stn_split_with_options, SplitOptions, 4 | }; 5 | use std::slice; 6 | 7 | #[no_mangle] 8 | #[allow(clippy::not_unsafe_ptr_arg_deref)] 9 | pub extern "C" fn stn_split(text: *const u8, text_len: usize, callback: Callback) -> usize { 10 | let text = unsafe { std::str::from_utf8_unchecked(slice::from_raw_parts(text, text_len)) }; 11 | let sentences = r_stn_split(text); 12 | for (idx, sentence) in sentences.iter().enumerate() { 13 | (callback.call)( 14 | callback.state, 15 | sentence.as_ptr(), // the sentence slice, not the whole input text 16 | sentence.len(), 17 | idx, 18 | sentences.len(), // total number of sentences 19 | ); 20 | } 21 | sentences.len() 22 | } 23 | 24 | #[no_mangle] 25 | #[allow(clippy::not_unsafe_ptr_arg_deref)] 26 | pub extern "C" fn stn_split_with_options( 27 | text: *const u8, 28 | text_len: usize, 29 | callback: Callback, 30 | use_zh: bool, 31 | use_en: bool, 32 | bracket_as_entity: bool, 33 | zh_quote_as_entity: bool, 34 | en_quote_as_entity: bool, 35 | ) -> usize { 36 | let text = unsafe { std::str::from_utf8_unchecked(slice::from_raw_parts(text, text_len)) }; 37 | let options = SplitOptions { 38 | use_zh, 39 | use_en, 40 | bracket_as_entity, 41 | zh_quote_as_entity, 42 | en_quote_as_entity, 43 | }; 44 | 45 | let sentences = r_stn_split_with_options(text, &options); 46 | for (idx, sentence) in sentences.iter().enumerate() { 47 | (callback.call)( 48 | callback.state, 49 | sentence.as_ptr(), 50 | sentence.len(), 51 | idx, 52 | sentences.len(), // total number of sentences 53 | ); 54 | } 55 | sentences.len() 56 | } 57 |
-------------------------------------------------------------------------------- /rust/ltp/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ltp" 3 | version = "0.1.9" 4 | edition = "2021" 5 | authors = ["ylfeng "] 6 | description = "Language Technology Platform For Rust."
7 | homepage = "https://github.com/HIT-SCIR/ltp" 8 | repository = "https://github.com/HIT-SCIR/ltp" 9 | keywords = ["ltp", "nlp"] 10 | exclude = [".github"] 11 | readme = "README.md" 12 | license-file = "LICENSE" 13 | 14 | [[example]] 15 | name = "cws" 16 | path = "examples/cws.rs" 17 | required-features = ["serialization", "parallel"] 18 | 19 | [[example]] 20 | name = "pos" 21 | path = "examples/pos.rs" 22 | required-features = ["serialization", "parallel"] 23 | 24 | [[example]] 25 | name = "ner" 26 | path = "examples/ner.rs" 27 | required-features = ["serialization", "parallel"] 28 | 29 | [[example]] 30 | name = "simple" 31 | path = "examples/simple.rs" 32 | required-features = ["serialization", "parallel"] 33 | 34 | [dependencies] 35 | anyhow = "1" 36 | num-traits = "0.2" 37 | itertools = "0.14" 38 | 39 | cedarwood = "0.4" 40 | 41 | # 断句避免过多内存申请 42 | smallvec = { version = "1" } 43 | # 数据集 shuffle 44 | rand = { version = "0.9" } 45 | # 特征裁剪 46 | binary-heap-plus = { version = "0.5" } 47 | 48 | # 并行 49 | rayon = { version = "1.5", optional = true } 50 | 51 | # 序列化 52 | serde = { version = "1.0", features = ["derive"], optional = true } 53 | serde_json = { version = "1.0", optional = true } 54 | apache-avro = { version = "0.18.0", optional = true } 55 | 56 | # Todo: Nocopy Serialize 更快地加载速度 57 | compact_str = { version = "0.9", optional = true } 58 | rkyv = { version = "0.8", optional = true } 59 | 60 | [features] 61 | default = [] 62 | char-type = [] 63 | cross-char = [] 64 | near-char-type = [] 65 | parallel = ["rayon"] 66 | serialization = ["serde", "serde_json", "apache-avro"] 67 | 68 | [dev-dependencies] 69 | clap = { version = "4", features = ["derive"] } 70 | 71 | ndarray = "0.16" 72 | ndarray-npy = { version = "0.9", features = ["npz"] } 73 | -------------------------------------------------------------------------------- /rust/ltp/LICENSE: -------------------------------------------------------------------------------- 1 | 1. 语言技术平台面向国内外大学、中科院各研究所以及个人研究者免费开放源代码,但如上述机构和个人将该平台用于商业目的(如企业合作项目等)则需要付费。 2 | 2. 除上述机构以外的企事业单位,如申请使用该平台,需付费。 3 | 3. 凡涉及付费问题,请发邮件到 car@ir.hit.edu.cn 洽商。 4 | 4. 如果您在 LTP 基础上发表论文或取得科研成果,请您在发表论文和申报成果时声明“使用了哈工大社会计算与信息检索研究中心研制的语言技术平台(LTP)”. 
5 | 同时,发信给car@ir.hit.edu.cn,说明发表论文或申报成果的题目、出处等。 6 |
-------------------------------------------------------------------------------- /rust/ltp/examples/simple.rs: -------------------------------------------------------------------------------- 1 | use itertools::multizip; 2 | use ltp::{CWSModel, Codec, Format, ModelSerde, NERModel, POSModel}; 3 | use std::fs::File; 4 | 5 | fn main() -> Result<(), Box<dyn std::error::Error>> { 6 | let file = File::open("data/legacy-models/cws_model.bin")?; 7 | let cws: CWSModel = ModelSerde::load(file, Format::AVRO(Codec::Deflate))?; 8 | let file = File::open("data/legacy-models/pos_model.bin")?; 9 | let pos: POSModel = ModelSerde::load(file, Format::AVRO(Codec::Deflate))?; 10 | let file = File::open("data/legacy-models/ner_model.bin")?; 11 | let ner: NERModel = ModelSerde::load(file, Format::AVRO(Codec::Deflate))?; 12 | 13 | let words = cws.predict("他叫汤姆去拿外衣。")?; 14 | let pos = pos.predict(&words)?; 15 | let ner = ner.predict((&words, &pos))?; 16 | 17 | for (w, p, n) in multizip((words, pos, ner)) { 18 | println!("{}/{}/{}", w, p, n); 19 | } 20 | 21 | Ok(()) 22 | } 23 |
-------------------------------------------------------------------------------- /rust/ltp/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod perceptron; 2 | pub mod utils; 3 | 4 | pub use perceptron::{ 5 | Algorithm, CWSDefinition, NERDefinition, POSDefinition, PaMode, Perceptron, Trainer, 6 | }; 7 | #[cfg(feature = "serialization")] 8 | pub use perceptron::{Codec, Format, ModelSerde, Reader, SerdeModel, SerdeCWSModel, SerdePOSModel, SerdeNERModel}; 9 | 10 | #[cfg(feature = "serialization")] 11 | pub type CWSModel = SerdeCWSModel<f64>; 12 | #[cfg(feature = "serialization")] 13 | pub type POSModel = SerdePOSModel<f64>; 14 | #[cfg(feature = "serialization")] 15 | pub type NERModel = SerdeNERModel<f64>; 16 | 17 | 18 |
-------------------------------------------------------------------------------- /rust/ltp/src/perceptron/definition/mod.rs: -------------------------------------------------------------------------------- 1 | mod cws; 2 | mod ner; 3 | mod pos; 4 | 5 | use anyhow::Result; 6 | use std::collections::HashSet; 7 | use std::fmt::Debug; 8 | use std::io::Read; 9 | 10 | use crate::utils::get_entities; 11 | use crate::perceptron::Sample; 12 | pub use cws::CWSDefinition; 13 | pub use ner::NERDefinition; 14 | pub use pos::POSDefinition; 15 | 16 | #[macro_export]
17 | macro_rules! buf_feature { 18 | ($dst:expr, $feat:tt, $($arg:tt)*) => { 19 | write!($dst, $($arg)*)?; 20 | $feat.push($dst.len()); 21 | }; 22 | } 23 | 24 | pub trait CommonDefinePredict {} 25 | 26 | impl CommonDefinePredict for POSDefinition {} 27 | 28 | impl CommonDefinePredict for NERDefinition {} 29 | 30 | pub trait GenericItem<'a> { 31 | type Item; 32 | } 33 | 34 | pub trait Definition: Default + Debug + Clone { 35 | type Fragment: ?Sized + for<'any> GenericItem<'any>; 36 | type Prediction: ?Sized + for<'any> GenericItem<'any>; 37 | type RawFeature: ?Sized + for<'any> GenericItem<'any>; 38 | 39 | fn use_viterbi(&self) -> bool { 40 | false 41 | } 42 | 43 | fn labels(&self) -> Vec<String>; 44 | 45 | fn label_num(&self) -> usize; 46 | 47 | fn label_to(&self, label: &str) -> usize; 48 | 49 | fn to_label(&self, index: usize) -> &str; 50 | 51 | #[allow(clippy::type_complexity)] 52 | fn parse_features( 53 | &self, 54 | raw: &<Self::RawFeature as GenericItem>::Item, 55 | ) -> Result<(<Self::Fragment as GenericItem>::Item, Vec<Vec<usize>>)>; 56 | 57 | #[allow(clippy::type_complexity)] 58 | fn parse_features_with_buffer<'a>( 59 | &self, 60 | raw: &<Self::RawFeature as GenericItem>::Item, 61 | buf: &'a mut Vec<u8>, 62 | ) -> Result<(<Self::Fragment as GenericItem<'a>>::Item, Vec<Vec<usize>>)>; 63 | 64 | fn parse_gold_features<R: Read>(&self, reader: R) -> Result<Vec<Sample>>; 65 | 66 | fn to_labels(&self, index: &[usize]) -> Vec<&str> { 67 | index.iter().map(|&p| self.to_label(p)).collect() 68 | } 69 | 70 | fn predict( 71 | &self, 72 | raw: &<Self::RawFeature as GenericItem>::Item, 73 | fragments: &<Self::Fragment as GenericItem>::Item, 74 | preds: &[usize], 75 | ) -> <Self::Prediction as GenericItem>::Item; 76 | 77 | fn evaluate(&self, predicts: &[usize], labels: &[usize]) -> (usize, usize, usize); 78 | 79 | fn evaluate_tags(&self, predicts: &[usize], labels: &[usize]) -> (usize, usize, usize) { 80 | ( 81 | predicts 82 | .iter() 83 | .zip(labels.iter()) 84 | .map(|(p, l)| if p == l { 1usize } else { 0usize }) 85 | .sum::<usize>(), 86 | predicts.len(), 87 | labels.len(), 88 | ) 89 | } 90 | 91 | fn evaluate_entities(&self, predicts: &[usize], labels: &[usize]) -> (usize, usize, usize) { 92 | let predicts = self.to_labels(predicts); 93 | let labels = self.to_labels(labels); 94 | 95 | let predicts: HashSet<_> = get_entities(&predicts).into_iter().collect(); 96 | let labels: HashSet<_> = get_entities(&labels).into_iter().collect(); 97 | 98 | let correct = predicts.intersection(&labels).count(); 99 | (correct, predicts.len(), labels.len()) 100 | } 101 | } 102 | --------------------------------------------------------------------------------
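An illustrative Python sketch (not crate code) of what `evaluate_entities` above computes: both label sequences are decoded into entity spans, and the spans are compared as sets to get the (correct, predicted, gold) counts used for precision and recall. `to_spans` is a hypothetical stand-in for the crate's `get_entities`:

```python
def to_spans(tags):
    """Hypothetical BIO decoder standing in for get_entities."""
    spans, start = [], None
    for i, tag in enumerate(tags + ["O"]):  # sentinel flushes the last span
        if tag.startswith("B-") or tag == "O":
            if start is not None:
                spans.append((tags[start][2:], start, i - 1))
                start = None
            if tag.startswith("B-"):
                start = i
    return set(spans)

pred = to_spans(["B-Nh", "I-Nh", "O"])
gold = to_spans(["B-Nh", "O", "O"])
correct = len(pred & gold)  # set intersection, as in evaluate_entities
print(correct, len(pred), len(gold))  # -> 0 1 1 (spans must match exactly)
```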
/rust/ltp/src/perceptron/feature.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::ops::Deref; 3 | 4 | pub trait TraitFeature { 5 | fn get_with_key(&self, key: &str) -> Option<usize>; 6 | fn get_vector_str(&self, key: &[&str]) -> Vec<usize> { 7 | key.iter() 8 | .map(|k| self.get_with_key(k)) 9 | .into_iter() 10 | .flatten() 11 | .collect() 12 | } 13 | fn get_vector_string(&self, key: &[String]) -> Vec<usize> { 14 | key.iter() 15 | .map(|k| self.get_with_key(k)) 16 | .into_iter() 17 | .flatten() 18 | .collect() 19 | } 20 | } 21 | pub trait TraitFeatureCompressUtils: Default + IntoIterator<Item = (String, usize)> { 22 | fn features(self) -> Vec<(String, usize)>; 23 | } 24 | 25 | impl<T> TraitFeatureCompressUtils for T 26 | where 27 | T: Default + IntoIterator<Item = (String, usize)>, 28 | { 29 | fn features(self) -> Vec<(String, usize)> { 30 | self.into_iter().collect() 31 | } 32 | } 33 | 34 | pub trait TraitFeaturesTrainUtils: Clone { 35 | fn feature_num(&self) -> usize; 36 | fn insert_feature(&mut self, key: String, value: usize); 37 | fn remove_feature(&mut self, key: &str) -> Option<usize>; 38 | fn put_feature(&mut self, key: String, value: usize); 39 | fn del_feature(&mut self, key: &str) -> Option<usize>; 40 | } 41 | 42 | impl<T> TraitFeature for &T 43 | where 44 | T: TraitFeature, 45 | { 46 | fn get_with_key(&self, key: &str) -> Option<usize> { 47 | self.deref().get_with_key(key) 48 | } 49 | } 50 | 51 | impl<T> TraitFeaturesTrainUtils for &T 52 | where 53 | T: TraitFeaturesTrainUtils, 54 | { 55 | fn feature_num(&self) -> usize { 56 | self.deref().feature_num() 57 | } 58 | 59 | fn insert_feature(&mut self, key: String, value: usize) { 60 | self.deref().put_feature(key, value) 61 | } 62 | 63 | fn remove_feature(&mut self, key: &str) -> Option<usize> { 64 | self.deref().del_feature(key) 65 | } 66 | 67 | fn put_feature(&mut self, key: String, value: usize) { 68 | self.deref().insert_feature(key, value) 69 | } 70 | 71 | fn del_feature(&mut self, key: &str) -> Option<usize> { 72 | self.deref().remove_feature(key) 73 | } 74 | } 75 | 76 | // HashMap 77 | 78 | impl TraitFeature for HashMap<String, usize> { 79 | fn get_with_key(&self, key: &str) -> Option<usize> { 80 | self.get(key).copied() 81 | } 82 | } 83 | 84 | impl TraitFeaturesTrainUtils for HashMap<String, usize> { 85 | fn feature_num(&self) -> usize { 86 | self.len() 87 | } 88 | 89 | fn insert_feature(&mut self, key: String, value: usize) { 90 | self.insert(key, value); 91 | } 92 | 93 | fn remove_feature(&mut self, key: &str) -> Option<usize> { 94 | self.remove(key) 95 | } 96 | 97 | fn put_feature(&mut self, key: String, value: usize) { 98 | self.insert(key, value); 99 | } 100 | 101 | fn del_feature(&mut self, key: &str) -> Option<usize> { 102 | self.remove(key) 103 | } 104 | } 105 |
-------------------------------------------------------------------------------- /rust/ltp/src/perceptron/mod.rs: -------------------------------------------------------------------------------- 1 | mod definition; 2 | mod feature; 3 | mod model; 4 | mod parameter; 5 | #[cfg(feature = "serialization")] 6 | mod serialization; 7 | mod trainer; 8 | 9 | pub use definition::{CWSDefinition, Definition, GenericItem, NERDefinition, POSDefinition}; 10 | pub use feature::{TraitFeature, TraitFeatureCompressUtils, TraitFeaturesTrainUtils}; 11 | pub use model::{PaMode, Perceptron}; 12 | pub use parameter::{ 13 | TraitParameter, TraitParameterStorage, TraitParameterStorageCompressUtils, 14 | TraitParameterStorageTrainUtils, TraitParameterStorageUtils, 15 | }; 16 | #[cfg(feature = "serialization")] 17 | pub use serialization::{ 18 | schema, Codec, Format, ModelSerde, Reader, Schema, SerdeCWSModel, SerdeModel, SerdeNERModel, 19 | SerdePOSModel, 20 | }; 21 | pub use trainer::{Algorithm, Trainer}; 22 | pub type Sample = (Vec<Vec<usize>>, Vec<usize>);
-------------------------------------------------------------------------------- /rust/ltp/src/perceptron/parameter.rs: -------------------------------------------------------------------------------- 1 | use num_traits::{Float, Num, NumAssignOps}; 2 | use std::ops::{Deref, Index, IndexMut}; 3 | 4 | pub trait TraitParameter: Float + NumAssignOps + Default {} 5 | 6 | impl<T> TraitParameter for T where T: Float + NumAssignOps + Default {} 7 | 8 | pub trait TraitParameterStorageUtils { 9 | fn len(&self) -> usize; 10 | fn is_empty(&self) -> bool { 11 | self.len() == 0 12 | } 13 | } 14 | 15 | impl<T> TraitParameterStorageUtils for &T 16 | where 17 | T: TraitParameterStorageUtils, 18 | { 19 | fn len(&self) -> usize { 20 | self.deref().len() 21 | } 22 | fn is_empty(&self) -> bool { 23 | self.deref().is_empty() 24 | } 25 | } 26 | 27 | pub trait TraitParameterStorage<Param>: 28 | Index<usize, Output = Param> + TraitParameterStorageUtils 29 | where 30 | Param: TraitParameter, 31 | { 32 | } 33 | 34 | impl<T, Param> TraitParameterStorage<Param> for T 35 | where 36 | T: Index<usize, Output = Param> + TraitParameterStorageUtils, 37 | Param: TraitParameter, 38 | { 39 | } 40 | 41 | impl<T> TraitParameterStorageUtils for Vec<T> { 42 | fn len(&self) -> usize { 43 | self.len() 44 | } 45 | fn is_empty(&self) -> bool { 46 | self.is_empty() 47 | } 48 | } 49 | 50 | // 模型训练需要实现的接口 51 | pub trait TraitParameterStorageTrainUtilsInit<Param>: Default { 52 | fn init(value: Param, size: usize) -> Self; 53 | } 54 | impl<Param: Clone> TraitParameterStorageTrainUtilsInit<Param> for Vec<Param> { 55 | fn init(value: Param, size: usize) -> Self { 56 | vec![value; size] 57 | } 58 | } 59 | pub trait TraitParameterStorageTrainUtils<Param>: 60 | Clone 61 | + Index<usize, Output = Param> 62 | + IndexMut<usize, Output = Param> 63 | + TraitParameterStorageTrainUtilsInit<Param> 64 | { 65 | } 66 | impl<T, Param> TraitParameterStorageTrainUtils<Param> for T where 67 | T: Clone 68 | + Index<usize, Output = Param> 69 | + IndexMut<usize, Output = Param> 70 | + TraitParameterStorageTrainUtilsInit<Param> 71 | { 72 | } 73 | 74 | // 模型压缩需要实现的接口 75 | pub trait TraitParameterStorageCompressUtils<Param> { 76 | fn with_capacity(capacity: usize) -> Self; 77 | fn push(&mut self, value: Param); 78 | } 79 | 80 | impl<T> TraitParameterStorageCompressUtils<T> for Vec<T> { 81 | fn with_capacity(capacity: usize) -> Self { 82 | Self::with_capacity(capacity) 83 | } 84 | 85 | fn push(&mut self, value: T) { 86 | self.push(value); 87 | } 88 | } 89 |
-------------------------------------------------------------------------------- /rust/ltp/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod eisner; 2 | pub mod entities; 3 | pub mod hook; 4 | pub mod stnsplit; 5 | pub mod viterbi; 6 | 7 | pub use eisner::eisner; 8 | pub use entities::{drop_get_entities, get_entities}; 9 | pub use stnsplit::{stn_split, stn_split_with_options, SplitOptions}; 10 | pub use viterbi::viterbi_decode_postprocessing; -------------------------------------------------------------------------------- /rust/ltp/src/utils/viterbi.rs: -------------------------------------------------------------------------------- 1 | use num_traits::PrimInt; 2 | 3 | pub fn viterbi_decode_postprocessing<T>( 4 | history: &[T], 5 | last_tags: &[T], 6 | stn_lengths: &[usize], 7 | labels_num: usize, 8 | ) -> Vec<Vec<T>> 9 | where 10 | T: PrimInt, 11 | { 12 | // history 13 | // max_stn_len * stn_num * labels_num 14 | let stn_num: usize = stn_lengths.iter().sum(); 15 | let b_bias = stn_num * labels_num; 16 | let i_bias = labels_num; 17 | 18 | let mut result: Vec<Vec<T>> = Vec::new(); 19 | let mut stn_idx = 0; 20 | for &stn_len in stn_lengths { 21 | for _search_idx in 0..stn_len { 22 | let best_last_tag = last_tags[stn_idx]; 23 | let mut best_tags = vec![best_last_tag]; 24 | 25 | // history 26 | // stn_len * stn_num * labels_num 27 | for search_end in 1..(stn_len) { 28 | // last one has been used 29 | let search_end = (stn_len - 1) - search_end; 30 | let forward_best = *best_tags.last().unwrap(); 31 | let index = 32 | search_end * b_bias + stn_idx * i_bias + forward_best.to_usize().unwrap(); 33 | let last_best = history[index]; 34 | best_tags.push(last_best); 35 | } 36 | best_tags.reverse(); 37 | result.push(best_tags); 38 | stn_idx += 1; 39 | } 40 | } 41 | result 42 | } 43 |
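A minimal Python sketch (editorial illustration, not crate code) of the backtracking implemented above, with `history` flattened exactly as the comments describe — `index = step * (stn_num * labels_num) + stn_idx * labels_num + previous_best_tag`:

```python
def backtrack(history, last_tag, stn_idx, stn_len, stn_num, labels_num):
    b_bias, i_bias = stn_num * labels_num, labels_num
    tags = [last_tag]  # the final tag comes from last_tags
    for step in range(stn_len - 2, -1, -1):  # walk the history backwards
        tags.append(history[step * b_bias + stn_idx * i_bias + tags[-1]])
    return list(reversed(tags))

# toy case: one sequence of length 3, two labels, two recorded steps
history = [0, 0, 1, 0]
print(backtrack(history, last_tag=1, stn_idx=0, stn_len=3, stn_num=1, labels_num=2))
# -> [0, 0, 1]
```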
44 | #[cfg(test)] 45 | mod tests { 46 | use super::viterbi_decode_postprocessing; 47 | use ndarray::{Array1, Array3}; 48 | use ndarray_npy::{NpzReader, ReadNpzError}; 49 | use std::fs::File; 50 | 51 | #[test] 52 | fn test_viterbi() -> Result<(), ReadNpzError> { 53 | let mut npz = NpzReader::new(File::open("test/viterbi.npz").unwrap())?; 54 | let srl_history: Array3<i64> = npz.by_name("srl_history.npy")?; 55 | let srl_last_tags: Array1<i64> = npz.by_name("srl_last_tags.npy")?; 56 | let word_nums: Array1<i64> = npz.by_name("word_nums.npy")?; 57 | let correct: Array1<i64> = npz.by_name("correct.npy")?; 58 | 59 | let label_num = srl_history.dim().2; 60 | let word_nums: Vec<usize> = word_nums.iter().map(|&x| x as usize).collect(); 61 | 62 | let output = viterbi_decode_postprocessing( 63 | srl_history.as_slice().unwrap(), 64 | srl_last_tags.as_slice().unwrap(), 65 | word_nums.as_slice(), 66 | label_num, 67 | ); 68 | 69 | let correct: Vec<i64> = correct.iter().map(|&x| x).collect(); 70 | let output: Vec<i64> = output.iter().flatten().map(|&x| x).collect(); 71 | 72 | assert_eq!(correct, output); 73 | 74 | Ok(()) 75 | } 76 | } 77 |
-------------------------------------------------------------------------------- /rust/ltp/test/eisner.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/rust/ltp/test/eisner.npz -------------------------------------------------------------------------------- /rust/ltp/test/viterbi.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ltp/6c6059d5ccad87dca003190ee7565af86e7e22a0/rust/ltp/test/viterbi.npz
-------------------------------------------------------------------------------- /rust/ltp/vendor/schema/cws.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "cws", 4 | "fields": [ 5 | { 6 | "name": "definition", 7 | "type": "record", 8 | "fields": [] 9 | }, 10 | { 11 | "name": "features", 12 | "type": "map", 13 | "values": "long", 14 | "default": {} 15 | }, 16 | { 17 | "name": "parameters", 18 | "type": "array", 19 | "items": "double", 20 | "default": [] 21 | } 22 | ] 23 | } 24 |
-------------------------------------------------------------------------------- /rust/ltp/vendor/schema/ner.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "ner", 4 | "fields": [ 5 | { 6 | "name": "definition", 7 | "type": "record", 8 | "fields": [ 9 | { 10 | "name": "to_labels", 11 | "type": "array", 12 | "items": "string", 13 | "default": [] 14 | }, 15 | { 16 | "name": "labels_to", 17 | "type": "map", 18 | "values": "long", 19 | "default": {} 20 | } 21 | ] 22 | }, 23 | { 24 | "name": "features", 25 | "type": "map", 26 | "values": "long", 27 | "default": {} 28 | }, 29 | { 30 | "name": "parameters", 31 | "type": "array", 32 | "items": "double", 33 | "default": [] 34 | } 35 | ] 36 | } 37 |
-------------------------------------------------------------------------------- /rust/ltp/vendor/schema/pos.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type": "record", 3 | "name": "pos", 4 | "fields": [ 5 | { 6 | "name": "definition", 7 | "type": "record", 8 | "fields": [ 9 | { 10 | "name": "to_labels", 11 | "type": "array", 12 | "items": "string", 13 | "default": [] 14 | }, 15 | { 16 | "name": "labels_to", 17 | "type": "map", 18 | "values": "long", 19 | "default": {} 20 | } 21 | ] 22 | }, 23 | { 24 | "name": "features", 25 | "type": "map", 26 | "values": "long", 27 | "default": {} 28 | }, 29 | { 30 | "name": "parameters", 31 | "type": "array", 32 | "items": "double", 33 | "default": [] 34 | } 35 | ] 36 | } 37 | --------------------------------------------------------------------------------
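Taken together, the three schemas above show that a serialized perceptron model carries just three things: a label inventory (`definition`), a feature-string-to-index map (`features`), and a flat array of `double` weights (`parameters`). A hedged sketch of inspecting such a file — this assumes the model file is a standard Avro object-container as written by the `apache-avro` crate, and uses the third-party `fastavro` package (not a project dependency) purely for illustration:

```python
from fastavro import reader  # third-party; illustration only

# model path borrowed from the examples above
with open("data/legacy-models/pos_model.bin", "rb") as f:
    for record in reader(f):  # one record per serialized model
        print(record["definition"]["to_labels"][:5])  # e.g. POS tag names
        print(len(record["features"]), "feature keys")
        print(len(record["parameters"]), "weights")
```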