├── .github ├── ISSUE_TEMPLATE.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── ci.yml ├── .gitignore ├── .gitmodules ├── CHANGES.rst ├── CMakeLists.txt ├── MANIFEST.in ├── README.md ├── doc ├── Makefile ├── api.rst ├── changelog.rst ├── conf.py ├── index.rst ├── install.rst ├── intro.rst ├── make.bat └── start.rst ├── example ├── example.py └── lexicon.txt ├── pyproject.toml ├── setup.py ├── src └── pyltp.cpp └── tests └── basic_test.py /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 在提问之前,请确认以下几点: 4 | - [ ] 如果您对算法或C++实现有问题,请在https://github.com/HIT-SCIR/ltp/issues提问 5 | - [ ] 由于您的问题可能与前任问题重复,在提交issue前,请您确认您已经搜索过之前的问题 6 | 7 | ## 问题*类型* 8 | 9 | 10 | ## 出错*场景* 11 | 12 | 13 | ## 代码片段 14 | 15 | ## 如何复现这一错误 16 | 17 | 18 | ## 运行环境 19 | 20 | 21 | ## 期望结果 22 | 23 | 24 | ## 其他 25 | 26 | 27 | 28 | 29 | 30 | Please ensure your issue adheres to the following guidelines: 31 | - [ ] If there is an algorithm or native (c++) problem. Go to https://github.com/HIT-SCIR/ltp/issues 32 | - [ ] Search previous issues before making a new one, as yours may be a duplicate. 33 | 34 | ## *What* is affected by this bug? 35 | 36 | 37 | ## *When* does this occur? 38 | 39 | 40 | ## *Where* on the code does it happen? 41 | 42 | 43 | ## *How* do we replicate the issue? 44 | 45 | 46 | ## Your environment information 47 | 48 | 49 | ## Expected behavior (i.e. solution) 50 | 51 | 52 | ## Other Comments 53 | 54 | 55 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Description Changes 4 | 5 | 6 | ## Motivation and Context 7 | 8 | 9 | 10 | ## How Has This Been Tested? 
11 | 12 | 13 | 14 | 15 | ## Screenshots (if appropriate): 16 | 17 | ## Types of changes 18 | 19 | - [ ] Bug fix (non-breaking change which fixes an issue) 20 | - [ ] New feature (non-breaking change which adds functionality) 21 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 22 | 23 | ## Checklist: 24 | 25 | 26 | - [ ] My code follows the code style of this project. 27 | - [ ] My change requires a change to the documentation. 28 | - [ ] I have updated the documentation accordingly. 29 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Wheels 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | push: 7 | branches: 8 | - master 9 | release: 10 | types: 11 | - published 12 | 13 | jobs: 14 | build_sdist: 15 | name: Build SDist 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v3 19 | with: 20 | submodules: true 21 | 22 | - name: Build SDist 23 | run: pipx run build --sdist 24 | 25 | - name: Check metadata 26 | run: pipx run twine check dist/* 27 | 28 | - uses: actions/upload-artifact@v3 29 | with: 30 | path: dist/*.tar.gz 31 | 32 | 33 | build_wheels_x64: 34 | name: x64 Wheels on ${{ matrix.os }} 35 | runs-on: ${{ matrix.os }} 36 | strategy: 37 | fail-fast: false 38 | matrix: 39 | os: [ 40 | ubuntu-latest, 41 | windows-latest, 42 | macos-latest 43 | ] 44 | 45 | steps: 46 | - uses: actions/checkout@v3 47 | with: 48 | submodules: true 49 | - name: Build Wheels 50 | uses: pypa/cibuildwheel@v2.8.0 51 | env: 52 | CIBW_SKIP: "*-musllinux_x86_64 *-musllinux_i686" 53 | CIBW_ARCHS_MACOS: auto 54 | CIBW_ARCHS_WINDOWS: auto64 55 | CIBW_ARCHS_LINUX: auto64 56 | 57 | - name: Upload wheels 58 | uses: actions/upload-artifact@v3 59 | with: 60 | path: wheelhouse/*.whl 61 | 62 | build_wheels_x86: 63 | name: x86 Wheels on ${{ matrix.os }} 64 | runs-on: ${{ 
matrix.os }} 65 | strategy: 66 | fail-fast: false 67 | matrix: 68 | os: [ 69 | ubuntu-latest, 70 | # windows-latest, 71 | # macos-latest 72 | ] 73 | 74 | steps: 75 | - uses: actions/checkout@v3 76 | with: 77 | submodules: true 78 | - name: Build Wheels 79 | uses: pypa/cibuildwheel@v2.8.0 80 | env: 81 | CIBW_SKIP: "*-musllinux_x86_64 *-musllinux_i686" 82 | CIBW_ARCHS_MACOS: universal2 83 | CIBW_ARCHS_WINDOWS: auto32 84 | CIBW_ARCHS_LINUX: auto32 85 | 86 | - name: Upload wheels 87 | uses: actions/upload-artifact@v3 88 | with: 89 | path: wheelhouse/*.whl 90 | 91 | build_wheels_musl: 92 | name: Musl Wheels on ${{ matrix.os }} 93 | runs-on: ${{ matrix.os }} 94 | strategy: 95 | fail-fast: false 96 | matrix: 97 | os: [ ubuntu-latest ] 98 | 99 | steps: 100 | - uses: actions/checkout@v3 101 | with: 102 | submodules: true 103 | - name: Build Wheels 104 | uses: pypa/cibuildwheel@v2.8.0 105 | env: 106 | CIBW_BUILD: "*-musllinux_x86_64 *-musllinux_i686" 107 | CIBW_ARCHS_MACOS: universal2 108 | CIBW_ARCHS_WINDOWS: auto32 109 | CIBW_ARCHS_LINUX: auto # x86_64 + i686, so both musllinux targets listed in CIBW_BUILD are built 110 | 111 | - name: Upload wheels 112 | uses: actions/upload-artifact@v3 113 | with: 114 | path: wheelhouse/*.whl 115 | 116 | 117 | upload_all: 118 | name: Upload if release 119 | needs: [ build_sdist, build_wheels_x64, build_wheels_x86, build_wheels_musl ] 120 | runs-on: ubuntu-latest 121 | if: github.event_name == 'release' && github.event.action == 'published' 122 | 123 | steps: 124 | - uses: actions/setup-python@v4 125 | with: 126 | python-version: "3.x" 127 | 128 | - uses: actions/download-artifact@v3 129 | with: 130 | name: artifact 131 | path: dist 132 | 133 | - uses: pypa/gh-action-pypi-publish@v1.5.0 134 | with: 135 | user: ${{ secrets.PYPI_USERNAME }} 136 | password: ${{ secrets.PYPI_PASSWORD }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ############### 2 | # object file # 3 | 
############### 4 | .* 5 | !.travis 6 | !.github 7 | 8 | ############### 9 | # build # 10 | ############### 11 | build 12 | 13 | ############### 14 | # output # 15 | ############### 16 | include/ 17 | lib/ 18 | bin/ 19 | dist 20 | !patch/include 21 | pyltp.egg-info 22 | 23 | *.swp 24 | doc/_build 25 | doc/_static 26 | doc/_templates 27 | !doc/Makefile 28 | 29 | ############### 30 | # data # 31 | ############### 32 | ltp_data 33 | 34 | cmake-* -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ltp"] 2 | path = ltp 3 | url = https://github.com/HIT-SCIR/ltp.git 4 | [submodule "pybind11"] 5 | path = pybind11 6 | url = https://github.com/pybind/pybind11.git 7 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | * 2022年07月23日 使用 修复编译失败的问题 2 | * 2020年07月30日 使用 Pybind11 生成 Python 绑定,减少维护困难 3 | * 2017年12月05日 升级更新兼容 LTP 3.4.0 -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | project(pyltp) 3 | 4 | set(CMAKE_CXX_STANDARD 14) # set before add_subdirectory() so the pybind11 and ltp targets inherit it 5 | add_subdirectory(pybind11) 6 | add_subdirectory(ltp) 7 | 8 | pybind11_add_module(pyltp src/pyltp.cpp) 9 | target_link_libraries( 10 | pyltp PRIVATE 11 | pybind11::module 12 | ner_static_lib 13 | parser_static_lib 14 | postagger_static_lib 15 | segmentor_static_lib 16 | splitsnt_static_lib 17 | srl_static_lib 18 | ) 19 | target_include_directories( 20 | pyltp PRIVATE 21 | ltp/include 22 | ) 23 | target_compile_definitions(pyltp 24 | PRIVATE VERSION_INFO=${EXAMPLE_VERSION_INFO}) 25 | -------------------------------------------------------------------------------- /MANIFEST.in: 
-------------------------------------------------------------------------------- 1 | include README.md 2 | include CHANGES.rst 3 | 4 | recursive-include src *.cpp 5 | recursive-include ltp/src *.h 6 | recursive-include ltp/src/framework *.h *.hpp 7 | recursive-include ltp/src/utils *.h *.hpp *.tab 8 | recursive-include ltp/src/segmentor *.cpp *.h *.hpp 9 | recursive-include ltp/src/postagger *.cpp *.h *.hpp 10 | recursive-include ltp/src/ner *.cpp *.h *.hpp 11 | recursive-include ltp/src/parser.n *.cpp *.h *.hpp 12 | recursive-include ltp/src/srl *.cpp *.h 13 | recursive-include ltp/thirdparty/boost *.h *.hpp *.cpp *.ipp 14 | recursive-include ltp/thirdparty/dynet * 15 | recursive-include ltp/thirdparty/eigen * 16 | recursive-include ltp/thirdparty/gtest * 17 | recursive-include ltp/thirdparty/jsoncpp * 18 | recursive-include ltp/thirdparty/maxent *.h *.cpp 19 | recursive-include ltp/thirdparty/tinythreadpp *.h *.cpp 20 | recursive-include ltp/thirdparty/tinyxml *.h *.cpp 21 | graft pybind11/include 22 | graft pybind11/tools 23 | global-include CMakeLists.txt *.cmake 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyltp 2 | 3 | [![PyPI Status](https://badge.fury.io/py/pyltp.svg)](https://badge.fury.io/py/pyltp) 4 | [![Readthedocs](https://readthedocs.org/projects/pyltp/badge/?version=latest)](http://pyltp.readthedocs.io/) 5 | [![Build Status](https://travis-ci.org/HIT-SCIR/pyltp.svg?branch=master)](https://travis-ci.org/HIT-SCIR/pyltp) 6 | [![Build status](https://ci.appveyor.com/api/projects/status/kp2kjujo4amunyvr/branch/master?svg=true)](https://ci.appveyor.com/project/Oneplus/pyltp/branch/master) 7 | [![PyPI Downloads](https://img.shields.io/pypi/dm/pyltp.svg)](https://pypi.python.org/pypi/pyltp) 8 | 9 | pyltp 是 [语言技术平台(Language Technology Platform, LTP)](https://github.com/HIT-SCIR/ltp)的 Python 封装。 10 | 11 | 在使用 pyltp 
之前,您需要简要了解 [语言技术平台(LTP)](http://ltp.readthedocs.org/zh_CN/latest/) 能否帮助您解决问题。 12 | 13 | **目前基于Pytorch的LTP4 已经发布,而PyLTP将会只有非常有限的维护,请大家移步使用**[LTP 4](https://github.com/HIT-SCIR/ltp) 14 | 15 | ## 依赖支持情况 16 | 17 | Python 2.7, 3.x, and PyPy (PyPy2.7 >= 5.7) 18 | 19 | ## 一个简单的例子 20 | 21 | 下面是一个使用 pyltp 进行分词的例子 22 | 23 | ```python 24 | # -*- coding: utf-8 -*- 25 | from pyltp import Segmentor 26 | segmentor = Segmentor("/path/to/your/cws/model") 27 | words = segmentor.segment("元芳你怎么看") 28 | print("|".join(words)) 29 | segmentor.release() 30 | ``` 31 | 除了分词之外,pyltp 还提供词性标注、命名实体识别、依存句法分析、语义角色标注等功能。 32 | 33 | 详细使用方法可以参考 [example](example/example.py) 34 | 35 | ## 安装 36 | 37 | * 第一步,安装 pyltp 38 | 39 | 使用 pip 安装 40 | 41 | ``` 42 | $ pip install pyltp 43 | ``` 44 | 或从源代码安装 45 | 46 | ``` 47 | $ git clone https://github.com/HIT-SCIR/pyltp 48 | $ cd pyltp 49 | $ git submodule init 50 | $ git submodule update 51 | $ python setup.py install 52 | ``` 53 | 54 | + Mac系统出现版本问题使用 MACOSX_DEPLOYMENT_TARGET=10.7 python setup.py install 55 | + 编译时间较长(约5分钟左右),请耐心等待 56 | 57 | * 第二步,下载模型文件 58 | 59 | [七牛云](http://ltp.ai/download.html),当前模型版本 3.4.0 60 | 61 | ## 制作安装包 62 | 63 | ``` 64 | git submodule init 65 | git submodule update 66 | python setup.py bdist_wheel 67 | ``` 68 | 69 | ## 版本对应 70 | 71 | * pyltp 版本:0.4.0 72 | * LTP 版本:3.4.0 73 | * 模型版本:3.4.0 74 | 75 | ## 作者 76 | 77 | * 冯云龙 << ylfeng@ir.hit.edu.cn >> 2020-7-30 重写代码,换用 Pybind11 78 | * 徐梓翔 << zxxu@ir.hit.edu.cn >> 2015-01-20 解决跨平台运行问题 79 | * 刘一佳 << yjliu@ir.hit.edu.cn >> 2014-06-12 重组项目 80 | * HuangFJ << biohfj@gmail.com >> 本项目最初作者 81 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 
5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don\'t have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " epub3 to make an epub3" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | 
@echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | .PHONY: clean 52 | clean: 53 | rm -rf $(BUILDDIR)/* 54 | 55 | .PHONY: html 56 | html: 57 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 58 | @echo 59 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 60 | 61 | .PHONY: dirhtml 62 | dirhtml: 63 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 64 | @echo 65 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 66 | 67 | .PHONY: singlehtml 68 | singlehtml: 69 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 70 | @echo 71 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 72 | 73 | .PHONY: pickle 74 | pickle: 75 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 76 | @echo 77 | @echo "Build finished; now you can process the pickle files." 78 | 79 | .PHONY: json 80 | json: 81 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 82 | @echo 83 | @echo "Build finished; now you can process the JSON files." 84 | 85 | .PHONY: htmlhelp 86 | htmlhelp: 87 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 88 | @echo 89 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 90 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
91 | 92 | .PHONY: qthelp 93 | qthelp: 94 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 95 | @echo 96 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 97 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 98 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pyltp.qhcp" 99 | @echo "To view the help file:" 100 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pyltp.qhc" 101 | 102 | .PHONY: applehelp 103 | applehelp: 104 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 105 | @echo 106 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 107 | @echo "N.B. You won't be able to view it unless you put it in" \ 108 | "~/Library/Documentation/Help or install it in your application" \ 109 | "bundle." 110 | 111 | .PHONY: devhelp 112 | devhelp: 113 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 114 | @echo 115 | @echo "Build finished." 116 | @echo "To view the help file:" 117 | @echo "# mkdir -p $$HOME/.local/share/devhelp/pyltp" 118 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pyltp" 119 | @echo "# devhelp" 120 | 121 | .PHONY: epub 122 | epub: 123 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 124 | @echo 125 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 126 | 127 | .PHONY: epub3 128 | epub3: 129 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 130 | @echo 131 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 132 | 133 | .PHONY: latex 134 | latex: 135 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 136 | @echo 137 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 138 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 139 | "(use \`make latexpdf' here to do that automatically)." 140 | 141 | .PHONY: latexpdf 142 | latexpdf: 143 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 144 | @echo "Running LaTeX files through pdflatex..." 
145 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 146 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 147 | 148 | .PHONY: latexpdfja 149 | latexpdfja: 150 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 151 | @echo "Running LaTeX files through platex and dvipdfmx..." 152 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 153 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 154 | 155 | .PHONY: text 156 | text: 157 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 158 | @echo 159 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 160 | 161 | .PHONY: man 162 | man: 163 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 164 | @echo 165 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 166 | 167 | .PHONY: texinfo 168 | texinfo: 169 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 170 | @echo 171 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 172 | @echo "Run \`make' in that directory to run these through makeinfo" \ 173 | "(use \`make info' here to do that automatically)." 174 | 175 | .PHONY: info 176 | info: 177 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 178 | @echo "Running Texinfo files through makeinfo..." 179 | $(MAKE) -C $(BUILDDIR)/texinfo info 180 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 181 | 182 | .PHONY: gettext 183 | gettext: 184 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 185 | @echo 186 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 187 | 188 | .PHONY: changes 189 | changes: 190 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 191 | @echo 192 | @echo "The overview file is in $(BUILDDIR)/changes." 
193 | 194 | .PHONY: linkcheck 195 | linkcheck: 196 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 197 | @echo 198 | @echo "Link check complete; look for any errors in the above output " \ 199 | "or in $(BUILDDIR)/linkcheck/output.txt." 200 | 201 | .PHONY: doctest 202 | doctest: 203 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 204 | @echo "Testing of doctests in the sources finished, look at the " \ 205 | "results in $(BUILDDIR)/doctest/output.txt." 206 | 207 | .PHONY: coverage 208 | coverage: 209 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 210 | @echo "Testing of coverage in the sources finished, look at the " \ 211 | "results in $(BUILDDIR)/coverage/python.txt." 212 | 213 | .PHONY: xml 214 | xml: 215 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 216 | @echo 217 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 218 | 219 | .PHONY: pseudoxml 220 | pseudoxml: 221 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 222 | @echo 223 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
224 | -------------------------------------------------------------------------------- /doc/api.rst: -------------------------------------------------------------------------------- 1 | 使用 pyltp 2 | =========== 3 | pyltp 是 `LTP `_ 的 Python 封装,提供了分词,词性标注,命名实体识别,依存句法分析,语义角色标注的功能。 4 | 5 | 关于各个模块任务的介绍、标注体系、性能指标,可以查阅 `这里 `_ 的介绍。 6 | 7 | 8 | 使用前请先下载完整模型 9 | ---------------------- 10 | 11 | 请先下载完整的 LTP 模型文件 12 | 13 | * 下载地址 - `百度云 `_ 14 | * 当前模型版本 - 3.4.0 15 | 16 | 请确保下载的模型版本与当前版本的 pyltp 对应,否则会导致程序无法正确加载模型。 17 | 18 | 请注意编码 19 | ---------- 20 | 21 | pyltp 的所有输入的分析文本和输出的结果的编码均为 UTF-8。 22 | 23 | 如果您以非 UTF-8 编码的文本输入进行分析,结果可能为空。请注意源代码文件的默认编码。 24 | 25 | 由于 Windows 终端采用 GBK 编码显示,直接输出 pyltp 的分析结果会在终端显示为乱码。您可以将标准输出重定向到文件,以 UTF8 方式查看文件,就可以解决显示乱码的问题。 26 | 27 | 28 | 分句 29 | ----- 30 | 31 | 使用 pyltp 进行分句示例如下 :: 32 | 33 | # -*- coding: utf-8 -*- 34 | from pyltp import SentenceSplitter 35 | sents = SentenceSplitter.split('元芳你怎么看?我就趴窗口上看呗!') # 分句 36 | print '\n'.join(sents) 37 | 38 | 结果如下 :: 39 | 40 | 元芳你怎么看? 41 | 我就趴窗口上看呗! 42 | 43 | 44 | 分词 45 | ----- 46 | 47 | 使用 pyltp 进行分词示例如下 :: 48 | 49 | # -*- coding: utf-8 -*- 50 | import os 51 | LTP_DATA_DIR = '/path/to/your/ltp_data' # ltp模型目录的路径 52 | cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') # 分词模型路径,模型名称为`cws.model` 53 | 54 | from pyltp import Segmentor 55 | segmentor = Segmentor() # 初始化实例 56 | segmentor.load(cws_model_path) # 加载模型 57 | words = segmentor.segment('元芳你怎么看') # 分词 58 | print '\t'.join(words) 59 | segmentor.release() # 释放模型 60 | 61 | 结果如下 :: 62 | 63 | 元芳 你 怎么 看 64 | 65 | :code:`words = segmentor.segment('元芳你怎么看')` 的返回值类型是native的VectorOfString类型,可以使用list转换成Python的列表类型,例如 :: 66 | 67 | ... 
68 | >>> words = segmentor.segment('元芳你怎么看') 69 | >>> type(words) 70 | 71 | >>> words_list = list(words) 72 | >>> type(words_list) 73 | 74 | >>> print words_list 75 | ['\xe5\xae\xa2\xe6\x9c\x8d', '\xe5\xa4\xaa', '\xe7\xb3\x9f\xe7\xb3\x95', '\xe4\xba\x86'] 76 | 77 | 使用分词外部词典 78 | ~~~~~~~~~~~~~~~~ 79 | 80 | pyltp 分词支持用户使用自定义词典。分词外部词典本身是一个文本文件(plain text),每行指定一个词,编码同样须为 UTF-8,样例如下所示 :: 81 | 82 | 苯并芘 83 | 亚硝酸盐 84 | 85 | 示例如下 :: 86 | 87 | # -*- coding: utf-8 -*- 88 | import os 89 | LTP_DATA_DIR = '/path/to/your/ltp_data' # ltp模型目录的路径 90 | cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') # 分词模型路径,模型名称为`cws.model` 91 | 92 | from pyltp import Segmentor 93 | segmentor = Segmentor() # 初始化实例 94 | segmentor.load_with_lexicon(cws_model_path, '/path/to/your/lexicon') # 加载模型,第二个参数是您的外部词典文件路径 95 | words = segmentor.segment('亚硝酸盐是一种化学物质') 96 | print '\t'.join(words) 97 | segmentor.release() 98 | 99 | 100 | 使用个性化分词模型 101 | ~~~~~~~~~~~~~~~~~~~ 102 | 103 | 个性化分词是 LTP 的特色功能。个性化分词为了解决测试数据切换到如小说、财经等不同于新闻领域的领域。 在切换到新领域时,用户只需要标注少量数据。 个性化分词会在原有新闻数据基础之上进行增量训练。 从而达到即利用新闻领域的丰富数据,又兼顾目标领域特殊性的目的。 104 | 105 | pyltp 支持使用用户训练好的个性化模型。关于个性化模型的训练需使用 LTP,详细介绍和训练方法请参考 `个性化分词 `_ 。 106 | 107 | 在 pyltp 中使用个性化分词模型的示例如下 :: 108 | 109 | # -*- coding: utf-8 -*- 110 | import os 111 | LTP_DATA_DIR = '/path/to/your/ltp_data' # ltp模型目录的路径 112 | cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') # 分词模型路径,模型名称为`cws.model` 113 | 114 | from pyltp import CustomizedSegmentor 115 | customized_segmentor = CustomizedSegmentor() # 初始化实例 116 | customized_segmentor.load(cws_model_path, '/path/to/your/customized_model') # 加载模型,第二个参数是您的增量模型路径 117 | words = customized_segmentor.segment('亚硝酸盐是一种化学物质') 118 | print '\t'.join(words) 119 | customized_segmentor.release() 120 | 121 | 同样,使用个性化分词模型的同时也可以使用外部词典 :: 122 | 123 | # -*- coding: utf-8 -*- 124 | import os 125 | LTP_DATA_DIR = '/path/to/your/ltp_data' # ltp模型目录的路径 126 | cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') # 分词模型路径,模型名称为`cws.model` 127 | 128 | from 
pyltp import CustomizedSegmentor 129 | customized_segmentor = CustomizedSegmentor() # 初始化实例 130 | customized_segmentor.load_with_lexicon(cws_model_path, '/path/to/your/customized_model', '/path/to/your/lexicon') # 加载模型 131 | words = customized_segmentor.segment('亚硝酸盐是一种化学物质') 132 | print '\t'.join(words) 133 | customized_segmentor.release() 134 | 135 | 136 | 词性标注 137 | -------- 138 | 139 | 使用 pyltp 进行词性标注示例如下 :: 140 | 141 | # -*- coding: utf-8 -*- 142 | import os 143 | LTP_DATA_DIR = '/path/to/your/ltp_data' # ltp模型目录的路径 144 | pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model') # 词性标注模型路径,模型名称为`pos.model` 145 | 146 | from pyltp import Postagger 147 | postagger = Postagger() # 初始化实例 148 | postagger.load(pos_model_path) # 加载模型 149 | 150 | words = ['元芳', '你', '怎么', '看'] # 分词结果 151 | postags = postagger.postag(words) # 词性标注 152 | 153 | print '\t'.join(postags) 154 | postagger.release() # 释放模型 155 | 156 | 结果如下 :: 157 | 158 | nh r r v 159 | 160 | 参数 :code:`words` 是分词模块的返回值,也支持Python原生的list类型,例如 :: 161 | 162 | words = ['元芳', '你', '怎么', '看'] 163 | postags = postagger.postag(words) 164 | 165 | LTP 使用 863 词性标注集,详细请参考 `词性标注集 `_ 。 166 | 167 | 使用词性标注外部词典 168 | ~~~~~~~~~~~~~~~~~~~~ 169 | 170 | pyltp 词性标注同样支持用户的外部词典。词性标注外部词典同样为一个文本文件,每行指定一个词,第一列指定单词,第二列之后指定该词的候选词性(可以有多项,每一项占一列),列与列之间用空格区分。示例如下 :: 171 | 172 | 雷人 v a 173 | 】 wp 174 | 175 | 命名实体识别 176 | ------------- 177 | 178 | 使用 pyltp 进行命名实体识别示例如下 :: 179 | 180 | # -*- coding: utf-8 -*- 181 | import os 182 | LTP_DATA_DIR = '/path/to/your/ltp_data' # ltp模型目录的路径 183 | ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model') # 命名实体识别模型路径,模型名称为`ner.model` 184 | 185 | from pyltp import NamedEntityRecognizer 186 | recognizer = NamedEntityRecognizer() # 初始化实例 187 | recognizer.load(ner_model_path) # 加载模型 188 | 189 | words = ['元芳', '你', '怎么', '看'] 190 | postags = ['nh', 'r', 'r', 'v'] 191 | netags = recognizer.recognize(words, postags) # 命名实体识别 192 | 193 | print '\t'.join(netags) 194 | recognizer.release() # 释放模型 195 | 196 | 
其中,:code:`words` 和 :code:`postags` 分别为分词和词性标注的结果。同样支持Python原生的list类型。 197 | 198 | 结果如下 :: 199 | 200 | S-Nh O O O 201 | 202 | LTP 采用 BIESO 标注体系。B 表示实体开始词,I表示实体中间词,E表示实体结束词,S表示单独成实体,O表示不构成命名实体。 203 | 204 | LTP 提供的命名实体类型为:人名(Nh)、地名(Ns)、机构名(Ni)。 205 | 206 | B、I、E、S位置标签和实体类型标签之间用一个横线 :code:`-` 相连;O标签后没有类型标签。 207 | 208 | 详细标注请参考 `命名实体识别标注集 `_ 。 209 | 210 | 依存句法分析 211 | ------------ 212 | 213 | 使用 pyltp 进行依存句法分析示例如下 :: 214 | 215 | # -*- coding: utf-8 -*- 216 | import os 217 | LTP_DATA_DIR = '/path/to/your/ltp_data' # ltp模型目录的路径 218 | par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model') # 依存句法分析模型路径,模型名称为`parser.model` 219 | 220 | from pyltp import Parser 221 | parser = Parser() # 初始化实例 222 | parser.load(par_model_path) # 加载模型 223 | 224 | words = ['元芳', '你', '怎么', '看'] 225 | postags = ['nh', 'r', 'r', 'v'] 226 | arcs = parser.parse(words, postags) # 句法分析 227 | 228 | print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs) 229 | parser.release() # 释放模型 230 | 231 | 232 | 其中,:code:`words` 和 :code:`postags` 分别为分词和词性标注的结果。同样支持Python原生的list类型。 233 | 234 | 结果如下 :: 235 | 236 | 4:SBV 4:SBV 4:ADV 0:HED 237 | 238 | :code:`arc.head` 表示依存弧的父节点词的索引。ROOT节点的索引是0,第一个词开始的索引依次为1、2、3... 
239 | 240 | :code:`arc.relation` 表示依存弧的关系。 241 | 242 | :code:`arc.head` 表示依存弧的父节点词的索引,:code:`arc.relation` 表示依存弧的关系。 243 | 244 | 标注集请参考 `依存句法关系 `_ 。 245 | 246 | 语义角色标注 247 | ------------- 248 | 249 | 使用 pyltp 进行语义角色标注示例如下 :: 250 | 251 | # -*- coding: utf-8 -*- 252 | import os 253 | LTP_DATA_DIR = '/path/to/your/ltp_data' # ltp模型目录的路径 254 | srl_model_path = os.path.join(LTP_DATA_DIR, 'srl') # 语义角色标注模型目录路径,模型目录为`srl`。注意该模型路径是一个目录,而不是一个文件。 255 | 256 | from pyltp import SementicRoleLabeller 257 | labeller = SementicRoleLabeller() # 初始化实例 258 | labeller.load(srl_model_path) # 加载模型 259 | 260 | words = ['元芳', '你', '怎么', '看'] 261 | postags = ['nh', 'r', 'r', 'v'] 262 | # arcs 使用依存句法分析的结果 263 | roles = labeller.label(words, postags, arcs) # 语义角色标注 264 | 265 | # 打印结果 266 | for role in roles: 267 | print role.index, "".join( 268 | ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]) 269 | labeller.release() # 释放模型 270 | 271 | 结果如下 :: 272 | 273 | 3 A0:(0,0)A0:(1,1)ADV:(2,2) 274 | 275 | 276 | 第一个词开始的索引依次为0、1、2... 
277 | 278 | 返回结果 :code:`roles` 是关于多个谓词的语义角色分析的结果。由于一句话中可能不含有语义角色,所以结果可能为空。 279 | 280 | :code:`role.index` 代表谓词的索引, :code:`role.arguments` 代表关于该谓词的若干语义角色。 281 | 282 | :code:`arg.name` 表示语义角色类型,:code:`arg.range.start` 表示该语义角色起始词位置的索引,:code:`arg.range.end` 表示该语义角色结束词位置的索引。 283 | 284 | 例如上面的例子,由于结果输出一行,所以“元芳你怎么看”有一组语义角色。 285 | 其谓词索引为3,即“看”。这个谓词有三个语义角色,范围分别是(0,0)即“元芳”,(1,1)即“你”,(2,2)即“怎么”,类型分别是A0、A0、ADV。 286 | 287 | :code:`arg.name` 表示语义角色关系,:code:`arg.range.start` 表示起始词位置,:code:`arg.range.end` 表示结束位置。 288 | 289 | 标注集请参考 `语义角色关系 `_ 。 290 | 291 | 语义依存分析 292 | ------------ 293 | 294 | pyltp 暂不提供语义依存分析功能。若需使用该功能,请使用 `语言云 `_ 。 295 | 296 | 297 | 完整示例代码 298 | ------------- 299 | 300 | 完整的示例代码可以参考 :file:`example/example.py` 。 301 | -------------------------------------------------------------------------------- /doc/changelog.rst: -------------------------------------------------------------------------------- 1 | * 2020年07月30日 使用 Pybind11 生成 Python 绑定,减少维护困难 2 | * 2017年12月05日 升级更新兼容 LTP 3.4.0 -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # pyltp documentation build configuration file, created by 4 | # sphinx-quickstart on Tue Mar 29 11:19:39 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
21 | #sys.path.insert(0, os.path.abspath('.')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [] 32 | 33 | # Add any paths that contain templates here, relative to this directory. 34 | templates_path = ['_templates'] 35 | 36 | # The suffix(es) of source filenames. 37 | # You can specify multiple suffix as a list of string: 38 | # source_suffix = ['.rst', '.md'] 39 | source_suffix = '.rst' 40 | 41 | # The encoding of source files. 42 | #source_encoding = 'utf-8-sig' 43 | 44 | # The master toctree document. 45 | master_doc = 'index' 46 | 47 | # General information about the project. 48 | project = u'pyltp' 49 | copyright = u'2017, HIT-SCIR' 50 | author = u'HIT-SCIR' 51 | 52 | # The version info for the project you're documenting, acts as replacement for 53 | # |version| and |release|, also used in various other places throughout the 54 | # built documents. 55 | # 56 | # The short X.Y version. 57 | version = u'0.2.0' 58 | # The full version, including alpha/beta/rc tags. 59 | release = u'0.2.0' 60 | 61 | # The language for content autogenerated by Sphinx. Refer to documentation 62 | # for a list of supported languages. 63 | # 64 | # This is also used if you do content translation via gettext catalogs. 65 | # Usually you set "language" from the command line for these cases. 66 | language = 'zh_CN' 67 | 68 | # There are two options for replacing |today|: either, you set today to some 69 | # non-false value, then it is used: 70 | #today = '' 71 | # Else, today_fmt is used as the format for a strftime call. 
72 | #today_fmt = '%B %d, %Y' 73 | 74 | # List of patterns, relative to source directory, that match files and 75 | # directories to ignore when looking for source files. 76 | # This patterns also effect to html_static_path and html_extra_path 77 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 78 | 79 | # The reST default role (used for this markup: `text`) to use for all 80 | # documents. 81 | #default_role = None 82 | 83 | # If true, '()' will be appended to :func: etc. cross-reference text. 84 | #add_function_parentheses = True 85 | 86 | # If true, the current module name will be prepended to all description 87 | # unit titles (such as .. function::). 88 | #add_module_names = True 89 | 90 | # If true, sectionauthor and moduleauthor directives will be shown in the 91 | # output. They are ignored by default. 92 | #show_authors = False 93 | 94 | # The name of the Pygments (syntax highlighting) style to use. 95 | pygments_style = 'sphinx' 96 | 97 | # A list of ignored prefixes for module index sorting. 98 | #modindex_common_prefix = [] 99 | 100 | # If true, keep warnings as "system message" paragraphs in the built documents. 101 | #keep_warnings = False 102 | 103 | # If true, `todo` and `todoList` produce output, else they produce nothing. 104 | todo_include_todos = False 105 | 106 | 107 | # -- Options for HTML output ---------------------------------------------- 108 | 109 | # The theme to use for HTML and HTML Help pages. See the documentation for 110 | # a list of builtin themes. 111 | html_theme = 'alabaster' 112 | 113 | # Theme options are theme-specific and customize the look and feel of a theme 114 | # further. For a list of options available for each theme, see the 115 | # documentation. 116 | #html_theme_options = {} 117 | 118 | # Add any paths that contain custom themes here, relative to this directory. 119 | #html_theme_path = [] 120 | 121 | # The name for this set of Sphinx documents. 122 | # " v documentation" by default. 
123 | #html_title = u'pyltp v0.1.9' 124 | 125 | # A shorter title for the navigation bar. Default is the same as html_title. 126 | #html_short_title = None 127 | 128 | # The name of an image file (relative to this directory) to place at the top 129 | # of the sidebar. 130 | #html_logo = None 131 | 132 | # The name of an image file (relative to this directory) to use as a favicon of 133 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 134 | # pixels large. 135 | #html_favicon = None 136 | 137 | # Add any paths that contain custom static files (such as style sheets) here, 138 | # relative to this directory. They are copied after the builtin static files, 139 | # so a file named "default.css" will overwrite the builtin "default.css". 140 | html_static_path = ['_static'] 141 | 142 | # Add any extra paths that contain custom files (such as robots.txt or 143 | # .htaccess) here, relative to this directory. These files are copied 144 | # directly to the root of the documentation. 145 | #html_extra_path = [] 146 | 147 | # If not None, a 'Last updated on:' timestamp is inserted at every page 148 | # bottom, using the given strftime format. 149 | # The empty string is equivalent to '%b %d, %Y'. 150 | #html_last_updated_fmt = None 151 | 152 | # If true, SmartyPants will be used to convert quotes and dashes to 153 | # typographically correct entities. 154 | #html_use_smartypants = True 155 | 156 | # Custom sidebar templates, maps document names to template names. 157 | #html_sidebars = {} 158 | 159 | # Additional templates that should be rendered to pages, maps page names to 160 | # template names. 161 | #html_additional_pages = {} 162 | 163 | # If false, no module index is generated. 164 | #html_domain_indices = True 165 | 166 | # If false, no index is generated. 167 | #html_use_index = True 168 | 169 | # If true, the index is split into individual pages for each letter. 
170 | #html_split_index = False 171 | 172 | # If true, links to the reST sources are added to the pages. 173 | #html_show_sourcelink = True 174 | 175 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 176 | #html_show_sphinx = True 177 | 178 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 179 | #html_show_copyright = True 180 | 181 | # If true, an OpenSearch description file will be output, and all pages will 182 | # contain a tag referring to it. The value of this option must be the 183 | # base URL from which the finished HTML is served. 184 | #html_use_opensearch = '' 185 | 186 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 187 | #html_file_suffix = None 188 | 189 | # Language to be used for generating the HTML full-text search index. 190 | # Sphinx supports the following languages: 191 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 192 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' 193 | #html_search_language = 'en' 194 | 195 | # A dictionary with options for the search language support, empty by default. 196 | # 'ja' uses this config value. 197 | # 'zh' user can custom change `jieba` dictionary path. 198 | #html_search_options = {'type': 'default'} 199 | 200 | # The name of a javascript file (relative to the configuration directory) that 201 | # implements a search results scorer. If empty, the default will be used. 202 | #html_search_scorer = 'scorer.js' 203 | 204 | # Output file base name for HTML help builder. 205 | htmlhelp_basename = 'pyltpdoc' 206 | 207 | # -- Options for LaTeX output --------------------------------------------- 208 | 209 | latex_elements = { 210 | # The paper size ('letterpaper' or 'a4paper'). 211 | #'papersize': 'letterpaper', 212 | 213 | # The font size ('10pt', '11pt' or '12pt'). 214 | #'pointsize': '10pt', 215 | 216 | # Additional stuff for the LaTeX preamble. 
217 | #'preamble': '', 218 | 219 | # Latex figure (float) alignment 220 | #'figure_align': 'htbp', 221 | } 222 | 223 | # Grouping the document tree into LaTeX files. List of tuples 224 | # (source start file, target name, title, 225 | # author, documentclass [howto, manual, or own class]). 226 | latex_documents = [ 227 | (master_doc, 'pyltp.tex', u'pyltp Documentation', 228 | u'HIT-SCIR', 'manual'), 229 | ] 230 | 231 | # The name of an image file (relative to this directory) to place at the top of 232 | # the title page. 233 | #latex_logo = None 234 | 235 | # For "manual" documents, if this is true, then toplevel headings are parts, 236 | # not chapters. 237 | #latex_use_parts = False 238 | 239 | # If true, show page references after internal links. 240 | #latex_show_pagerefs = False 241 | 242 | # If true, show URL addresses after external links. 243 | #latex_show_urls = False 244 | 245 | # Documents to append as an appendix to all manuals. 246 | #latex_appendices = [] 247 | 248 | # If false, no module index is generated. 249 | #latex_domain_indices = True 250 | 251 | 252 | # -- Options for manual page output --------------------------------------- 253 | 254 | # One entry per manual page. List of tuples 255 | # (source start file, name, description, authors, manual section). 256 | man_pages = [ 257 | (master_doc, 'pyltp', u'pyltp Documentation', 258 | [author], 1) 259 | ] 260 | 261 | # If true, show URL addresses after external links. 262 | #man_show_urls = False 263 | 264 | 265 | # -- Options for Texinfo output ------------------------------------------- 266 | 267 | # Grouping the document tree into Texinfo files. List of tuples 268 | # (source start file, target name, title, author, 269 | # dir menu entry, description, category) 270 | texinfo_documents = [ 271 | (master_doc, 'pyltp', u'pyltp Documentation', 272 | author, 'pyltp', 'One line description of project.', 273 | 'Miscellaneous'), 274 | ] 275 | 276 | # Documents to append as an appendix to all manuals. 
277 | #texinfo_appendices = [] 278 | 279 | # If false, no module index is generated. 280 | #texinfo_domain_indices = True 281 | 282 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 283 | #texinfo_show_urls = 'footnote' 284 | 285 | # If true, do not generate a @detailmenu in the "Top" node's menu. 286 | #texinfo_no_detailmenu = False 287 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. pyltp documentation master file, created by 2 | sphinx-quickstart on Tue Mar 29 11:19:39 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. include:: intro.rst 7 | 8 | 目录 9 | ================================= 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | install 15 | api 16 | changelog 17 | 18 | -------------------------------------------------------------------------------- /doc/install.rst: -------------------------------------------------------------------------------- 1 | 安装 pyltp 2 | =========== 3 | 4 | * 注:由于新版本增加了新的第三方依赖如dynet等,不再支持 windows 下 python2 环境。 5 | 6 | 使用 pip 安装 7 | ------------- 8 | 9 | 使用 pip 安装前,请确保您已安装了 `pip <https://pip.pypa.io/>`_ :: 10 | 11 | $ pip install pyltp 12 | 13 | 接下来,需要下载 LTP 模型文件。 14 | 15 | * 下载地址 - `模型下载 <http://ltp.ai/download.html>`_ 16 | * 当前模型版本 - 3.4.0 17 | * 注意在windows下 3.4.0 版本的 语义角色标注模块 模型需要单独下载,具体查看下载地址链接中的说明。 18 | 19 | 请确保下载的模型版本与当前版本的 pyltp 对应,否则会导致程序无法正确加载模型。 20 | 21 | 从源码安装 22 | ---------- 23 | 24 | 您也可以选择从源代码编译安装 :: 25 | 26 | $ git clone https://github.com/HIT-SCIR/pyltp && cd pyltp 27 | $ git submodule init 28 | $ git submodule update 29 | $ python setup.py install 30 | 31 | 安装完毕后,也需要下载相应版本的 LTP 模型文件。 32 | -------------------------------------------------------------------------------- /doc/intro.rst: -------------------------------------------------------------------------------- 1 | 欢迎使用 pyltp 2 | 3 | pyltp 是 `语言技术平台(Language
@ECHO OFF

REM Command file for Sphinx documentation
REM
REM Usage: make ^<target^>
REM   SPHINXBUILD  overrides the sphinx-build executable
REM   SPHINXOPTS   extra options passed to every sphinx-build call
REM   PAPER        a4 or letter, forwarded as latex_paper_size

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set BUILDDIR=_build
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
set I18NSPHINXOPTS=%SPHINXOPTS% .
if NOT "%PAPER%" == "" (
	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
)

if "%1" == "" goto help

if "%1" == "help" (
	:help
	echo.Please use `make ^<target^>` where ^<target^> is one of
	echo.  html       to make standalone HTML files
	echo.  dirhtml    to make HTML files named index.html in directories
	echo.  singlehtml to make a single large HTML file
	echo.  pickle     to make pickle files
	echo.  json       to make JSON files
	echo.  htmlhelp   to make HTML files and a HTML help project
	echo.  qthelp     to make HTML files and a qthelp project
	echo.  devhelp    to make HTML files and a Devhelp project
	echo.  epub       to make an epub
	echo.  epub3      to make an epub3
	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
	echo.  latexpdf   to make LaTeX files and run them through pdflatex
	echo.  latexpdfja to make LaTeX files and run them through platex/dvipdfmx
	echo.  text       to make text files
	echo.  man        to make manual pages
	echo.  texinfo    to make Texinfo files
	echo.  gettext    to make PO message catalogs
	echo.  changes    to make an overview over all changed/added/deprecated items
	echo.  xml        to make Docutils-native XML files
	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
	echo.  linkcheck  to check all external links for integrity
	echo.  doctest    to run all doctests embedded in the documentation if enabled
	echo.  coverage   to run coverage check of the documentation if enabled
	goto end
)

if "%1" == "clean" (
	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
	del /q /s %BUILDDIR%\*
	goto end
)


REM Check if sphinx-build is available and fallback to Python version if any
%SPHINXBUILD% 1>NUL 2>NUL
if errorlevel 9009 goto sphinx_python
goto sphinx_ok

:sphinx_python

REM `python -m sphinx` is the documented module form of sphinx-build.
set SPHINXBUILD=python -m sphinx
%SPHINXBUILD% 2> nul
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

:sphinx_ok


if "%1" == "html" (
	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
	goto end
)

if "%1" == "dirhtml" (
	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
	goto end
)

if "%1" == "singlehtml" (
	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
	goto end
)

if "%1" == "pickle" (
	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can process the pickle files.
	goto end
)

if "%1" == "json" (
	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can process the JSON files.
	goto end
)

if "%1" == "htmlhelp" (
	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
	goto end
)

if "%1" == "qthelp" (
	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pyltp.qhcp
	echo.To view the help file:
	REM qcollectiongenerator turns the .qhcp project into a .qhc collection.
	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pyltp.qhc
	goto end
)

if "%1" == "devhelp" (
	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished.
	goto end
)

if "%1" == "epub" (
	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The epub file is in %BUILDDIR%/epub.
	goto end
)

if "%1" == "epub3" (
	%SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The epub3 file is in %BUILDDIR%/epub3.
	goto end
)

if "%1" == "latex" (
	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
	goto end
)

if "%1" == "latexpdf" (
	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
	REM Stop here if the LaTeX sources could not be generated.
	if errorlevel 1 exit /b 1
	cd %BUILDDIR%/latex
	make all-pdf
	cd %~dp0
	echo.
	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
	goto end
)

if "%1" == "latexpdfja" (
	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
	REM Stop here if the LaTeX sources could not be generated.
	if errorlevel 1 exit /b 1
	cd %BUILDDIR%/latex
	make all-pdf-ja
	cd %~dp0
	echo.
	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
	goto end
)

if "%1" == "text" (
	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The text files are in %BUILDDIR%/text.
	goto end
)

if "%1" == "man" (
	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The manual pages are in %BUILDDIR%/man.
	goto end
)

if "%1" == "texinfo" (
	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
	goto end
)

if "%1" == "gettext" (
	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
	goto end
)

if "%1" == "changes" (
	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
	if errorlevel 1 exit /b 1
	echo.
	echo.The overview file is in %BUILDDIR%/changes.
	goto end
)

if "%1" == "linkcheck" (
	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
	if errorlevel 1 exit /b 1
	echo.
	echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
	goto end
)

if "%1" == "doctest" (
	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
	if errorlevel 1 exit /b 1
	echo.
	echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
	goto end
)

if "%1" == "coverage" (
	%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
	if errorlevel 1 exit /b 1
	echo.
	echo.Testing of coverage in the sources finished, look at the ^
results in %BUILDDIR%/coverage/python.txt.
	goto end
)

if "%1" == "xml" (
	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The XML files are in %BUILDDIR%/xml.
	goto end
)

if "%1" == "pseudoxml" (
	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
	goto end
)

:end
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""End-to-end demo of the pyltp pipeline.

Runs sentence splitting, word segmentation (plain, with a lexicon and with a
forced lexicon), POS tagging, dependency parsing, named entity recognition and
semantic role labelling on one sample sentence and prints each result.

Model files are read from ``MODELDIR``; point it at your own LTP model folder.
"""
import os
import sys

HERE = os.path.dirname(os.path.abspath(__file__))
ROOTDIR = os.path.join(HERE, os.pardir)
sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path

# Set your own model path
MODELDIR = os.path.join(ROOTDIR, "./ltp_data")

# The demo lexicon lives next to this script. Resolving it from the script
# location (like MODELDIR) keeps the example working from any working directory.
LEXICON_PATH = os.path.join(HERE, "lexicon.txt")


def format_arcs(arcs):
    """Render (head, relation) pairs as tab-separated ``head:relation`` items."""
    return "\t".join("%d:%s" % (head, relation) for (head, relation) in arcs)


def format_arguments(arguments):
    """Render SRL arguments ``(name, (start, end))`` as ``name: (start,end)`` items."""
    return " ".join("%s: (%d,%d)" % (name, start, end) for (name, (start, end)) in arguments)


def main():
    """Run every pipeline stage on the sample sentence and print the results."""
    # Imported here, after the sys.path adjustment above, so the "lib" entry
    # prepended to sys.path is searched first.
    from pyltp import (
        SentenceSplitter,
        Segmentor,
        Postagger,
        Parser,
        NamedEntityRecognizer,
        SementicRoleLabeller,
    )

    paragraph = '他叫汤姆去拿外衣。'
    cws_model = os.path.join(MODELDIR, "cws.model")

    # --------------------- Sentence splitting ------------------------
    sentence = SentenceSplitter.split(paragraph)[0]

    # -------------------- Context Manager -------------
    with Segmentor(cws_model) as s:
        words = s.segment(sentence)
        print("\t".join(words))

    # Every model created below is recorded here and released exactly once in
    # the ``finally`` clause, even when a later stage raises.
    loaded = []

    def track(model):
        loaded.append(model)
        return model

    try:
        # --------------------- Word segmentation ------------------------
        segmentor = track(Segmentor(cws_model))

        # With a lexicon: words the model split apart are merged.
        segmentor_with_vocab = track(Segmentor(cws_model, lexicon_path=LEXICON_PATH))

        # With a forced lexicon: additionally, words the model merged are split.
        segmentor_with_force_vocab = track(
            Segmentor(cws_model, force_lexicon_path=LEXICON_PATH)
        )

        words = segmentor.segment(sentence)
        print("\t".join(words))

        words_with_vocab = segmentor_with_vocab.segment(sentence)
        print("\t".join(words_with_vocab), "\t\t| With Vocab")

        words_with_force_vocab = segmentor_with_force_vocab.segment(sentence)
        print("\t".join(words_with_force_vocab), "\t| Force Vocab")

        # --------------------- POS tagging ------------------------
        postagger = track(Postagger(os.path.join(MODELDIR, "pos.model")))
        postags = postagger.postag(words)
        # list-of-string parameter is support in 0.1.5
        # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
        print("\t".join(postags))

        # --------------------- Dependency parsing ------------------------
        parser = track(Parser(os.path.join(MODELDIR, "parser.model")))
        arcs = parser.parse(words, postags)
        print(format_arcs(arcs))

        # --------------------- Named entity recognition ------------------------
        recognizer = track(NamedEntityRecognizer(os.path.join(MODELDIR, "ner.model")))
        netags = recognizer.recognize(words, postags)
        print("\t".join(netags))

        # --------------------- Semantic role labelling ------------------------
        labeller = track(SementicRoleLabeller(os.path.join(MODELDIR, "pisrl.model")))
        roles = labeller.label(words, postags, arcs)

        for index, arguments in roles:
            print(index, format_arguments(arguments))
    finally:
        for model in reversed(loaded):
            model.release()


if __name__ == '__main__':
    main()
import codecs
import os
import re
import sys
import subprocess

from setuptools import setup, Extension
from setuptools.command.build_ext import build_ext

# Convert distutils Windows platform specifiers to CMake -A arguments
PLAT_TO_CMAKE = {
    "win32": "Win32",
    "win-amd64": "x64",
    "win-arm32": "ARM",
    "win-arm64": "ARM64",
}

# Directory containing this setup.py; used to locate README.md regardless of
# the directory the build is started from.
HERE = os.path.dirname(os.path.abspath(__file__))


def _read_readme():
    """Return the text of README.md (UTF-8), closing the file afterwards."""
    with codecs.open(os.path.join(HERE, 'README.md'), encoding='utf-8') as readme:
        return readme.read()


class CMakeExtension(Extension):
    """Extension with no Python-listed sources; CMake builds it from ``sourcedir``."""

    def __init__(self, name, sourcedir=''):
        Extension.__init__(self, name, sources=[])
        self.sourcedir = os.path.abspath(sourcedir)


class CMakeBuild(build_ext):
    """``build_ext`` command that configures and builds an extension with CMake."""

    def build_extension(self, ext):
        """Run ``cmake`` (configure) and ``cmake --build`` for ``ext``.

        Honours the DEBUG, CMAKE_GENERATOR, CMAKE_ARGS, ARCHFLAGS and
        CMAKE_BUILD_PARALLEL_LEVEL environment variables. Raises
        ``subprocess.CalledProcessError`` when either cmake invocation fails.
        """
        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))

        # required for auto-detection & inclusion of auxiliary "native" libs
        if not extdir.endswith(os.path.sep):
            extdir += os.path.sep

        debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug
        cfg = "Debug" if debug else "Release"

        # CMake lets you override the generator - we need to check this.
        # Can be set with Conda-Build, for example.
        cmake_generator = os.environ.get("CMAKE_GENERATOR", "")

        # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
        # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code
        # from Python.
        cmake_args = [
            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}",
            f"-DPYTHON_EXECUTABLE={sys.executable}",
            f"-DCMAKE_BUILD_TYPE={cfg}",  # not used on MSVC, but no harm
        ]
        build_args = []
        # Adding CMake arguments set as environment variable
        # (needed e.g. to build for ARM OSx on conda-forge)
        if "CMAKE_ARGS" in os.environ:
            cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]

        # In this example, we pass in the version to C++. You might not need to.
        cmake_args += [f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"]

        if self.compiler.compiler_type != "msvc":
            # Using Ninja-build since it a) is available as a wheel and b)
            # multithreads automatically. MSVC would require all variables be
            # exported for Ninja to pick it up, which is a little tricky to do.
            # Users can override the generator with CMAKE_GENERATOR in CMake
            # 3.15+.
            if not cmake_generator or cmake_generator == "Ninja":
                try:
                    import ninja  # noqa: F401

                    ninja_executable_path = os.path.join(ninja.BIN_DIR, "ninja")
                    cmake_args += [
                        "-GNinja",
                        f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}",
                    ]
                except ImportError:
                    # Ninja is optional; fall back to CMake's default generator.
                    pass

        else:

            # Single config generators are handled "normally"
            single_config = any(x in cmake_generator for x in {"NMake", "Ninja"})

            # CMake allows an arch-in-generator style for backward compatibility
            contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"})

            # Specify the arch if using MSVC generator, but only if it doesn't
            # contain a backward-compatibility arch spec already in the
            # generator name.
            if not single_config and not contains_arch:
                cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]]

            # Multi-config generators have a different way to specify configs
            if not single_config:
                cmake_args += [
                    f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"
                ]
                build_args += ["--config", cfg]

        if sys.platform.startswith("darwin"):
            # Cross-compile support for macOS - respect ARCHFLAGS if set
            archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", ""))
            if archs:
                cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]

        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
        # across all generators.
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
            # self.parallel is a Python 3 only way to set parallel jobs by hand
            # using -j in the build_ext call, not supported by pip or PyPA-build.
            if hasattr(self, "parallel") and self.parallel:
                # CMake 3.12+ only.
                build_args += [f"-j{self.parallel}"]

        build_temp = os.path.join(self.build_temp, ext.name)
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(build_temp, exist_ok=True)

        subprocess.check_call(["cmake", ext.sourcedir] + cmake_args, cwd=build_temp)
        subprocess.check_call(["cmake", "--build", "."] + build_args, cwd=build_temp)


setup(
    name='pyltp',
    version='0.4.0',
    description='pyltp: the python extension for LTP 3',
    long_description=_read_readme(),
    long_description_content_type='text/markdown',
    author='Yijia Liu, Zixiang Xu, Yang Liu, Yunlong Feng',
    author_email='ylfeng@ir.hit.edu.cn',
    url='https://github.com/HIT-SCIR/pyltp',
    # Only versions allowed by python_requires below are advertised; this file
    # itself uses f-strings, so it cannot run on anything older than 3.6.
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3 :: Only',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        "Topic :: Software Development",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Topic :: Scientific/Engineering",
        "Topic :: Scientific/Engineering :: Information Analysis",
        "Topic :: Text Processing :: Linguistic",
    ],
    zip_safe=False,
    ext_modules=[CMakeExtension("pyltp")],
    extras_require={"test": ["pytest>=6.0"]},
    cmdclass={"build_ext": CMakeBuild},
    python_requires=">=3.6",
)
-------------------------------------------------------------------------------- /src/pyltp.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * pyltp - A python extension for Language Technology Platform 3 | * 4 | * @author: YunLong Feng 5 | * 6 | * This project forks from https://github.com/HuangFJ/pyltp. The basic structure 7 | * of the project is perserved. But interface is adopted from XML level to 8 | * library level to allow more flexible usage. 9 | * 10 | * @author: Yijia Liu 11 | * @author: Zixiang Xu 12 | * @author: Yang Liu 13 | * @author: YunLong Feng 14 | */ 15 | #include "ltp/ner_dll.h" 16 | #include "ltp/parser_dll.h" 17 | #include "ltp/postag_dll.h" 18 | #include "ltp/segment_dll.h" 19 | #include "ltp/SplitSentence.h" 20 | #include "ltp/srl_dll.h" 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #define STRINGIFY(x) #x 27 | #define MACRO_STRINGIFY(x) STRINGIFY(x) 28 | 29 | namespace py = pybind11; 30 | 31 | struct SentenceSplitter { 32 | SentenceSplitter() {} 33 | 34 | static std::vector split(const std::string ¶graph) { 35 | std::vector ret; 36 | SplitSentence(paragraph, ret); 37 | return ret; 38 | } 39 | }; 40 | 41 | struct Segmentor { 42 | Segmentor(const char *model_path, 43 | const char *lexicon_path = nullptr, 44 | const char *force_lexicon_file = nullptr) : model(NULL) { 45 | load(model_path, lexicon_path, force_lexicon_file); 46 | } 47 | 48 | std::vector segment(const std::string &sentence) { 49 | std::vector ret; 50 | if (model == NULL) { 51 | std::cerr << "Segmentor: Model not loaded!" 
<< std::endl; 52 | } else { 53 | segmentor_segment(model, sentence.c_str(), ret); 54 | } 55 | return ret; 56 | } 57 | 58 | void release() { 59 | if (model != NULL) { 60 | segmentor_release_segmentor(model); 61 | model = NULL; 62 | } 63 | } 64 | 65 | void *model; 66 | 67 | private: 68 | void load(const char *model_path, 69 | const char *lexicon_path = nullptr, 70 | const char *force_lexicon_file = nullptr) { 71 | if (model == NULL) { 72 | model = segmentor_create_segmentor(model_path, lexicon_path, force_lexicon_file); 73 | } else { 74 | std::cerr << "Segmentor: Model reloaded!" << std::endl; 75 | } 76 | } 77 | }; 78 | 79 | struct CustomizedSegmentor { 80 | CustomizedSegmentor(const char *base_model_path, 81 | const char *customized_model_path = nullptr, 82 | const char *lexicon_path = nullptr) : model(NULL) { 83 | load(base_model_path, customized_model_path, lexicon_path); 84 | } 85 | 86 | std::vector segment(const std::string &sentence) { 87 | std::vector ret; 88 | if (model == NULL) { 89 | std::cerr << "CustomizedSegmentor: Model not loaded!" << std::endl; 90 | } else { 91 | customized_segmentor_segment(model, sentence.c_str(), ret); 92 | } 93 | return ret; 94 | } 95 | 96 | void release() { 97 | if (model != NULL) { 98 | customized_segmentor_release_segmentor(model); 99 | model = NULL; 100 | } 101 | } 102 | 103 | void *model; 104 | 105 | private: 106 | void load(const char *base_model_path, 107 | const char *customized_model_path = nullptr, 108 | const char *lexicon_path = nullptr) { 109 | if (model == NULL) { 110 | model = customized_segmentor_create_segmentor(base_model_path, customized_model_path, lexicon_path); 111 | } else { 112 | std::cerr << "CustomizedSegmentor: Model reloaded!" 
<< std::endl; 113 | } 114 | } 115 | }; 116 | 117 | struct Postagger { 118 | Postagger(const char *model_path, const char *lexicon_path = nullptr) : model(NULL) { 119 | load(model_path, lexicon_path); 120 | } 121 | 122 | std::vector postag(const std::vector &words) { 123 | std::vector ret; 124 | if (model == NULL) { 125 | std::cerr << "Postagger: Model not loaded!" << std::endl; 126 | } else { 127 | postagger_postag(model, words, ret); 128 | } 129 | return ret; 130 | } 131 | 132 | void release() { 133 | if (model != NULL) { 134 | postagger_release_postagger(model); 135 | model = NULL; 136 | } 137 | } 138 | 139 | void *model; 140 | private: 141 | void load(const char *model_path, const char *lexicon_path = nullptr) { 142 | if (model == NULL) { 143 | model = 144 | postagger_create_postagger(model_path, lexicon_path); 145 | } else { 146 | std::cerr << "Postagger: Model reloaded!" << std::endl; 147 | } 148 | } 149 | }; 150 | 151 | typedef std::pair ParseResult; 152 | 153 | struct Parser { 154 | Parser(const char *model_path) : model(NULL) { 155 | load(model_path); 156 | } 157 | 158 | std::vector parse(const std::vector &words, 159 | const std::vector &postags) { 160 | std::vector ret; 161 | std::vector heads; 162 | std::vector relations; 163 | 164 | if (model == NULL) { 165 | std::cerr << "Parser: Model not loaded!" << std::endl; 166 | } else { 167 | parser_parse(model, words, postags, heads, relations); 168 | } 169 | 170 | for (std::size_t i = 0; i < heads.size(); ++i) { 171 | ret.push_back(ParseResult(heads[i], relations[i])); 172 | } 173 | return ret; 174 | } 175 | 176 | void release() { 177 | if (model != NULL) { 178 | parser_release_parser(model); 179 | model = NULL; 180 | } 181 | } 182 | 183 | void *model; 184 | private: 185 | void load(const char *model_path) { 186 | if (model == NULL) { 187 | model = parser_create_parser(model_path); 188 | } else { 189 | std::cerr << "Parser: Model reloaded!" 
<< std::endl; 190 | } 191 | } 192 | }; 193 | 194 | struct NamedEntityRecognizer { 195 | NamedEntityRecognizer(const char *model_path) : model(NULL) { 196 | load(model_path); 197 | } 198 | 199 | std::vector recognize(const std::vector &words, 200 | const std::vector &postags) { 201 | std::vector netags; 202 | if (model == NULL) { 203 | std::cerr << "NER: Model not loaded!" << std::endl; 204 | } else { 205 | ner_recognize(model, words, postags, netags); 206 | } 207 | return netags; 208 | } 209 | 210 | void release() { 211 | if (model != NULL) { 212 | ner_release_recognizer(model); 213 | model = NULL; 214 | } 215 | } 216 | 217 | void *model; 218 | private: 219 | void load(const char *model_path) { 220 | if (model == NULL) { 221 | model = ner_create_recognizer(model_path); 222 | } else { 223 | std::cerr << "NER: Model reloaded!" << std::endl; 224 | } 225 | } 226 | }; 227 | 228 | typedef std::pair ArgRange; 229 | typedef std::pair Arg; 230 | typedef std::pair> SementicRole; 231 | 232 | struct SementicRoleLabeller { 233 | SementicRoleLabeller(const char *model_path) : loaded(false) { 234 | load(model_path); 235 | } 236 | 237 | std::vector label(const std::vector &words, 238 | const std::vector &postags, 239 | const std::vector &parse) { 240 | std::vector ret; 241 | 242 | // Some trick 243 | std::vector tmp_parse(parse); 244 | for (std::size_t i = 0; i < tmp_parse.size(); ++i) { 245 | tmp_parse[i].first--; 246 | } 247 | if (!loaded) { 248 | std::cerr << "SRL: Model not loaded!" 
<< std::endl; 249 | } else { 250 | srl_dosrl(words, postags, tmp_parse, ret); 251 | } 252 | return ret; 253 | } 254 | 255 | void release() { 256 | if (loaded) { 257 | srl_release_resource(); 258 | } 259 | } 260 | 261 | bool loaded; 262 | private: 263 | void load(const char *model_path) { 264 | loaded = (srl_load_resource(model_path) == 0); 265 | } 266 | }; 267 | 268 | #ifdef SDPG 269 | #include "ltp/lstm_sdparser_dll.h" 270 | 271 | typedef std::pair SemanticArc; 272 | typedef std::vector SemanticNode; 273 | 274 | struct SDGraphParser { 275 | 276 | void load(const std::string &model_path) { 277 | if (model == NULL) { 278 | model = lstmsdparser_create_parser(model_path.c_str()); 279 | } else { 280 | std::cerr << "SDGraphParser: Model reloaded!" << std::endl; 281 | } 282 | } 283 | 284 | std::vector parse(const std::vector &words, 285 | const std::vector &postags) { 286 | std::vector> vecSemResult; 287 | std::vector ret; 288 | if (model == NULL) { 289 | std::cerr << "SDGraphParser: Model not loaded!" << std::endl; 290 | } else { 291 | lstmsdparser_parse(model, words, postags, vecSemResult); 292 | } 293 | for (int i = 0; i < vecSemResult.size(); i++) { 294 | SemanticNode node; 295 | for (int j = 0; j < vecSemResult[i].size(); j++) { 296 | if (vecSemResult[i][j] != "-NULL-") { 297 | node.push_back(SemanticArc( 298 | vecSemResult[i][j], j < vecSemResult[i].size() - 1 ? 
j + 1 : -1)); 299 | } 300 | } 301 | ret.push_back(node); 302 | } 303 | return ret; 304 | } 305 | 306 | void release() { 307 | if (model != NULL) { 308 | lstmsdparser_release_parser(model); 309 | model = NULL; 310 | } 311 | } 312 | 313 | void *model; 314 | }; 315 | #endif 316 | 317 | PYBIND11_MODULE(pyltp, m) { 318 | #ifdef VERSION_INFO 319 | m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO); 320 | #else 321 | m.attr("__version__") = "dev"; 322 | #endif 323 | 324 | py::class_(m, "SentenceSplitter") 325 | .def(py::init<>()) 326 | .def_static("split", &SentenceSplitter::split); 327 | 328 | py::class_(m, "Segmentor") 329 | .def( 330 | py::init(), 331 | "Init Segmentor", 332 | py::arg("model_path"), 333 | py::arg("lexicon_path") = nullptr, 334 | py::arg("force_lexicon_path") = nullptr 335 | ) 336 | .def("segment", &Segmentor::segment) 337 | .def("release", &Segmentor::release) 338 | .def("__enter__", 339 | [&](Segmentor &s) { return s; }, "Enter the runtime context related to this object") 340 | .def("__exit__", 341 | [&](Segmentor &s, py::object exc_type, py::object exc_value, py::object traceback) { s.release(); }, 342 | "Exit the runtime context related to this object"); 343 | 344 | py::class_(m, "CustomizedSegmentor") 345 | .def(py::init(), 346 | "Init CustomizedSegmentor", 347 | py::arg("base_model_path"), 348 | py::arg("customized_model_path") = nullptr, 349 | py::arg("lexicon_path") = nullptr 350 | ) 351 | .def("segment", &CustomizedSegmentor::segment) 352 | .def("release", &CustomizedSegmentor::release) 353 | .def("__enter__", 354 | [&](CustomizedSegmentor &s) { return s; }, "Enter the runtime context related to this object") 355 | .def("__exit__", 356 | [&](CustomizedSegmentor &s, py::object exc_type, py::object exc_value, py::object traceback) { s.release(); }, 357 | "Exit the runtime context related to this object"); 358 | 359 | py::class_(m, "Postagger") 360 | .def(py::init(), 361 | "Init Postagger", 362 | py::arg("model_path") = nullptr, 363 | 
py::arg("lexicon_path") = nullptr) 364 | .def("postag", &Postagger::postag) 365 | .def("release", &Postagger::release) 366 | .def("__enter__", 367 | [&](Postagger &s) { return s; }, "Enter the runtime context related to this object") 368 | .def("__exit__", 369 | [&](Postagger &s, py::object exc_type, py::object exc_value, py::object traceback) { s.release(); }, 370 | "Exit the runtime context related to this object"); 371 | 372 | py::class_(m, "Parser") 373 | .def(py::init()) 374 | .def("parse", &Parser::parse) 375 | .def("release", &Parser::release) 376 | .def("__enter__", 377 | [&](Parser &s) { return s; }, "Enter the runtime context related to this object") 378 | .def("__exit__", 379 | [&](Parser &s, py::object exc_type, py::object exc_value, py::object traceback) { s.release(); }, 380 | "Exit the runtime context related to this object"); 381 | 382 | py::class_(m, "NamedEntityRecognizer") 383 | .def(py::init()) 384 | .def("recognize", &NamedEntityRecognizer::recognize) 385 | .def("release", &NamedEntityRecognizer::release) 386 | .def("__enter__", 387 | [&](NamedEntityRecognizer &s) { return s; }, "Enter the runtime context related to this object") 388 | .def("__exit__", 389 | [&](NamedEntityRecognizer &s, py::object exc_type, py::object exc_value, py::object traceback) { s.release(); }, 390 | "Exit the runtime context related to this object"); 391 | 392 | py::class_(m, "SementicRoleLabeller") 393 | .def(py::init()) 394 | // .def("pi",&SementicRoleLabeller::pi) 395 | .def("label", &SementicRoleLabeller::label) 396 | .def("release", &SementicRoleLabeller::release) 397 | .def("__enter__", 398 | [&](SementicRoleLabeller &s) { return s; }, "Enter the runtime context related to this object") 399 | .def("__exit__", 400 | [&](SementicRoleLabeller &s, py::object exc_type, py::object exc_value, py::object traceback) { s.release(); }, 401 | "Exit the runtime context related to this object"); 402 | #ifdef SDPG 403 | py::class_(m, "SDGraphParser") 404 | .def(py::init<>()) 
405 | .def("load", &SDGraphParser::load) 406 | .def("parse",&SDGraphParser::parse) 407 | .def("release", &SDGraphParser::release) 408 | .def("__enter__", 409 | [&](SDGraphParser &s) { return s; }, "Enter the runtime context related to this object") 410 | .def("__exit__", 411 | [&](SDGraphParser &s, py::object exc_type, py::object exc_value, py::object traceback) { s.release(); }, 412 | "Exit the runtime context related to this object"); 413 | #endif 414 | } 415 | -------------------------------------------------------------------------------- /tests/basic_test.py: -------------------------------------------------------------------------------- 1 | def test_main(): 2 | assert True 3 | --------------------------------------------------------------------------------