├── requirements-doc.txt ├── pycrfsuite ├── __init__.py ├── trainer_wrapper.hpp ├── tagger_wrapper.hpp ├── trainer_wrapper.cpp ├── crfsuite_api.pxd ├── _dumpparser.py ├── _logparser.py └── _pycrfsuite.pyx ├── .flake8 ├── tests ├── test_misc.py ├── conftest.py ├── test_itemsequence.py ├── test_trainer.py ├── test_tagger.py └── test_logparser.py ├── .gitignore ├── .gitmodules ├── MANIFEST.in ├── tox.ini ├── docs ├── pycrfsuite.rst ├── index.rst ├── Makefile ├── make.bat └── conf.py ├── .github └── workflows │ ├── tests.yml │ └── build_and_upload.yml ├── LICENSE.txt ├── pyproject.toml ├── setup.py ├── README.rst ├── CHANGES.rst └── examples └── CoNLL 2002.ipynb /requirements-doc.txt: -------------------------------------------------------------------------------- 1 | numpydoc 2 | -------------------------------------------------------------------------------- /pycrfsuite/__init__.py: -------------------------------------------------------------------------------- 1 | from ._pycrfsuite import * 2 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=160 3 | extend-ignore = E203 -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | def test_version(): 2 | from pycrfsuite import CRFSUITE_VERSION 3 | 4 | assert bool(CRFSUITE_VERSION), CRFSUITE_VERSION 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *.html 4 | .tox 5 | dist 6 | build 7 | _build 8 | MANIFEST 9 | .ipynb_checkpoints 10 | conll2002-esp.crfsuite 11 | *.egg-info/ 12 | .cache -------------------------------------------------------------------------------- /.gitmodules: 
-------------------------------------------------------------------------------- 1 | [submodule "crfsuite"] 2 | path = crfsuite 3 | url = https://github.com/chokkan/crfsuite.git 4 | [submodule "liblbfgs"] 5 | path = liblbfgs 6 | url = https://github.com/chokkan/liblbfgs.git 7 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.txt 2 | include README.rst 3 | include CHANGES.rst 4 | include update_cpp.sh 5 | 6 | recursive-include crfsuite * 7 | recursive-include liblbfgs * 8 | recursive-include tests *.py 9 | recursive-include pycrfsuite *.py *.pxd *.pyx *.cpp *.hpp 10 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36,py37,py38,py39,py310,py311,py312 3 | 4 | [gh-actions] 5 | python = 6 | 3.6: py36 7 | 3.7: py37 8 | 3.8: py38 9 | 3.9: py39 10 | 3.10: py310 11 | 3.11: py311 12 | 3.12: py312 13 | 14 | [testenv] 15 | changedir = {envtmpdir} 16 | deps = 17 | pytest 18 | commands = 19 | py.test {toxinidir}/tests --doctest-modules {posargs} 20 | 21 | [testenv:manylinux] 22 | changedir = {envtmpdir} 23 | deps = 24 | pytest 25 | commands = 26 | py.test {toxinidir}/tests --doctest-modules {posargs} 27 | -------------------------------------------------------------------------------- /docs/pycrfsuite.rst: -------------------------------------------------------------------------------- 1 | .. _api-reference: 2 | 3 | API Reference 4 | ============= 5 | 6 | .. automodule:: pycrfsuite 7 | 8 | .. autoclass:: ItemSequence 9 | :members: 10 | :undoc-members: 11 | 12 | Training 13 | -------- 14 | 15 | .. autoclass:: Trainer 16 | :members: 17 | :undoc-members: 18 | :inherited-members: 19 | :show-inheritance: 20 | 21 | Tagging 22 | ------- 23 | 24 | .. 
autoclass:: Tagger 25 | :members: 26 | :undoc-members: 27 | 28 | Debugging 29 | --------- 30 | 31 | .. automodule:: pycrfsuite._dumpparser 32 | :members: ParsedDump 33 | :undoc-members: 34 | -------------------------------------------------------------------------------- /pycrfsuite/trainer_wrapper.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TRAINER_WRAPPER_H 2 | #define TRAINER_WRAPPER_H 1 3 | 4 | #include 5 | #include "crfsuite_api.hpp" 6 | 7 | struct _object; 8 | typedef _object PyObject; 9 | 10 | namespace CRFSuiteWrapper 11 | { 12 | 13 | typedef PyObject* (*messagefunc)(PyObject *self, std::string message); 14 | 15 | /** 16 | * A wrapper around CRFSuite::Trainer that allows overriding 17 | * 'message' method from Python. 18 | */ 19 | class Trainer : public CRFSuite::Trainer 20 | { 21 | protected: 22 | PyObject *m_obj; 23 | messagefunc handler; 24 | 25 | public: 26 | void set_handler(PyObject *obj, messagefunc handler); 27 | virtual void message(const std::string& msg); 28 | void _init_hack(); 29 | }; 30 | 31 | } 32 | #endif 33 | -------------------------------------------------------------------------------- /pycrfsuite/tagger_wrapper.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TAGGER_WRAPPER_H 2 | #define TAGGER_WRAPPER_H 1 3 | 4 | #include 5 | #include 6 | #include 7 | #include "crfsuite_api.hpp" 8 | 9 | 10 | namespace CRFSuiteWrapper 11 | { 12 | 13 | /** 14 | * A wrapper around CRFSuite::Tagger that allows to call 'dump' method 15 | * from Python. 
16 | */ 17 | class Tagger : public CRFSuite::Tagger 18 | { 19 | public: 20 | void dump(int fileno) 21 | { 22 | if (model == NULL) { 23 | throw std::runtime_error("Tagger is closed"); 24 | } 25 | 26 | FILE* file = fdopen(fileno, "w"); 27 | if (!file){ 28 | throw std::runtime_error("Can't open file"); 29 | } 30 | 31 | model->dump(model, file); 32 | 33 | if (fclose(file)){ 34 | throw std::runtime_error("Can't close file"); 35 | }; 36 | } 37 | }; 38 | 39 | } 40 | #endif 41 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: ["master"] 6 | pull_request: 7 | branches: ["master"] 8 | 9 | jobs: 10 | tests: 11 | name: "Python ${{ matrix.python-version }} ${{ matrix.os }}" 12 | runs-on: ${{ matrix.os }} 13 | 14 | strategy: 15 | matrix: 16 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 17 | os: [ubuntu-latest, macos-latest, windows-latest] 18 | exclude: 19 | - os: ubuntu-latest 20 | python-version: "3.6" 21 | fail-fast: false 22 | 23 | steps: 24 | - uses: "actions/checkout@v4" 25 | with: 26 | submodules: true 27 | - uses: "actions/setup-python@v4" 28 | with: 29 | python-version: "${{ matrix.python-version }}" 30 | - name: "Install dependencies" 31 | run: | 32 | python -VV 33 | python -m site 34 | python -m pip install --upgrade pip setuptools wheel 35 | python -m pip install --upgrade virtualenv tox tox-gh-actions 36 | 37 | - name: "Run tox targets for ${{ matrix.python-version }}" 38 | run: "python -m tox" 39 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2014-2017 ScrapingHub Inc. and contributors. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /pycrfsuite/trainer_wrapper.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "Python.h" 3 | #include "trainer_wrapper.hpp" 4 | #include 5 | 6 | namespace CRFSuiteWrapper 7 | { 8 | 9 | 10 | void Trainer::set_handler(PyObject *obj, messagefunc handler) 11 | { 12 | // We are not holding a reference to obj (no PY_INCREF) here 13 | // because doing so prevents __del__ from being called 14 | this->m_obj = obj; 15 | this->handler = handler; 16 | } 17 | 18 | 19 | void Trainer::message(const std::string& msg) 20 | { 21 | if (this->m_obj == NULL) { 22 | std::cerr << "** Trainer invalid state: obj [" << this->m_obj << "]\n"; 23 | return; 24 | } 25 | PyObject* result = handler(this->m_obj, msg); 26 | if (result == NULL){ 27 | // Python exception is raised in the handler. 28 | // Raise a C++ exception to stop training. 29 | // Cython will catch it and re-raise the previous Python exception 30 | // (which is the one raised in a handler). 31 | throw std::runtime_error("You shouldn't have seen this message!"); 32 | } 33 | } 34 | 35 | void Trainer::_init_hack() 36 | { 37 | Trainer::init(); 38 | } 39 | 40 | 41 | } 42 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | python-crfsuite 3 | =============== 4 | 5 | python-crfsuite is a python binding to CRFsuite_. 6 | 7 | .. _CRFsuite: https://github.com/chokkan/crfsuite 8 | 9 | 10 | Installation 11 | ============ 12 | 13 | :: 14 | 15 | pip install python-crfsuite 16 | 17 | Usage 18 | ===== 19 | 20 | * :ref:`api-reference` 21 | * Example_: building a Named Entity Recognition system with python-crfsuite. 22 | 23 | .. 
_Example: https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb 24 | 25 | python-crfsuite is licensed under MIT license. 26 | CRFsuite_ C/C++ library is licensed under BSD license. 27 | 28 | Development happens at github: https://github.com/scrapinghub/python-crfsuite 29 | 30 | .. toctree:: 31 | :hidden: 32 | 33 | pycrfsuite 34 | 35 | See Also 36 | ======== 37 | 38 | sklearn-crfsuite_ is a python-crfsuite wrapper which provides 39 | API similar to scikit-learn. 40 | 41 | .. _sklearn-crfsuite: https://github.com/TeamHG-Memex/sklearn-crfsuite 42 | 43 | 44 | Indices and tables 45 | ================== 46 | 47 | * :ref:`genindex` 48 | * :ref:`modindex` 49 | * :ref:`search` 50 | 51 | 52 | 53 | .. include:: ../CHANGES.rst 54 | 55 | 56 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture() 5 | def xseq(): 6 | return [ 7 | {"walk": 1, "shop": 0.5}, 8 | {"walk": 1}, 9 | {"walk": 1, "clean": 0.5}, 10 | {"shop": 0.5, "clean": 0.5}, 11 | {"walk": 0.5, "clean": 1}, 12 | {"clean": 1, "shop": 0.1}, 13 | {"walk": 1, "shop": 0.5}, 14 | {}, 15 | {"clean": 1}, 16 | {"солнце": "не светит".encode(), "clean": 1}, 17 | {"world": 2}, 18 | ] 19 | 20 | 21 | @pytest.fixture 22 | def yseq(): 23 | return [ 24 | "sunny", 25 | "sunny", 26 | "sunny", 27 | "rainy", 28 | "rainy", 29 | "rainy", 30 | "sunny", 31 | "sunny", 32 | "rainy", 33 | "rainy", 34 | "好", 35 | ] 36 | 37 | 38 | @pytest.fixture 39 | def model_filename(tmpdir, xseq, yseq): 40 | from pycrfsuite import Trainer 41 | 42 | trainer = Trainer("lbfgs", verbose=False) 43 | trainer.append(xseq, yseq) 44 | model_filename = str(tmpdir.join("model.crfsuite")) 45 | trainer.train(model_filename) 46 | return model_filename 47 | 48 | 49 | @pytest.fixture 50 | def model_bytes(model_filename): 51 | with open(model_filename, "rb") as f: 52 | 
return f.read() 53 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "python-crfsuite" 3 | version = "0.9.11" 4 | description = "Python binding for CRFsuite" 5 | authors = [ 6 | {name = "Terry Peng", email = "pengtaoo@gmail.com"}, 7 | {name = "Mikhail Korobov", email = "kmike84@gmail.com"}, 8 | ] 9 | readme = "README.rst" 10 | license = {text = "MIT License", url = "http://www.opensource.org/licenses/mit-license.php"} 11 | requires-python = ">=3.8" 12 | classifiers = [ 13 | "Development Status :: 4 - Beta", 14 | "Intended Audience :: Developers", 15 | "Intended Audience :: Science/Research", 16 | "License :: OSI Approved :: MIT License", 17 | "Programming Language :: Cython", 18 | "Programming Language :: Python", 19 | "Programming Language :: Python :: 3", 20 | "Programming Language :: Python :: 3.6", 21 | "Programming Language :: Python :: 3.7", 22 | "Programming Language :: Python :: 3.8", 23 | "Programming Language :: Python :: 3.9", 24 | "Programming Language :: Python :: 3.10", 25 | "Topic :: Software Development", 26 | "Topic :: Software Development :: Libraries :: Python Modules", 27 | "Topic :: Scientific/Engineering", 28 | "Topic :: Scientific/Engineering :: Information Analysis", 29 | "Topic :: Text Processing :: Linguistic", 30 | ] 31 | 32 | [project.urls] 33 | Homepage = "https://github.com/scrapinghub/python-crfsuite" 34 | 35 | [project.optional-dependencies] 36 | dev = ["tox", 37 | "black", 38 | "isort", 39 | "flake8", 40 | ] 41 | 42 | [build-system] 43 | requires = ["setuptools>=42", "wheel", "cython"] 44 | build-backend = "setuptools.build_meta" 45 | 46 | 47 | [tool.setuptools.packages.find] 48 | include = ["pycrfsuite"] 49 | 50 | 51 | [tool.pytest.ini_options] 52 | addopts = [ 53 | "--import-mode=importlib", 54 | ] 55 | testpaths = [ 56 | "tests", 57 | ] 58 | 59 | [tool.isort] 60 | profile = 
"black" 61 | src_paths = ["usaddress", "tests"] 62 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import glob 3 | import sys 4 | from distutils.command.build_ext import build_ext 5 | 6 | from Cython.Build import cythonize 7 | from setuptools import Extension, setup 8 | 9 | sources = ["pycrfsuite/_pycrfsuite.pyx", "pycrfsuite/trainer_wrapper.cpp"] 10 | 11 | # crfsuite 12 | sources += glob.glob("crfsuite/lib/crf/src/*.c") 13 | sources += glob.glob("crfsuite/swig/*.cpp") 14 | 15 | sources += ["crfsuite/lib/cqdb/src/cqdb.c"] 16 | sources += ["crfsuite/lib/cqdb/src/lookup3.c"] 17 | 18 | # lbfgs 19 | sources += glob.glob("liblbfgs/lib/*.c") 20 | 21 | includes = [ 22 | "crfsuite/include/", 23 | "crfsuite/lib/cqdb/include", 24 | "liblbfgs/include", 25 | "pycrfsuite/", 26 | ] 27 | 28 | 29 | class build_ext_check_gcc(build_ext): 30 | def build_extensions(self): 31 | c = self.compiler 32 | 33 | _compile = c._compile 34 | 35 | def c_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 36 | cc_args = ( 37 | cc_args + ["-D_POSIX_C_SOURCE=200112L"] 38 | if src.startswith("crfsuite/") 39 | else cc_args 40 | ) 41 | cc_args = cc_args + ["-std=c99"] if src.endswith(".c") else cc_args 42 | return _compile(obj, src, ext, cc_args, extra_postargs, pp_opts) 43 | 44 | if c.compiler_type == "unix" and any( 45 | item == "gcc" or item.endswith("-gcc") for item in c.compiler 46 | ): 47 | c._compile = c_compile 48 | 49 | elif self.compiler.compiler_type == "msvc": 50 | if sys.version_info[:2] < (3, 5): 51 | c.include_dirs.extend(["crfsuite/win32"]) 52 | 53 | build_ext.build_extensions(self) 54 | 55 | 56 | setup( 57 | ext_modules=cythonize( 58 | [ 59 | Extension( 60 | "pycrfsuite._pycrfsuite", 61 | include_dirs=includes, 62 | language="c++", 63 | sources=sorted(sources), 64 | ) 65 | ] 66 | ), 67 | cmdclass={"build_ext": 
build_ext_check_gcc}, 68 | ) 69 | -------------------------------------------------------------------------------- /.github/workflows/build_and_upload.yml: -------------------------------------------------------------------------------- 1 | name: Build and upload to PyPI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | release: 7 | types: 8 | - published 9 | 10 | jobs: 11 | build_wheels: 12 | name: "Build wheels on ${{ matrix.os }}" 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | os: [ubuntu-latest, macos-latest, windows-latest] 18 | steps: 19 | - uses: "actions/checkout@v4" 20 | with: 21 | submodules: true 22 | - name: "Set up QEMU" 23 | if: matrix.os == 'ubuntu-latest' 24 | uses: "docker/setup-qemu-action@v3" 25 | with: 26 | platforms: arm64 27 | - name: "Build wheels" 28 | uses: "pypa/cibuildwheel@v2.21.1" 29 | env: 30 | CIBW_SKIP: "pp*" # FIXME 31 | CIBW_ARCHS_LINUX: "auto aarch64" 32 | CIBW_TEST_REQUIRES: "pytest" 33 | CIBW_TEST_COMMAND: "pytest {project}/tests --doctest-modules" 34 | - uses: "actions/upload-artifact@v4" 35 | with: 36 | path: "./wheelhouse/*.whl" 37 | name: wheel-${{ matrix.os }} 38 | 39 | make_sdist: 40 | name: "Build source distribution" 41 | runs-on: ubuntu-latest 42 | steps: 43 | - uses: "actions/checkout@v4" 44 | with: 45 | submodules: true 46 | - name: "Build source distribution" 47 | run: "pipx run build --sdist" 48 | - uses: "actions/upload-artifact@v4" 49 | with: 50 | path: "./dist/*.tar.gz" 51 | name: sdist 52 | 53 | upload_to_pypi: 54 | name: "Upload to PyPI" 55 | runs-on: ubuntu-latest 56 | if: github.event_name == 'release' && github.event.action == 'published' 57 | needs: 58 | - build_wheels 59 | - make_sdist 60 | steps: 61 | - uses: "actions/download-artifact@v4" 62 | with: 63 | path: dist 64 | merge-multiple: true 65 | - uses: "pypa/gh-action-pypi-publish@v1.13.0" 66 | with: 67 | user: __token__ 68 | password: ${{ secrets.PYPI_TOKEN }} 69 | print_hash: true 70 | verbose: true 71 | skip_existing: 
true 72 | -------------------------------------------------------------------------------- /tests/test_itemsequence.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import pycrfsuite 4 | 5 | 6 | def test_basic(): 7 | seq = pycrfsuite.ItemSequence([]) 8 | assert len(seq) == 0 9 | assert seq.items() == [] 10 | 11 | 12 | def test_lists(): 13 | seq = pycrfsuite.ItemSequence([["foo", "bar"], ["bar", "baz"]]) 14 | assert len(seq) == 2 15 | assert seq.items() == [{"foo": 1.0, "bar": 1.0}, {"bar": 1.0, "baz": 1.0}] 16 | assert pycrfsuite.ItemSequence(seq.items()).items() == seq.items() 17 | 18 | 19 | def test_dicts(): 20 | seq = pycrfsuite.ItemSequence( 21 | [ 22 | {"foo": True, "bar": {"foo": -1, "baz": False}}, 23 | ] 24 | ) 25 | assert len(seq) == 1 26 | assert seq.items() == [{"foo": 1.0, "bar:foo": -1, "bar:baz": 0.0}] 27 | 28 | 29 | def test_unicode(): 30 | seq = pycrfsuite.ItemSequence( 31 | [ 32 | {"foo": "привет", "ключ": 1.0, "привет": "мир"}, 33 | ] 34 | ) 35 | assert seq.items() == [{"foo:привет": 1.0, "ключ": 1.0, "привет:мир": 1.0}] 36 | 37 | 38 | @pytest.mark.xfail() 39 | def test_bad(): 40 | with pytest.raises(ValueError): 41 | seq = pycrfsuite.ItemSequence("foo") 42 | print(seq.items()) 43 | 44 | with pytest.raises(ValueError): 45 | seq = pycrfsuite.ItemSequence([[{"foo": "bar"}]]) 46 | print(seq.items()) 47 | 48 | 49 | def test_nested(): 50 | seq = pycrfsuite.ItemSequence( 51 | [ 52 | { 53 | "foo": { 54 | "bar": "baz", 55 | "spam": 0.5, 56 | "egg": ["x", "y"], 57 | "ham": {"x": -0.5, "y": -0.1}, 58 | }, 59 | }, 60 | { 61 | "foo": {"bar": "ham", "spam": -0.5, "ham": {"x", "y"}}, 62 | }, 63 | ] 64 | ) 65 | assert len(seq) == 2 66 | assert seq.items() == [ 67 | { 68 | "foo:bar:baz": 1.0, 69 | "foo:spam": 0.5, 70 | "foo:egg:x": 1.0, 71 | "foo:egg:y": 1.0, 72 | "foo:ham:x": -0.5, 73 | "foo:ham:y": -0.1, 74 | }, 75 | { 76 | "foo:bar:ham": 1.0, 77 | "foo:spam": -0.5, 78 | "foo:ham:x": 1.0, 79 | 
"foo:ham:y": 1.0, 80 | }, 81 | ] 82 | assert pycrfsuite.ItemSequence(seq.items()).items() == seq.items() 83 | -------------------------------------------------------------------------------- /pycrfsuite/crfsuite_api.pxd: -------------------------------------------------------------------------------- 1 | from libcpp.string cimport string 2 | from libcpp.vector cimport vector 3 | 4 | cdef extern from "../crfsuite/include/crfsuite.h": 5 | ctypedef enum: 6 | CRFSUITE_SUCCESS 7 | CRFSUITEERR_UNKNOWN # Unknown error occurred. 8 | CRFSUITEERR_OUTOFMEMORY # Insufficient memory. 9 | CRFSUITEERR_NOTSUPPORTED # Unsupported operation. 10 | CRFSUITEERR_INCOMPATIBLE # Incompatible data. 11 | CRFSUITEERR_INTERNAL_LOGIC # Internal error. 12 | CRFSUITEERR_OVERFLOW # Overflow. 13 | CRFSUITEERR_NOTIMPLEMENTED # Not implemented. 14 | 15 | 16 | cdef extern from "../crfsuite/include/crfsuite_api.hpp" namespace "CRFSuite": 17 | cdef cppclass Attribute: 18 | string attr 19 | double value 20 | 21 | Attribute() 22 | Attribute(string) 23 | Attribute(string, double) 24 | 25 | ctypedef vector[Attribute] Item 26 | ctypedef vector[Item] ItemSequence 27 | ctypedef vector[string] StringList 28 | 29 | cdef string version() 30 | 31 | 32 | cdef extern from "trainer_wrapper.hpp" namespace "CRFSuiteWrapper": 33 | 34 | ctypedef object (*messagefunc)(object self, string message) 35 | 36 | cdef cppclass Trainer: 37 | Trainer() except + 38 | void set_handler(object, messagefunc) except + 39 | void clear() except + 40 | void append(ItemSequence, StringList, int) except + 41 | bint select(string, string) except + 42 | int train(string, int) except + 43 | StringList params() except + 44 | void set(string, string) except + 45 | string get(string) except + 46 | string help(string) except + 47 | void _init_hack() except + 48 | 49 | 50 | cdef extern from "tagger_wrapper.hpp" namespace "CRFSuiteWrapper": 51 | 52 | ctypedef object (*messagefunc)(object self, string message) 53 | 54 | cdef cppclass Tagger: 55 | 
Tagger() except + 56 | int open(string) except + 57 | int open(const void*, size_t) except + 58 | void close() except + 59 | StringList labels() except + 60 | StringList tag(ItemSequence) except + 61 | void set(ItemSequence) except + 62 | StringList viterbi() except + 63 | double probability(StringList) except + 64 | double marginal(string, int) except + 65 | void dump(int) except + 66 | void dump2() except + 67 | -------------------------------------------------------------------------------- /pycrfsuite/_dumpparser.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class ParsedDump: 5 | """ 6 | CRFsuite model parameters. Objects of this type are returned by 7 | :meth:`pycrfsuite.Tagger.info()` method. 8 | 9 | Attributes 10 | ---------- 11 | 12 | transitions : dict 13 | ``{(from_label, to_label): weight}`` dict with learned transition weights 14 | 15 | state_features : dict 16 | ``{(attribute, label): weight}`` dict with learned ``(attribute, label)`` weights 17 | 18 | header : dict 19 | Metadata from the file header 20 | 21 | labels : dict 22 | ``{name: internal_id}`` dict with model labels 23 | 24 | attributes : dict 25 | ``{name: internal_id}`` dict with known attributes 26 | 27 | """ 28 | 29 | def __init__(self): 30 | self.header = {} 31 | self.labels = {} 32 | self.attributes = {} 33 | self.transitions = {} 34 | self.state_features = {} 35 | 36 | 37 | class CRFsuiteDumpParser: 38 | """ 39 | A hack: parser for `crfsuite dump` results. 40 | 41 | Obtaining coefficients "the proper way" is quite hard otherwise 42 | because in CRFsuite they are hidden in private structures. 43 | """ 44 | 45 | def __init__(self): 46 | self.state = None 47 | self.result = ParsedDump() 48 | 49 | def feed(self, line): 50 | # Strip initial ws and line terminator, but allow for ws at the end of feature names. 
51 | line = line.lstrip().rstrip("\r\n") 52 | if not line: 53 | return 54 | 55 | m = re.match( 56 | r"(FILEHEADER|LABELS|ATTRIBUTES|TRANSITIONS|STATE_FEATURES) = {", line 57 | ) 58 | if m: 59 | self.state = m.group(1) 60 | elif line == "}": 61 | self.state = None 62 | else: 63 | getattr(self, "parse_%s" % self.state)(line) 64 | 65 | def parse_FILEHEADER(self, line): 66 | m = re.match(r"(\w+): (.*)", line) 67 | self.result.header[m.group(1)] = m.group(2) 68 | 69 | def parse_LABELS(self, line): 70 | m = re.match(r"(\d+): (.*)", line) 71 | self.result.labels[m.group(2)] = m.group(1) 72 | 73 | def parse_ATTRIBUTES(self, line): 74 | m = re.match(r"(\d+): (.*)", line) 75 | self.result.attributes[m.group(2)] = m.group(1) 76 | 77 | def parse_TRANSITIONS(self, line): 78 | m = re.match(r"\(\d+\) (.+) --> (.+): ([+-]?\d+\.\d+)", line) 79 | from_, to_ = m.group(1), m.group(2) 80 | assert from_ in self.result.labels 81 | assert to_ in self.result.labels 82 | self.result.transitions[(from_, to_)] = float(m.group(3)) 83 | 84 | def parse_STATE_FEATURES(self, line): 85 | m = re.match(r"\(\d+\) (.+) --> (.+): ([+-]?\d+\.\d+)", line) 86 | attr, label = m.group(1), m.group(2) 87 | assert attr in self.result.attributes 88 | assert label in self.result.labels 89 | self.result.state_features[(attr, label)] = float(m.group(3)) 90 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | python-crfsuite 3 | =============== 4 | 5 | .. image:: https://github.com/scrapinghub/python-crfsuite/actions/workflows/tests.yml/badge.svg 6 | :target: https://github.com/scrapinghub/python-crfsuite/actions/workflows/tests.yml 7 | 8 | .. image:: https://img.shields.io/pypi/v/python-crfsuite.svg?style=flat-square 9 | :target: https://pypi.python.org/pypi/python-crfsuite 10 | :alt: pypi Version 11 | 12 | .. 
image:: https://anaconda.org/conda-forge/python-crfsuite/badges/version.svg 13 | :target: https://anaconda.org/conda-forge/python-crfsuite 14 | :alt: conda Version 15 | 16 | python-crfsuite is a python binding to CRFsuite_. 17 | 18 | Installation 19 | ============ 20 | 21 | Using ``pip``:: 22 | 23 | pip install python-crfsuite 24 | 25 | Using ``conda``:: 26 | 27 | conda install -c conda-forge python-crfsuite 28 | 29 | Usage 30 | ===== 31 | 32 | See docs_ and an example_. 33 | 34 | .. _docs: http://python-crfsuite.rtfd.org/ 35 | .. _example: https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb 36 | 37 | See Also 38 | ======== 39 | 40 | sklearn-crfsuite_ is a python-crfsuite wrapper which provides 41 | API similar to scikit-learn. 42 | 43 | .. _sklearn-crfsuite: https://github.com/TeamHG-Memex/sklearn-crfsuite 44 | 45 | Contributing 46 | ============ 47 | 48 | * Source code: https://github.com/scrapinghub/python-crfsuite 49 | * Issue tracker: https://github.com/scrapinghub/python-crfsuite/issues 50 | 51 | Feel free to submit ideas, bugs reports, pull requests or regular patches. 52 | 53 | Please don't commit generated cpp files in the same commit as other files. 54 | 55 | .. _Cython: http://cython.org/ 56 | .. _tox: http://tox.testrun.org 57 | 58 | Authors and Contributors 59 | ======================== 60 | 61 | Original authors are Terry Peng and 62 | Mikhail Korobov . Many other people contributed; 63 | some of them can be found at github Contributors_ page. 64 | 65 | Bundled CRFSuite_ C/C++ library is by Naoaki Okazaki & contributors. 66 | 67 | .. _Contributors: https://github.com/scrapinghub/python-crfsuite/graphs/contributors 68 | 69 | License 70 | ======= 71 | 72 | python-crfsuite is licensed under MIT license. 73 | CRFsuite_ library is licensed under BSD license. 74 | 75 | .. 
_CRFsuite: https://github.com/chokkan/crfsuite 76 | 77 | Alternatives 78 | ============ 79 | 80 | * https://github.com/chokkan/crfsuite/tree/master/swig/python - official 81 | Python wrapper, exposes C++ API using SWIG. 82 | * https://github.com/jakevdp/pyCRFsuite - uses C API instead of C++ API; 83 | allows to use scipy sparse matrices as an input. At the time of writing 84 | it is unmaintained. 85 | * https://github.com/bosondata/crfsuite-rs - uses a Rust wrapper with CFFI instead of C++ API; 86 | allows to tag with GIL released for better performance. 87 | 88 | This package (python-crfsuite) wraps CRFsuite C++ API using Cython. 89 | It is faster than official SWIG wrapper and has a simpler codebase than 90 | a more advanced pyCRFsuite. python-crfsuite works in Python 2 and Python 3, 91 | doesn't have external dependencies (CRFsuite is bundled, numpy/scipy stack 92 | is not needed) and workarounds some of the issues with C++ CRFsuite library. 93 | -------------------------------------------------------------------------------- /tests/test_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from pycrfsuite import Trainer 6 | 7 | 8 | def test_trainer(tmpdir, xseq, yseq): 9 | trainer = Trainer("lbfgs") 10 | trainer.append(xseq, yseq) 11 | 12 | model_filename = str(tmpdir.join("model.crfsuite")) 13 | assert not os.path.isfile(model_filename) 14 | trainer.train(model_filename) 15 | assert os.path.isfile(model_filename) 16 | 17 | 18 | def test_trainer_noselect(tmpdir, xseq, yseq): 19 | # This shouldn't segfault; see https://github.com/chokkan/crfsuite/pull/21 20 | trainer = Trainer() 21 | trainer.append(xseq, yseq) 22 | model_filename = str(tmpdir.join("model.crfsuite")) 23 | trainer.train(model_filename) 24 | 25 | 26 | def test_trainer_noappend(tmpdir): 27 | # This shouldn't segfault; see https://github.com/chokkan/crfsuite/pull/21 28 | trainer = Trainer() 29 | trainer.select("lbfgs") 
30 | model_filename = str(tmpdir.join("model.crfsuite")) 31 | trainer.train(model_filename) 32 | 33 | 34 | def test_trainer_noselect_noappend(tmpdir): 35 | # This shouldn't segfault; see https://github.com/chokkan/crfsuite/pull/21 36 | trainer = Trainer() 37 | model_filename = str(tmpdir.join("model.crfsuite")) 38 | trainer.train(model_filename) 39 | 40 | 41 | def test_training_messages(tmpdir, xseq, yseq): 42 | class CapturingTrainer(Trainer): 43 | def __init__(self): 44 | self.messages = [] 45 | 46 | def message(self, message): 47 | self.messages.append(message) 48 | 49 | trainer = CapturingTrainer() 50 | trainer.select("lbfgs") 51 | trainer.append(xseq, yseq) 52 | assert not trainer.messages 53 | 54 | model_filename = str(tmpdir.join("model.crfsuite")) 55 | trainer.train(model_filename) 56 | assert trainer.messages 57 | assert "type: CRF1d\n" in trainer.messages 58 | # print("".join(trainer.messages)) 59 | 60 | 61 | def test_training_messages_exception(tmpdir, xseq, yseq): 62 | class MyException(Exception): 63 | pass 64 | 65 | class BadTrainer(Trainer): 66 | def message(self, message): 67 | raise MyException("error") 68 | 69 | trainer = BadTrainer() 70 | trainer.select("lbfgs") 71 | trainer.append(xseq, yseq) 72 | 73 | model_filename = str(tmpdir.join("model.crfsuite")) 74 | 75 | with pytest.raises(MyException): 76 | trainer.train(model_filename) 77 | 78 | 79 | def test_trainer_select_raises_error(): 80 | trainer = Trainer() 81 | with pytest.raises(ValueError): 82 | trainer.select("foo") 83 | 84 | 85 | @pytest.mark.parametrize( 86 | "algo", 87 | [ 88 | "lbfgs", 89 | "l2sgd", 90 | "ap", 91 | "averaged-perceptron", 92 | "pa", 93 | "passive-aggressive", 94 | "arow", 95 | ], 96 | ) 97 | def test_algorithm_parameters(algo): 98 | trainer = Trainer(algo) 99 | params = trainer.get_params() 100 | assert params 101 | 102 | # set the same values 103 | trainer.set_params(params) 104 | params2 = trainer.get_params() 105 | assert params2 == params 106 | 107 | # change a value 
108 | trainer.set("feature.possible_states", True) 109 | assert trainer.get_params()["feature.possible_states"] is True 110 | 111 | trainer.set("feature.possible_states", False) 112 | assert trainer.get_params()["feature.possible_states"] is False 113 | 114 | # invalid parameter 115 | params["foo"] = 5 116 | with pytest.raises(ValueError): 117 | trainer.set_params(params) 118 | 119 | 120 | def test_params_and_help(): 121 | trainer = Trainer() 122 | 123 | trainer.select("lbfgs") 124 | assert "c1" in trainer.params() 125 | assert "c2" in trainer.params() 126 | assert "num_memories" in trainer.params() 127 | assert "L1" in trainer.help("c1") 128 | 129 | trainer.select("l2sgd") 130 | assert "c2" in trainer.params() 131 | assert "c1" not in trainer.params() 132 | assert "L2" in trainer.help("c2") 133 | 134 | 135 | def test_help_invalid_parameter(): 136 | trainer = Trainer() 137 | trainer.select("l2sgd") 138 | 139 | # This segfaults without a workaround; 140 | # see https://github.com/chokkan/crfsuite/pull/21 141 | with pytest.raises(ValueError): 142 | trainer.help("foo") 143 | 144 | with pytest.raises(ValueError): 145 | trainer.help("c1") 146 | 147 | 148 | def test_get_parameter(): 149 | trainer = Trainer() 150 | trainer.select("l2sgd") 151 | assert abs(trainer.get("c2") - 0.1) > 1e-6 152 | trainer.set("c2", 0.1) 153 | assert abs(trainer.get("c2") - 0.1) < 1e-6 154 | 155 | 156 | def test_set_parameters_in_constructor(): 157 | trainer = Trainer(params={"c2": 100}) 158 | assert abs(trainer.get("c2") - 100) < 1e-6 159 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | Changes 2 | ======= 3 | 4 | 0.9.9 (2023-02-01) 5 | ------------------ 6 | 7 | * Python 3.11 Support 8 | 9 | 0.9.7 (2020-03-15) 10 | ------------------ 11 | 12 | * Python 3.4 is no longer supported (it may work, but CI is disabled) 13 | * Python 3.8 support 14 | * fixed 
installation issues on OS X (thanks @kvinwang) 15 | * make it easier for distributions to have a reproducible build 16 | (thanks @bmwiedemann) 17 | 18 | 0.9.6 (2018-08-01) 19 | ------------------ 20 | 21 | * Python 3.7 support (thanks @fgregg, @danmacnaughtan and @fuhrysteve). 22 | * Python 3.3 support is dropped. 23 | * new Tagger.open_inmemory method which allows to load tagger data 24 | without having a file on-disk (thanks @lucywang000). 25 | * license information is added to setup.py (thanks @nils-werner). 26 | 27 | 0.9.5 (2017-09-05) 28 | ------------------ 29 | 30 | * Python 3.6 wheels for Windows (thanks @fgregg). 31 | 32 | 0.9.4 (2017-09-04) 33 | ------------------ 34 | 35 | * Packaging fix (thanks @fgregg). 36 | 37 | 0.9.3 (2017-09-03) 38 | ------------------ 39 | 40 | * Fixed compatibility with Python 3.5+ on Windows (thanks @fgregg); 41 | * CRFSuite C++ library is updated to latest version, this fixes several 42 | memory leaks and improves performance (thanks @fgregg); 43 | * extension is rebuilt with Cython 0.26.1. 44 | 45 | 0.9.2 (2017-05-04) 46 | ------------------ 47 | 48 | * binary wheels for OS X and Linux (thanks @jeancochrane). 49 | 50 | 0.9.1 (2016-12-19) 51 | ------------------ 52 | 53 | This is a release without changes in functionality. 54 | 55 | * Repository is moved to https://github.com/scrapinghub/python-crfsuite; 56 | * We're now providing Windows wheels for Python 2.7, 3.3. and 3.4. 57 | 58 | 0.9 (2016-12-08) 59 | ---------------- 60 | 61 | * Python 2.6 support is dropped; 62 | * CRFSuite C++ library is updated to a more recent commit; 63 | * improved Windows support (thanks @fgregg); 64 | * fixed building with gcc < 5.0.0 (thanks @kantan2015); 65 | * extension is rebuilt with Cython 0.25.1; this improves PyPy compatibility 66 | (but we're not quite there yet). 67 | * docs: trainer.logparser example is added to the notebook (thanks @samgalen). 
68 | 69 | 0.8.4 (2015-11-25) 70 | ------------------ 71 | 72 | * the wrapper is rebuilt with Cython 0.23.4; 73 | * declared Python 3.5 compatibility; 74 | * fixed an issue with feature names ending with white spaces. 75 | 76 | 0.8.3 (2015-04-24) 77 | ------------------ 78 | 79 | * fix build on Windows. (thanks @fgregg) 80 | 81 | 0.8.2 (2015-02-04) 82 | ------------------ 83 | 84 | * memory leak is fixed by updating the bundled CRFsuite C++ library; 85 | * the wrapper is rebuilt with Cython 0.21.2. 86 | 87 | 0.8.1 (2014-10-10) 88 | ------------------ 89 | 90 | * fix packaging issues with 0.8 release. 91 | 92 | 0.8 (2014-10-10) 93 | ---------------- 94 | 95 | * :class:`~ItemSequence` wrapper is added; 96 | * tox tests are fixed. 97 | 98 | 0.7 (2014-08-11) 99 | ---------------- 100 | 101 | * More data formats for ``xseq``: ``{"prefix": {feature_dict}}`` and 102 | ``{"key": set(["key1",...])}`` feature dicts are now accepted by 103 | :class:`pycrfsuite.Trainer` and :class:`pycrfsuite.Tagger`; 104 | * feature separator changed from "=" to ":" (it looks better in case of 105 | multi-level features); 106 | * small doc and README fixes. 107 | 108 | 109 | 0.6.1 (2014-06-06) 110 | ------------------ 111 | 112 | * Switch to setuptools; 113 | * wheels are uploaded to pypi for faster installation. 114 | 115 | 0.6 (2014-05-29) 116 | ---------------- 117 | 118 | * More data formats for ``xseq``: ``{"key": "value"}`` and 119 | ``{"key": bool_value}`` feature dicts are now accepted by 120 | :class:`pycrfsuite.Trainer` and :class:`pycrfsuite.Tagger`. 121 | 122 | 0.5 (2014-05-27) 123 | ---------------- 124 | 125 | * Exceptions in logging message handlers are now propagated and raised. This 126 | allows, for example, to stop training earlier by pressing Ctrl-C. 
127 | 128 | * It is now possible to customize :class:`pycrfsuite.Trainer` logging 129 | more easily by overriding the following methods: 130 | :meth:`pycrfsuite.Trainer.on_start`, 131 | :meth:`pycrfsuite.Trainer.on_featgen_progress`, 132 | :meth:`pycrfsuite.Trainer.on_featgen_end`, 133 | :meth:`pycrfsuite.Trainer.on_prepared`, 134 | :meth:`pycrfsuite.Trainer.on_prepare_error`, 135 | :meth:`pycrfsuite.Trainer.on_iteration`, 136 | :meth:`pycrfsuite.Trainer.on_optimization_end` 137 | :meth:`pycrfsuite.Trainer.on_end`. The feature is implemented by parsing 138 | CRFsuite log. There is :class:`pycrfsuite.BaseTrainer` that is not 139 | doing this. 140 | 141 | 0.4.1 (2014-05-18) 142 | ------------------ 143 | 144 | * :meth:`pycrfsuite.Tagger.info()` is fixed. 145 | 146 | 0.4 (2014-05-16) 147 | ---------------- 148 | 149 | * (backwards-incompatible) training parameters are now passed 150 | using ``params`` argument of :class:`pycrfsuite.Trainer` constructor 151 | instead of ``**kwargs``; 152 | * (backwards-incompatible) logging support is dropped; 153 | * `verbose` argument for :class:`pycrfsuite.Trainer` constructor; 154 | * :meth:`pycrfsuite.Trainer.get_params` and 155 | :meth:`pycrfsuite.Trainer.set_params` for getting/setting multiple training 156 | parameters at once; 157 | * string handling in Python 3.x is fixed by rebuilding the wrapper with 158 | Cython 0.21dev; 159 | * algorithm names are normalized to support names used 160 | by crfsuite console utility and documented in crfsuite manual; 161 | * type conversion for training parameters is fixed: ``feature.minfreq`` 162 | now works, and boolean arguments become boolean. 163 | 164 | 0.3 (2014-05-14) 165 | ---------------- 166 | 167 | python-crfsuite now detects the feature format (dict vs list of strings) 168 | automatically - it turns out the performance overhead is negligible. 
169 | 170 | * ``Trainer.append_stringslists`` and ``Trainer.append_dicts`` methods 171 | are replaced with a single :meth:`pycrfsuite.Trainer.append` method; 172 | * ``Tagger.set_stringlists`` and ``Tagger.set_dicts`` methods are 173 | removed in favor of :meth:`pycrfsuite.Tagger.set` method; 174 | * ``feature_format`` arguments in :class:`pycrfsuite.Tagger` methods 175 | and constructor are dropped. 176 | 177 | 0.2 (2014-05-14) 178 | ---------------- 179 | 180 | * :meth:`pycrfsuite.Tagger.dump()` and :meth:`pycrfsuite.Tagger.info()` 181 | methods for model debugging; 182 | * a memory leak in Trainer is fixed (trainer instances were never 183 | garbage collected); 184 | * documentation and testing improvements. 185 | 186 | 0.1 (2014-04-30) 187 | ---------------- 188 | 189 | Many changes; python-crfsuite is almost rewritten. 190 | 191 | 0.0.1 (2014-04-24) 192 | ------------------ 193 | 194 | Initial release. 195 | -------------------------------------------------------------------------------- /pycrfsuite/_logparser.py: -------------------------------------------------------------------------------- 1 | import re 2 | import fractions 3 | from collections import namedtuple 4 | 5 | LabelScore = namedtuple("LabelScore", "match model ref precision recall f1") 6 | 7 | 8 | class TrainLogParser: 9 | def __init__(self): 10 | self.state = None 11 | self.featgen_percent = -2 12 | self.featgen_num_features = None 13 | self.featgen_seconds = None 14 | self.training_seconds = None 15 | self.storing_seconds = None 16 | 17 | self.iterations = [] 18 | self.last_iteration = None 19 | self.log = [] 20 | self.events = [] 21 | 22 | def feed(self, line): 23 | # if line != '\n': 24 | self.log.append(line) 25 | if self.state is None: 26 | self.state = "STARTING" 27 | self.handle_STARTING(line) 28 | self.events.append(("start", 0, len(self.log))) 29 | return "start" 30 | event = getattr(self, "handle_" + self.state)(line) 31 | if event is not None: 32 | start, end = self.events[-1][2], 
len(self.log) 33 | if event in ("prepared", "optimization_end"): 34 | end -= 1 35 | self.events.append((event, start, end)) 36 | return event 37 | 38 | @property 39 | def last_log(self): 40 | event, start, end = self.events[-1] 41 | return "".join(self.log[start:end]) 42 | 43 | def handle_STARTING(self, line): 44 | if line.startswith("Feature generation"): 45 | self.state = "FEATGEN" 46 | 47 | def handle_FEATGEN(self, line): 48 | if line in "0123456789.10": 49 | self.featgen_percent += 2 50 | return "featgen_progress" 51 | 52 | m = re.match(r"Number of features: (\d+)", line) 53 | if m: 54 | self.featgen_num_features = int(m.group(1)) 55 | return None 56 | 57 | if self._seconds(line) is not None: 58 | self.featgen_seconds = self._seconds(line) 59 | self.state = "AFTER_FEATGEN" 60 | return "featgen_end" 61 | 62 | def handle_AFTER_FEATGEN(self, line): 63 | if self._iteration_head(line) is not None: 64 | self.state = "ITERATION" 65 | self.handle_ITERATION(line) 66 | return "prepared" 67 | 68 | if "terminated with error" in line: 69 | self.state = "AFTER_ITERATION" 70 | return "prepare_error" 71 | 72 | def handle_ITERATION(self, line): 73 | if self._iteration_head(line) is not None: 74 | self.last_iteration = { 75 | "num": self._iteration_head(line), 76 | "scores": {}, 77 | } 78 | self.iterations.append(self.last_iteration) 79 | elif line == "\n": 80 | self.state = "AFTER_ITERATION" 81 | return "iteration" 82 | 83 | def add_re(key, pattern, typ): 84 | m = re.match(pattern, line) 85 | if m: 86 | self.last_iteration[key] = typ(m.group(1)) 87 | 88 | add_re("loss", r"Loss: (\d+\.\d+)", float) 89 | add_re("feature_norm", r"Feature norm: (\d+\.\d+)", float) 90 | add_re("error_norm", r"Error norm: (\d+\.\d+)", float) 91 | add_re("active_features", r"Active features: (\d+)", int) 92 | add_re("linesearch_trials", r"Line search trials: (\d+)", int) 93 | add_re("linesearch_step", r"Line search step: (\d+\.\d+)", float) 94 | add_re("time", r"Seconds required for this iteration: 
(\d+\.\d+)", float) 95 | 96 | m = re.match( 97 | r"Macro-average precision, recall, F1: \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)", 98 | line, 99 | ) 100 | if m: 101 | self.last_iteration["avg_precision"] = float(m.group(1)) 102 | self.last_iteration["avg_recall"] = float(m.group(2)) 103 | self.last_iteration["avg_f1"] = float(m.group(3)) 104 | 105 | m = re.match(r"Item accuracy: (\d+) / (\d+)", line) 106 | if m: 107 | acc = fractions.Fraction(int(m.group(1)), int(m.group(2))) 108 | self.last_iteration["item_accuracy"] = acc 109 | self.last_iteration["item_accuracy_float"] = float(acc) 110 | 111 | m = re.match(r"Instance accuracy: (\d+) / (\d+)", line) 112 | if m: 113 | acc = fractions.Fraction(int(m.group(1)), int(m.group(2))) 114 | self.last_iteration["instance_accuracy"] = acc 115 | self.last_iteration["instance_accuracy_float"] = float(acc) 116 | 117 | m = re.match( 118 | r"\s{4}(.+): \((\d+), (\d+), (\d+)\) \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)", 119 | line, 120 | ) 121 | if m: 122 | self.last_iteration["scores"][m.group(1)] = LabelScore( 123 | **{ 124 | "match": int(m.group(2)), 125 | "model": int(m.group(3)), 126 | "ref": int(m.group(4)), 127 | "precision": float(m.group(5)), 128 | "recall": float(m.group(6)), 129 | "f1": float(m.group(7)), 130 | } 131 | ) 132 | 133 | m = re.match(r"\s{4}(.+): \(0, 0, 0\) \(\*{6}, \*{6}, \*{6}\)", line) 134 | if m: 135 | self.last_iteration["scores"][m.group(1)] = LabelScore( 136 | **{ 137 | "match": 0, 138 | "model": 0, 139 | "ref": 0, 140 | "precision": None, 141 | "recall": None, 142 | "f1": None, 143 | } 144 | ) 145 | 146 | def handle_AFTER_ITERATION(self, line): 147 | if self._iteration_head(line) is not None: 148 | self.state = "ITERATION" 149 | return self.handle_ITERATION(line) 150 | 151 | m = re.match(r"Total seconds required for training: (\d+\.\d+)", line) 152 | if m: 153 | self.training_seconds = float(m.group(1)) 154 | 155 | if line.startswith("Storing the model"): 156 | self.state = "STORING" 157 | return 
"optimization_end" 158 | 159 | def handle_STORING(self, line): 160 | if line == "\n": 161 | return "end" 162 | elif self._seconds(line): 163 | self.storing_seconds = self._seconds(line) 164 | 165 | def _iteration_head(self, line): 166 | m = re.match(r"\*{5} (?:Iteration|Epoch) #(\d+) \*{5}\n", line) 167 | if m: 168 | return int(m.group(1)) 169 | 170 | def _seconds(self, line): 171 | m = re.match(r"Seconds required: (\d+\.\d+)", line) 172 | if m: 173 | return float(m.group(1)) 174 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/python-crfsuite.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/python-crfsuite.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/python-crfsuite" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/python-crfsuite" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 
112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 
163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. 
gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 
92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\python-crfsuite.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\python-crfsuite.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 
155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 
221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /tests/test_tagger.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pycrfsuite import ItemSequence, Tagger, Trainer 4 | 5 | 6 | def test_open_close_labels(model_filename, yseq): 7 | tagger = Tagger() 8 | 9 | with pytest.raises(ValueError): 10 | # tagger should be closed, so labels() method should fail here 11 | labels = tagger.labels() 12 | 13 | with tagger.open(model_filename): 14 | labels = tagger.labels() 15 | assert set(labels) == set(yseq) 16 | 17 | with pytest.raises(ValueError): 18 | # tagger should be closed, so labels() method should fail here 19 | labels = tagger.labels() 20 | 21 | 22 | def test_open_non_existing(): 23 | tagger = Tagger() 24 | with pytest.raises(IOError): 25 | tagger.open("foo") 26 | 27 | 28 | def test_open_invalid(): 29 | tagger = Tagger() 30 | with pytest.raises(ValueError): 31 | tagger.open(__file__) 32 | 33 | 34 | def test_open_invalid_small(tmpdir): 35 | tmp = tmpdir.join("tmp.txt") 36 | tmp.write(b"foo") 37 | tagger = Tagger() 38 | with pytest.raises(ValueError): 39 | tagger.open(str(tmp)) 40 | 41 | 42 | def test_open_invalid_small_with_correct_signature(tmpdir): 43 | tmp = tmpdir.join("tmp.txt") 44 | tmp.write(b"lCRFfoo") 45 | tagger = Tagger() 46 | with 
pytest.raises(ValueError): 47 | tagger.open(str(tmp)) 48 | 49 | 50 | @pytest.mark.xfail(reason="see https://github.com/chokkan/crfsuite/pull/24", run=False) 51 | def test_open_invalid_with_correct_signature(tmpdir): 52 | tmp = tmpdir.join("tmp.txt") 53 | tmp.write(b"lCRFfoo" * 100) 54 | tagger = Tagger() 55 | with pytest.raises(ValueError): 56 | tagger.open(str(tmp)) 57 | 58 | 59 | def test_open_inmemory(model_bytes, xseq, yseq): 60 | with Tagger().open_inmemory(model_bytes) as tagger: 61 | assert tagger.tag(xseq) == yseq 62 | 63 | 64 | def test_open_inmemory_invalid(): 65 | tagger = Tagger() 66 | with pytest.raises(ValueError): 67 | tagger.open_inmemory(b"") 68 | 69 | with pytest.raises(ValueError): 70 | tagger.open_inmemory(b"lCRFabc") 71 | 72 | 73 | @pytest.mark.xfail( 74 | reason="see https://github.com/scrapinghub/python-crfsuite/issues/28", run=False 75 | ) 76 | def test_tag_not_opened(xseq): 77 | tagger = Tagger() 78 | with pytest.raises(Exception): 79 | tagger.tag(xseq) 80 | 81 | 82 | def test_tag(model_filename, xseq, yseq): 83 | with Tagger().open(model_filename) as tagger: 84 | assert tagger.tag(xseq) == yseq 85 | 86 | 87 | def test_tag_item_sequence(model_filename, xseq, yseq): 88 | with Tagger().open(model_filename) as tagger: 89 | assert tagger.tag(ItemSequence(xseq)) == yseq 90 | 91 | 92 | def test_tag_string_lists(model_filename, xseq, yseq): 93 | with Tagger().open(model_filename) as tagger: 94 | # Working with lists is supported, 95 | # but if we discard weights the results become different 96 | data = [x.keys() for x in xseq] 97 | assert tagger.tag(data) != yseq 98 | 99 | 100 | def test_tag_bools(model_filename, xseq, yseq): 101 | with Tagger().open(model_filename) as tagger: 102 | # Some values are bools: 103 | # True <=> 1.0; False <=> 0.0 104 | data = [ 105 | {k: bool(v) if v == 0 or v == 1 else v for (k, v) in x.items()} 106 | for x in xseq 107 | ] 108 | assert tagger.tag(data) == yseq 109 | 110 | 111 | def test_tag_formats(tmpdir, xseq, 
yseq): 112 | # make all coefficients 1 and check that results are the same 113 | model_filename = str(tmpdir.join("model.crfsuite")) 114 | xseq = [{key: 1 for key in x} for x in xseq] 115 | 116 | trainer = Trainer() 117 | trainer.set("c2", 1e-6) # make sure model overfits 118 | trainer.append(xseq, yseq) 119 | trainer.train(model_filename) 120 | 121 | with Tagger().open(model_filename) as tagger: 122 | assert tagger.tag(xseq) == yseq 123 | 124 | # strings 125 | with Tagger().open(model_filename) as tagger: 126 | data = [x.keys() for x in xseq] 127 | assert tagger.tag(data) == yseq 128 | 129 | 130 | @pytest.mark.xfail() 131 | @pytest.mark.parametrize( 132 | "bad_seq", 133 | [ 134 | "foo", 135 | ["foo"], # should be a list of lists of strings 136 | [[{"foo": 1.0}]], # should be a list of dicts 137 | ], 138 | ) 139 | def test_tag_invalid_feature_format(model_filename, bad_seq): 140 | with Tagger().open(model_filename) as tagger: 141 | with pytest.raises(ValueError): 142 | tagger.tag(bad_seq) 143 | 144 | 145 | def test_tag_probability(model_filename, xseq, yseq): 146 | with Tagger().open(model_filename) as tagger: 147 | res = tagger.tag(xseq) 148 | prob = tagger.probability(res) 149 | prob2 = tagger.probability([yseq[0]] * len(yseq)) 150 | assert prob > prob2 151 | assert 0 < prob < 1 152 | assert 0 < prob2 < 1 153 | 154 | 155 | def test_dump(tmpdir, model_filename): 156 | with Tagger().open(model_filename) as tagger: 157 | dump_filename = str(tmpdir.join("dump.txt")) 158 | tagger.dump(dump_filename) 159 | 160 | with open(dump_filename, "rb") as f: 161 | res = f.read().decode("utf8") 162 | assert "LABELS = {" in res 163 | assert "солнце:не светит --> rainy:" in res 164 | 165 | # it shouldn't segfault on a closed tagger 166 | with pytest.raises(RuntimeError): 167 | tagger.dump(dump_filename) 168 | 169 | 170 | def test_info(model_filename): 171 | with Tagger().open(model_filename) as tagger: 172 | res = tagger.info() 173 | 174 | assert res.transitions[("sunny", "sunny")] 
> res.transitions[("sunny", "rainy")] 175 | assert ( 176 | res.state_features[("walk", "sunny")] 177 | > res.state_features[("walk", "rainy")] 178 | ) 179 | assert ("солнце:не светит", "rainy") in res.state_features 180 | assert res.header["num_labels"] == "3" 181 | assert set(res.labels.keys()) == set(["sunny", "rainy", "好"]) 182 | assert set(res.attributes.keys()) == set( 183 | ["shop", "walk", "clean", "солнце:не светит", "world"] 184 | ) 185 | 186 | # it shouldn't segfault on a closed tagger 187 | with pytest.raises(RuntimeError): 188 | tagger.info() 189 | 190 | 191 | def test_append_strstr_dicts(tmpdir): 192 | trainer = Trainer() 193 | trainer.append( 194 | [{"foo": "bar"}, {"baz": False}, {"foo": "bar", "baz": True}, {"baz": 0.2}], 195 | ["spam", "egg", "spam", "spam"], 196 | ) 197 | model_filename = str(tmpdir.join("model.crfsuite")) 198 | trainer.train(model_filename) 199 | 200 | with Tagger().open(model_filename) as tagger: 201 | info = tagger.info() 202 | assert set(info.attributes.keys()) == {"foo:bar", "baz"} 203 | assert info.state_features[("foo:bar", "spam")] > 0 204 | 205 | 206 | def test_append_nested_dicts(tmpdir): 207 | trainer = Trainer() 208 | trainer.append( 209 | [ 210 | { 211 | "foo": { 212 | "bar": "baz", 213 | "spam": 0.5, 214 | "egg": ["x", "y"], 215 | "ham": {"x": -0.5, "y": -0.1}, 216 | }, 217 | }, 218 | { 219 | "foo": {"bar": "ham", "spam": -0.5, "ham": {"x", "y"}}, 220 | }, 221 | ], 222 | ["first", "second"], 223 | ) 224 | model_filename = str(tmpdir.join("model.crfsuite")) 225 | trainer.train(model_filename) 226 | 227 | with Tagger().open(model_filename) as tagger: 228 | info = tagger.info() 229 | assert set(info.attributes.keys()) == { 230 | "foo:bar:baz", 231 | "foo:spam", 232 | "foo:egg:x", 233 | "foo:egg:y", 234 | "foo:ham:x", 235 | "foo:ham:y", 236 | "foo:bar:ham", 237 | } 238 | 239 | for feat in ["foo:bar:baz", "foo:spam", "foo:egg:x", "foo:egg:y"]: 240 | assert info.state_features[(feat, "first")] > 0 241 | assert 
info.state_features.get((feat, "second"), 0) <= 0 242 | 243 | for feat in ["foo:bar:ham", "foo:ham:x", "foo:ham:y"]: 244 | assert info.state_features[(feat, "second")] > 0 245 | assert info.state_features.get((feat, "first"), 0) <= 0 246 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # python-crfsuite documentation build configuration file, created by 4 | # sphinx-quickstart on Sun Apr 27 15:19:14 2014. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | #sys.path.insert(0, os.path.abspath('.')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx.ext.autodoc', 33 | 'sphinx.ext.viewcode', 34 | 'numpydoc', 35 | ] 36 | numpydoc_show_class_members = False 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # The suffix of source filenames. 42 | source_suffix = '.rst' 43 | 44 | # The encoding of source files. 
45 | #source_encoding = 'utf-8-sig' 46 | 47 | # The master toctree document. 48 | master_doc = 'index' 49 | 50 | # General information about the project. 51 | project = u'python-crfsuite' 52 | copyright = u'2020, Terry Peng, Mikhail Korobov' 53 | 54 | # The version info for the project you're documenting, acts as replacement for 55 | # |version| and |release|, also used in various other places throughout the 56 | # built documents. 57 | # 58 | # The short X.Y version. 59 | version = '0.9' 60 | # The full version, including alpha/beta/rc tags. 61 | release = '0.9.7' 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | #language = None 66 | 67 | # There are two options for replacing |today|: either, you set today to some 68 | # non-false value, then it is used: 69 | #today = '' 70 | # Else, today_fmt is used as the format for a strftime call. 71 | #today_fmt = '%B %d, %Y' 72 | 73 | # List of patterns, relative to source directory, that match files and 74 | # directories to ignore when looking for source files. 75 | exclude_patterns = ['_build'] 76 | 77 | # The reST default role (used for this markup: `text`) to use for all 78 | # documents. 79 | #default_role = None 80 | 81 | # If true, '()' will be appended to :func: etc. cross-reference text. 82 | #add_function_parentheses = True 83 | 84 | # If true, the current module name will be prepended to all description 85 | # unit titles (such as .. function::). 86 | #add_module_names = True 87 | 88 | # If true, sectionauthor and moduleauthor directives will be shown in the 89 | # output. They are ignored by default. 90 | #show_authors = False 91 | 92 | # The name of the Pygments (syntax highlighting) style to use. 93 | pygments_style = 'sphinx' 94 | 95 | # A list of ignored prefixes for module index sorting. 96 | #modindex_common_prefix = [] 97 | 98 | # If true, keep warnings as "system message" paragraphs in the built documents. 
99 | #keep_warnings = False 100 | 101 | 102 | # -- Options for HTML output ---------------------------------------------- 103 | 104 | # The theme to use for HTML and HTML Help pages. See the documentation for 105 | # a list of builtin themes. 106 | html_theme = 'default' 107 | 108 | # Theme options are theme-specific and customize the look and feel of a theme 109 | # further. For a list of options available for each theme, see the 110 | # documentation. 111 | #html_theme_options = {} 112 | 113 | # Add any paths that contain custom themes here, relative to this directory. 114 | #html_theme_path = [] 115 | 116 | # The name for this set of Sphinx documents. If None, it defaults to 117 | # " v documentation". 118 | #html_title = None 119 | 120 | # A shorter title for the navigation bar. Default is the same as html_title. 121 | #html_short_title = None 122 | 123 | # The name of an image file (relative to this directory) to place at the top 124 | # of the sidebar. 125 | #html_logo = None 126 | 127 | # The name of an image file (within the static path) to use as favicon of the 128 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 129 | # pixels large. 130 | #html_favicon = None 131 | 132 | # Add any paths that contain custom static files (such as style sheets) here, 133 | # relative to this directory. They are copied after the builtin static files, 134 | # so a file named "default.css" will overwrite the builtin "default.css". 135 | html_static_path = ['_static'] 136 | 137 | # Add any extra paths that contain custom files (such as robots.txt or 138 | # .htaccess) here, relative to this directory. These files are copied 139 | # directly to the root of the documentation. 140 | #html_extra_path = [] 141 | 142 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 143 | # using the given strftime format. 
144 | #html_last_updated_fmt = '%b %d, %Y' 145 | 146 | # If true, SmartyPants will be used to convert quotes and dashes to 147 | # typographically correct entities. 148 | #html_use_smartypants = True 149 | 150 | # Custom sidebar templates, maps document names to template names. 151 | #html_sidebars = {} 152 | 153 | # Additional templates that should be rendered to pages, maps page names to 154 | # template names. 155 | #html_additional_pages = {} 156 | 157 | # If false, no module index is generated. 158 | #html_domain_indices = True 159 | 160 | # If false, no index is generated. 161 | #html_use_index = True 162 | 163 | # If true, the index is split into individual pages for each letter. 164 | #html_split_index = False 165 | 166 | # If true, links to the reST sources are added to the pages. 167 | #html_show_sourcelink = True 168 | 169 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 170 | #html_show_sphinx = True 171 | 172 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 173 | #html_show_copyright = True 174 | 175 | # If true, an OpenSearch description file will be output, and all pages will 176 | # contain a tag referring to it. The value of this option must be the 177 | # base URL from which the finished HTML is served. 178 | #html_use_opensearch = '' 179 | 180 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 181 | #html_file_suffix = None 182 | 183 | # Output file base name for HTML help builder. 184 | htmlhelp_basename = 'python-crfsuitedoc' 185 | 186 | 187 | # -- Options for LaTeX output --------------------------------------------- 188 | 189 | latex_elements = { 190 | # The paper size ('letterpaper' or 'a4paper'). 191 | #'papersize': 'letterpaper', 192 | 193 | # The font size ('10pt', '11pt' or '12pt'). 194 | #'pointsize': '10pt', 195 | 196 | # Additional stuff for the LaTeX preamble. 197 | #'preamble': '', 198 | } 199 | 200 | # Grouping the document tree into LaTeX files. 
List of tuples 201 | # (source start file, target name, title, 202 | # author, documentclass [howto, manual, or own class]). 203 | latex_documents = [ 204 | ('index', 'python-crfsuite.tex', u'python-crfsuite Documentation', 205 | u'Terry Peng, Mikhail Korobov', 'manual'), 206 | ] 207 | 208 | # The name of an image file (relative to this directory) to place at the top of 209 | # the title page. 210 | #latex_logo = None 211 | 212 | # For "manual" documents, if this is true, then toplevel headings are parts, 213 | # not chapters. 214 | #latex_use_parts = False 215 | 216 | # If true, show page references after internal links. 217 | #latex_show_pagerefs = False 218 | 219 | # If true, show URL addresses after external links. 220 | #latex_show_urls = False 221 | 222 | # Documents to append as an appendix to all manuals. 223 | #latex_appendices = [] 224 | 225 | # If false, no module index is generated. 226 | #latex_domain_indices = True 227 | 228 | 229 | # -- Options for manual page output --------------------------------------- 230 | 231 | # One entry per manual page. List of tuples 232 | # (source start file, name, description, authors, manual section). 233 | man_pages = [ 234 | ('index', 'python-crfsuite', u'python-crfsuite Documentation', 235 | [u'Terry Peng, Mikhail Korobov'], 1) 236 | ] 237 | 238 | # If true, show URL addresses after external links. 239 | #man_show_urls = False 240 | 241 | 242 | # -- Options for Texinfo output ------------------------------------------- 243 | 244 | # Grouping the document tree into Texinfo files. List of tuples 245 | # (source start file, target name, title, author, 246 | # dir menu entry, description, category) 247 | texinfo_documents = [ 248 | ('index', 'python-crfsuite', u'python-crfsuite Documentation', 249 | u'Terry Peng, Mikhail Korobov', 'python-crfsuite', 'Python CRFSuite wrapper.', 250 | 'Miscellaneous'), 251 | ] 252 | 253 | # Documents to append as an appendix to all manuals. 
254 | #texinfo_appendices = [] 255 | 256 | # If false, no module index is generated. 257 | #texinfo_domain_indices = True 258 | 259 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 260 | #texinfo_show_urls = 'footnote' 261 | 262 | # If true, do not generate a @detailmenu in the "Top" node's menu. 263 | #texinfo_no_detailmenu = False 264 | -------------------------------------------------------------------------------- /pycrfsuite/_pycrfsuite.pyx: -------------------------------------------------------------------------------- 1 | # cython: embedsignature=True 2 | # cython: c_string_type=str 3 | # cython: c_string_encoding=utf-8 4 | # cython: profile=False 5 | # distutils: language=c++ 6 | from . cimport crfsuite_api 7 | from libcpp.string cimport string 8 | 9 | import sys 10 | import os 11 | import contextlib 12 | import tempfile 13 | 14 | from pycrfsuite import _dumpparser 15 | from pycrfsuite import _logparser 16 | 17 | CRFSUITE_VERSION = crfsuite_api.version() 18 | 19 | 20 | class CRFSuiteError(Exception): 21 | 22 | _messages = { 23 | crfsuite_api.CRFSUITEERR_UNKNOWN: "Unknown error occurred", 24 | crfsuite_api.CRFSUITEERR_OUTOFMEMORY: "Insufficient memory", 25 | crfsuite_api.CRFSUITEERR_NOTSUPPORTED: "Unsupported operation", 26 | crfsuite_api.CRFSUITEERR_INCOMPATIBLE: "Incompatible data", 27 | crfsuite_api.CRFSUITEERR_INTERNAL_LOGIC: "Internal error", 28 | crfsuite_api.CRFSUITEERR_OVERFLOW: "Overflow", 29 | crfsuite_api.CRFSUITEERR_NOTIMPLEMENTED: "Not implemented", 30 | } 31 | 32 | def __init__(self, code): 33 | self.code = code 34 | Exception.__init__(self._messages.get(self.code, "Unexpected error")) 35 | 36 | 37 | cdef string _SEP = b':' 38 | 39 | cdef extern crfsuite_api.Item to_item(x) except+: 40 | """ Convert a Python object to an Item. 
""" 41 | cdef crfsuite_api.Item c_item 42 | cdef double c_value 43 | cdef string c_key 44 | cdef bint is_dict, is_nested_value 45 | 46 | is_dict = isinstance(x, dict) 47 | c_item = crfsuite_api.Item() 48 | c_item.reserve(len(x)) # at least this amount is required 49 | for key in x: 50 | if isinstance(key, unicode): 51 | c_key = (key).encode('utf8') 52 | else: 53 | c_key = key 54 | 55 | if not is_dict: 56 | # "string_key" 57 | c_value = 1.0 58 | c_item.push_back(crfsuite_api.Attribute(c_key, c_value)) 59 | else: 60 | value = (x)[key] 61 | 62 | if isinstance(value, (dict, list, set)): 63 | # {"string_prefix": {...}} 64 | for attr in to_item(value): 65 | c_item.push_back( 66 | crfsuite_api.Attribute(c_key + _SEP + attr.attr, attr.value) 67 | ) 68 | else: 69 | if isinstance(value, unicode): 70 | # {"string_key": "string_value"} 71 | c_key += _SEP 72 | c_key += (value).encode('utf8') 73 | c_value = 1.0 74 | elif isinstance(value, bytes): 75 | # {"string_key": "string_value"} 76 | c_key += _SEP 77 | c_key += value 78 | c_value = 1.0 79 | else: 80 | # {"string_key": float_value} 81 | # {"string_key": bool} 82 | c_value = value 83 | 84 | c_item.push_back(crfsuite_api.Attribute(c_key, c_value)) 85 | 86 | return c_item 87 | 88 | 89 | cdef extern crfsuite_api.ItemSequence to_seq(pyseq) except+: 90 | """ 91 | Convert an iterable to an ItemSequence. 92 | Elements of an iterable could be: 93 | 94 | * {"string_key": float_value} dicts; 95 | * {"string_key": bool} dicts: True is converted to 1.0, False - to 0.0; 96 | * {"string_key": "string_value"} dicts: result is {"string_key=string_value": 1.0} 97 | * "string_key": result is {"string_key": 1.0} 98 | * {"string_prefix": {...}} nested dicts: nested dict is processed and 99 | "string_prefix" s prepended to each key. 100 | * {"string_prefix": [...]} dicts: nested list is processed and 101 | "string_prefix" s prepended to each key. 
102 | """ 103 | cdef crfsuite_api.ItemSequence c_seq 104 | 105 | if isinstance(pyseq, ItemSequence): 106 | c_seq = (pyseq).c_seq 107 | else: 108 | for x in pyseq: 109 | c_seq.push_back(to_item(x)) 110 | return c_seq 111 | 112 | 113 | cdef class ItemSequence(object): 114 | """ 115 | A wrapper for crfsuite ItemSequence - a class for storing 116 | features for all items in a single sequence. 117 | 118 | Using this class is an alternative to passing data to :class:`~Trainer` 119 | and :class:`Tagger` directly. By using this class it is possible to 120 | save some time if the same input sequence is passed to trainers/taggers 121 | more than once - features won't be processed multiple times. 122 | It also allows to get "processed" features/attributes that are sent 123 | to CRFsuite - they could be helpful e.g. to check which attributes 124 | (returned by :meth:`~Tagger.info`) are active for a given observation. 125 | 126 | Initialize ItemSequence with a list of item features: 127 | 128 | >>> ItemSequence([{'foo': 1, 'bar': 0}, {'foo': 1.5, 'baz': 2}]) 129 | 130 | 131 | Item features could be in one of the following formats: 132 | 133 | * {"string_key": float_weight, ...} dict where keys are 134 | observed features and values are their weights; 135 | * {"string_key": bool, ...} dict; True is converted to 1.0 weight, 136 | False - to 0.0; 137 | * {"string_key": "string_value", ...} dict; that's the same as 138 | {"string_key=string_value": 1.0, ...} 139 | * ["string_key1", "string_key2", ...] list; that's the same as 140 | {"string_key1": 1.0, "string_key2": 1.0, ...} 141 | * {"string_prefix": {...}} dicts: nested dict is processed and 142 | "string_prefix" s prepended to each key. 143 | * {"string_prefix": [...]} dicts: nested list is processed and 144 | "string_prefix" s prepended to each key. 145 | * {"string_prefix": set([...])} dicts: nested list is processed and 146 | "string_prefix" s prepended to each key. 147 | 148 | Dict-based features can be mixed, i.e. 
this is allowed:: 149 | 150 | {"key1": float_weight, 151 | "key2": "string_value", 152 | "key3": bool_value, 153 | "key4: {"key5": ["x", "y"], "key6": float_value}, 154 | } 155 | 156 | """ 157 | cdef crfsuite_api.ItemSequence c_seq 158 | 159 | def __init__(self, pyseq): 160 | self.c_seq = to_seq(pyseq) 161 | 162 | def items(self): 163 | """ 164 | Return a list of prepared item features: 165 | a list of ``{unicode_key: float_value}`` dicts. 166 | 167 | >>> ItemSequence([["foo"], {"bar": {"baz": 1}}]).items() 168 | [{'foo': 1.0}, {'bar:baz': 1.0}] 169 | 170 | """ 171 | cdef crfsuite_api.Item c_item 172 | cdef crfsuite_api.Attribute c_attr 173 | cdef bytes key 174 | seq = [] 175 | 176 | for c_item in self.c_seq: 177 | x = {} 178 | for c_attr in c_item: 179 | # Always decode keys from utf-8. It means binary keys are 180 | # not supported. I think it is OK because Tagger.info() 181 | # also only supports utf-8. 182 | 183 | # XXX: (c_attr.attr).decode('utf8') doesn't 184 | # work properly in Cython 0.21 185 | key = c_attr.attr.c_str() 186 | x[key.decode('utf8')] = c_attr.value 187 | seq.append(x) 188 | return seq 189 | 190 | def __len__(self): 191 | return self.c_seq.size() 192 | 193 | def __repr__(self): 194 | return "" % len(self) 195 | 196 | 197 | def _intbool(txt): 198 | return bool(int(txt)) 199 | 200 | 201 | cdef class BaseTrainer(object): 202 | """ 203 | The trainer class. 204 | 205 | This class maintains a data set for training, and provides an interface 206 | to various training algorithms. 207 | 208 | Parameters 209 | ---------- 210 | algorithm : {'lbfgs', 'l2sgd', 'ap', 'pa', 'arow'} 211 | The name of the training algorithm. See :meth:`Trainer.select`. 212 | 213 | params : dict, optional 214 | Training parameters. See :meth:`Trainer.set_params` 215 | and :meth:`Trainer.set`. 216 | 217 | verbose : boolean 218 | Whether to print debug messages during training. Default is True. 
219 | 220 | """ 221 | cdef crfsuite_api.Trainer c_trainer 222 | 223 | _PARAMETER_TYPES = { 224 | 'feature.minfreq': float, 225 | 'feature.possible_states': _intbool, 226 | 'feature.possible_transitions': _intbool, 227 | 'c1': float, 228 | 'c2': float, 229 | 'max_iterations': int, 230 | 'num_memories': int, 231 | 'epsilon': float, 232 | 'period': int, # XXX: is it called 'stop' in docs? 233 | 'delta': float, 234 | 'linesearch': str, 235 | 'max_linesearch': int, 236 | 'calibration.eta': float, 237 | 'calibration.rate': float, 238 | 'calibration.samples': float, 239 | 'calibration.candidates': int, 240 | 'calibration.max_trials': int, 241 | 'type': int, 242 | 'c': float, 243 | 'error_sensitive': _intbool, 244 | 'averaging': _intbool, 245 | 'variance': float, 246 | 'gamma': float, 247 | } 248 | 249 | _ALGORITHM_ALIASES = { 250 | 'ap': 'averaged-perceptron', 251 | 'pa': 'passive-aggressive', 252 | } 253 | 254 | cdef public verbose 255 | 256 | def __init__(self, algorithm=None, params=None, verbose=True): 257 | if algorithm is not None: 258 | self.select(algorithm) 259 | if params is not None: 260 | self.set_params(params) 261 | self.verbose = verbose 262 | 263 | def __cinit__(self): 264 | # setup message handler 265 | self.c_trainer.set_handler(self, self._on_message) 266 | 267 | # fix segfaults, see https://github.com/chokkan/crfsuite/pull/21 268 | self.c_trainer.select("lbfgs".encode('ascii'), "crf1d".encode('ascii')) 269 | self.c_trainer._init_hack() 270 | 271 | cdef _on_message(self, string message): 272 | self.message(message) 273 | 274 | def message(self, message): 275 | """ 276 | Receive messages from the training algorithm. 277 | Override this method to receive messages of the training 278 | process. 279 | 280 | By default, this method prints messages 281 | if ``Trainer.verbose`` is True. 
282 | 283 | Parameters 284 | ---------- 285 | message : string 286 | The message 287 | """ 288 | if self.verbose: 289 | print(message, end='') 290 | 291 | def append(self, xseq, yseq, int group=0): 292 | """ 293 | Append an instance (item/label sequence) to the data set. 294 | 295 | Parameters 296 | ---------- 297 | xseq : a sequence of item features 298 | The item sequence of the instance. ``xseq`` should be a list 299 | of item features or an :class:`~ItemSequence` instance. 300 | Allowed item features formats are the same as described 301 | in :class:`~ItemSequence` docs. 302 | 303 | yseq : a sequence of strings 304 | The label sequence of the instance. The number 305 | of elements in yseq must be identical to that 306 | in xseq. 307 | 308 | group : int, optional 309 | The group number of the instance. Group numbers are used to 310 | select subset of data for heldout evaluation. 311 | """ 312 | self.c_trainer.append(to_seq(xseq), yseq, group) 313 | 314 | def select(self, algorithm, type='crf1d'): 315 | """ 316 | Initialize the training algorithm. 317 | 318 | Parameters 319 | ---------- 320 | algorithm : {'lbfgs', 'l2sgd', 'ap', 'pa', 'arow'} 321 | The name of the training algorithm. 322 | 323 | * 'lbfgs' for Gradient descent using the L-BFGS method, 324 | * 'l2sgd' for Stochastic Gradient Descent with L2 regularization term 325 | * 'ap' for Averaged Perceptron 326 | * 'pa' for Passive Aggressive 327 | * 'arow' for Adaptive Regularization Of Weight Vector 328 | 329 | type : string, optional 330 | The name of the graphical model. 331 | """ 332 | algorithm = algorithm.lower() 333 | algorithm = self._ALGORITHM_ALIASES.get(algorithm, algorithm) 334 | if not self.c_trainer.select(algorithm.encode('ascii'), type.encode('ascii')): 335 | raise ValueError( 336 | "Bad arguments: algorithm=%r, type=%r" % (algorithm, type) 337 | ) 338 | 339 | def train(self, model, int holdout=-1): 340 | """ 341 | Run the training algorithm. 
342 | This function starts the training algorithm with the data set given 343 | by :meth:`Trainer.append` method. 344 | 345 | Parameters 346 | ---------- 347 | model : string 348 | The filename to which the trained model is stored. 349 | If this value is empty, this function does not 350 | write out a model file. 351 | 352 | holdout : int, optional 353 | The group number of holdout evaluation. The 354 | instances with this group number will not be used 355 | for training, but for holdout evaluation. 356 | Default value is -1, meaning "use all instances for training". 357 | """ 358 | self._before_train() 359 | status_code = self.c_trainer.train(model, holdout) 360 | if status_code != crfsuite_api.CRFSUITE_SUCCESS: 361 | raise CRFSuiteError(status_code) 362 | 363 | def params(self): 364 | """ 365 | Obtain the list of parameters. 366 | 367 | This function returns the list of parameter names available for the 368 | graphical model and training algorithm specified in Trainer constructor 369 | or by :meth:`Trainer.select` method. 370 | 371 | Returns 372 | ------- 373 | list of strings 374 | The list of parameters available for the current 375 | graphical model and training algorithm. 376 | 377 | """ 378 | return self.c_trainer.params() 379 | 380 | def set_params(self, params): 381 | """ 382 | Set training parameters. 383 | 384 | Parameters 385 | ---------- 386 | params : dict 387 | A dict with parameters ``{name: value}`` 388 | """ 389 | for key, value in params.items(): 390 | self.set(key, value) 391 | 392 | def get_params(self): 393 | """ 394 | Get training parameters. 395 | 396 | Returns 397 | ------- 398 | dict 399 | A dictionary with ``{parameter_name: parameter_value}`` 400 | with all trainer parameters. 401 | """ 402 | # params = self.params() 403 | return dict((name, self.get(name)) for name in self.params()) 404 | 405 | def set(self, name, value): 406 | """ 407 | Set a training parameter. 
408 | This function sets a parameter value for the graphical model and 409 | training algorithm specified by :meth:`Trainer.select` method. 410 | 411 | Parameters 412 | ---------- 413 | name : string 414 | The parameter name. 415 | value : string 416 | The value of the parameter. 417 | 418 | """ 419 | if isinstance(value, bool): 420 | value = int(value) 421 | self.c_trainer.set(name, str(value)) 422 | 423 | def get(self, name): 424 | """ 425 | Get the value of a training parameter. 426 | This function gets a parameter value for the graphical model and 427 | training algorithm specified by :meth:`Trainer.select` method. 428 | 429 | Parameters 430 | ---------- 431 | name : string 432 | The parameter name. 433 | """ 434 | return self._cast_parameter(name, self.c_trainer.get(name)) 435 | 436 | def help(self, name): 437 | """ 438 | Get the description of a training parameter. 439 | This function obtains the help message for the parameter specified 440 | by the name. The graphical model and training algorithm must be 441 | selected by :meth:`Trainer.select` method before calling this method. 442 | 443 | Parameters 444 | ---------- 445 | name : string 446 | The parameter name. 447 | 448 | Returns 449 | ------- 450 | string 451 | The description (help message) of the parameter. 452 | 453 | """ 454 | if name not in self.params(): 455 | # c_trainer.help(name) segfaults without this workaround; 456 | # see https://github.com/chokkan/crfsuite/pull/21 457 | raise ValueError("Parameter not found: %s" % name) 458 | return self.c_trainer.help(name) 459 | 460 | def clear(self): 461 | """ Remove all instances in the data set. """ 462 | self.c_trainer.clear() 463 | 464 | def _cast_parameter(self, name, value): 465 | if name in self._PARAMETER_TYPES: 466 | return self._PARAMETER_TYPES[name](value) 467 | return value 468 | 469 | def _before_train(self): 470 | pass 471 | 472 | 473 | class Trainer(BaseTrainer): 474 | """ 475 | The trainer class. 
476 | 477 | This class maintains a data set for training, and provides an interface 478 | to various training algorithms. 479 | 480 | Parameters 481 | ---------- 482 | algorithm : {'lbfgs', 'l2sgd', 'ap', 'pa', 'arow'} 483 | The name of the training algorithm. See :meth:`Trainer.select`. 484 | 485 | params : dict, optional 486 | Training parameters. See :meth:`Trainer.set_params` 487 | and :meth:`Trainer.set`. 488 | 489 | verbose : boolean 490 | Whether to print debug messages during training. Default is True. 491 | 492 | """ 493 | logparser = None 494 | 495 | def _before_train(self): 496 | self.logparser = _logparser.TrainLogParser() 497 | 498 | def message(self, message): 499 | event = self.logparser.feed(message) 500 | 501 | if not self.verbose or event is None: 502 | return 503 | 504 | log = self.logparser.last_log 505 | if event == 'start': 506 | self.on_start(log) 507 | elif event == 'featgen_progress': 508 | self.on_featgen_progress(log, self.logparser.featgen_percent) 509 | elif event == 'featgen_end': 510 | self.on_featgen_end(log) 511 | elif event == 'prepared': 512 | self.on_prepared(log) 513 | elif event == 'prepare_error': 514 | self.on_prepare_error(log) 515 | elif event == 'iteration': 516 | self.on_iteration(log, self.logparser.last_iteration) 517 | elif event == 'optimization_end': 518 | self.on_optimization_end(log) 519 | elif event == 'end': 520 | self.on_end(log) 521 | else: 522 | raise Exception("Unknown event %r" % event) 523 | 524 | def on_start(self, log): 525 | print(log, end='') 526 | 527 | def on_featgen_progress(self, log, percent): 528 | print(log, end='') 529 | 530 | def on_featgen_end(self, log): 531 | print(log, end='') 532 | 533 | def on_prepared(self, log): 534 | print(log, end='') 535 | 536 | def on_prepare_error(self, log): 537 | print(log, end='') 538 | 539 | def on_iteration(self, log, info): 540 | print(log, end='') 541 | 542 | def on_optimization_end(self, log): 543 | print(log, end='') 544 | 545 | def on_end(self, log): 546 
cdef class Tagger(object):
    """
    The tagger class.

    This class provides the functionality for predicting label sequences for
    input sequences using a model.
    """
    # Wrapped C++ tagger instance; owns the loaded model.
    cdef crfsuite_api.Tagger c_tagger

    def open(self, name):
        """
        Open a model file.

        Parameters
        ----------
        name : string
            The file name of the model file.

        Returns
        -------
        context manager
            ``contextlib.closing(self)``, so the tagger can be used in
            a ``with`` statement.

        Raises
        ------
        ValueError
            If the file is missing, truncated, or not a CRFsuite model.
        """
        # We need to do some basic checks ourselves because crfsuite
        # may segfault if the file is invalid.
        # See https://github.com/chokkan/crfsuite/pull/24
        self._check_model(name)
        if not self.c_tagger.open(name):
            raise ValueError("Error opening model file %r" % name)
        return contextlib.closing(self)

    def open_inmemory(self, bytes value):
        """
        Open a model from memory.

        Parameters
        ----------
        value : bytes
            Binary model data (content of a file saved by Trainer.train).

        Returns
        -------
        context manager
            ``contextlib.closing(self)``, so the tagger can be used in
            a ``with`` statement.

        Raises
        ------
        ValueError
            If the data is not a valid CRFsuite model.
        """
        self._check_inmemory_model(value)
        cdef const char *v = value
        if not self.c_tagger.open(v, len(value)):
            raise ValueError("Error opening model")
        return contextlib.closing(self)

    def close(self):
        """
        Close the model.
        """
        self.c_tagger.close()

    def labels(self):
        """
        Obtain the list of labels.

        Returns
        -------
        list of strings
            The list of labels in the model.
        """
        return self.c_tagger.labels()

    def tag(self, xseq=None):
        """
        Predict the label sequence for the item sequence.

        Parameters
        ----------
        xseq : item sequence, optional
            The item sequence. If omitted, the current sequence is used
            (a sequence set using :meth:`Tagger.set` method or
            a sequence used in a previous :meth:`Tagger.tag` call).

            ``xseq`` should be a list of item features or
            an :class:`~ItemSequence` instance. Allowed item features formats
            are the same as described in :class:`~ItemSequence` docs.

        Returns
        -------
        list of strings
            The label sequence predicted.
        """
        if xseq is not None:
            self.set(xseq)

        return self.c_tagger.viterbi()

    def probability(self, yseq):
        """
        Compute the probability of the label sequence for the current input
        sequence (a sequence set using :meth:`Tagger.set` method or
        a sequence used in a previous :meth:`Tagger.tag` call).

        Parameters
        ----------
        yseq : list of strings
            The label sequence.

        Returns
        -------
        float
            The probability ``P(yseq|xseq)``.
        """
        return self.c_tagger.probability(yseq)

    def marginal(self, y, pos):
        """
        Compute the marginal probability of the label ``y`` at position ``pos``
        for the current input sequence (i.e. a sequence set using
        :meth:`Tagger.set` method or a sequence used in a previous
        :meth:`Tagger.tag` call).

        Parameters
        ----------
        y : string
            The label.
        pos : int
            The position of the label.

        Returns
        -------
        float
            The marginal probability of the label ``y`` at position ``pos``.
        """
        return self.c_tagger.marginal(y, pos)

    cpdef extern set(self, xseq) except +:
        """
        Set an instance (item sequence) for future calls of
        :meth:`Tagger.tag`, :meth:`Tagger.probability`
        and :meth:`Tagger.marginal` methods.

        Parameters
        ----------
        xseq : item sequence
            The item sequence of the instance. ``xseq`` should be a list of
            item features or an :class:`~ItemSequence` instance.
            Allowed item features formats are the same as described
            in :class:`~ItemSequence` docs.

        """
        self.c_tagger.set(to_seq(xseq))

    def dump(self, filename=None):
        """
        Dump a CRF model in plain-text format.

        Parameters
        ----------
        filename : string, optional
            File name to dump the model to.
            If None, the model is dumped to stdout.
        """
        if filename is None:
            # Duplicate the fd: Tagger::dump closes the descriptor it is
            # given, and we must not close the real stdout.
            self.c_tagger.dump(os.dup(sys.stdout.fileno()))
        else:
            # O_TRUNC is required: without it, dumping over an existing
            # longer file left stale trailing bytes after the new dump.
            # Mode 0o666 (masked by umask) avoids creating the dump file
            # with the executable bit set.
            fd = os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o666)
            try:
                self.c_tagger.dump(fd)
            finally:
                try:
                    os.close(fd)
                except OSError:
                    pass  # already closed by Tagger::dump

    def info(self):
        """
        Return a :class:`~.ParsedDump` structure with model internal information.
        """
        parser = _dumpparser.CRFsuiteDumpParser()
        # Dump to a temp file, then parse it line by line; the model dump
        # can be large, so it is not held in memory as a single string.
        fd, name = tempfile.mkstemp()
        try:
            self.c_tagger.dump(fd)
            with open(name, 'rb') as f:
                for line in f:
                    parser.feed(line.decode('utf8'))
        finally:
            try:
                os.unlink(name)
            except OSError:
                pass
        return parser.result

    def _check_model(self, name):
        # See https://github.com/chokkan/crfsuite/pull/24
        # 1. Check that the file can be opened.
        with open(name, 'rb') as f:

            # 2. Check that file magic is correct.
            magic = f.read(4)
            if magic != b'lCRF':
                raise ValueError("Invalid model file %r" % name)

            # 3. Make sure crfsuite won't read past allocated memory
            # in case of incomplete header.
            f.seek(0, os.SEEK_END)
            size = f.tell()
            if size <= 48:  # header size
                raise ValueError("Model file %r doesn't have a complete header" % name)

    def _check_inmemory_model(self, bytes value):
        # Same sanity checks as _check_model, for in-memory model data.
        magic = value[:4]
        if magic != b'lCRF':
            raise ValueError("Invalid model")

        if len(value) < 48:
            raise ValueError("Invalid model: incomplete header")
36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "[u'esp.testa',\n", 49 | " u'esp.testb',\n", 50 | " u'esp.train',\n", 51 | " u'ned.testa',\n", 52 | " u'ned.testb',\n", 53 | " u'ned.train']" 54 | ] 55 | }, 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "nltk.corpus.conll2002.fileids()" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": { 69 | "collapsed": false 70 | }, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "CPU times: user 2.42 s, sys: 70.4 ms, total: 2.49 s\n", 77 | "Wall time: 2.55 s\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "%%time\n", 83 | "train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))\n", 84 | "test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "Data format:" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "[(u'Melbourne', u'NP', u'B-LOC'),\n", 105 | " (u'(', u'Fpa', u'O'),\n", 106 | " (u'Australia', u'NP', u'B-LOC'),\n", 107 | " (u')', u'Fpt', u'O'),\n", 108 | " (u',', u'Fc', u'O'),\n", 109 | " (u'25', u'Z', u'O'),\n", 110 | " (u'may', u'NC', u'O'),\n", 111 | " (u'(', u'Fpa', u'O'),\n", 112 | " (u'EFE', u'NC', u'B-ORG'),\n", 113 | " (u')', u'Fpt', u'O'),\n", 114 | " (u'.', u'Fp', u'O')]" 115 | ] 116 | }, 117 | "execution_count": 4, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "train_sents[0]" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Features\n", 131 | 
"\n", 132 | "Next, define some features. In this example we use word identity, word suffix, word shape and word POS tag; also, some information from nearby words is used. \n", 133 | "\n", 134 | "This makes a simple baseline, but you certainly can add and remove some features to get (much?) better results - experiment with it." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 5, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "def word2features(sent, i):\n", 146 | " word = sent[i][0]\n", 147 | " postag = sent[i][1]\n", 148 | " features = [\n", 149 | " 'bias',\n", 150 | " 'word.lower=' + word.lower(),\n", 151 | " 'word[-3:]=' + word[-3:],\n", 152 | " 'word[-2:]=' + word[-2:],\n", 153 | " 'word.isupper=%s' % word.isupper(),\n", 154 | " 'word.istitle=%s' % word.istitle(),\n", 155 | " 'word.isdigit=%s' % word.isdigit(),\n", 156 | " 'postag=' + postag,\n", 157 | " 'postag[:2]=' + postag[:2],\n", 158 | " ]\n", 159 | " if i > 0:\n", 160 | " word1 = sent[i-1][0]\n", 161 | " postag1 = sent[i-1][1]\n", 162 | " features.extend([\n", 163 | " '-1:word.lower=' + word1.lower(),\n", 164 | " '-1:word.istitle=%s' % word1.istitle(),\n", 165 | " '-1:word.isupper=%s' % word1.isupper(),\n", 166 | " '-1:postag=' + postag1,\n", 167 | " '-1:postag[:2]=' + postag1[:2],\n", 168 | " ])\n", 169 | " else:\n", 170 | " features.append('BOS')\n", 171 | " \n", 172 | " if i < len(sent)-1:\n", 173 | " word1 = sent[i+1][0]\n", 174 | " postag1 = sent[i+1][1]\n", 175 | " features.extend([\n", 176 | " '+1:word.lower=' + word1.lower(),\n", 177 | " '+1:word.istitle=%s' % word1.istitle(),\n", 178 | " '+1:word.isupper=%s' % word1.isupper(),\n", 179 | " '+1:postag=' + postag1,\n", 180 | " '+1:postag[:2]=' + postag1[:2],\n", 181 | " ])\n", 182 | " else:\n", 183 | " features.append('EOS')\n", 184 | " \n", 185 | " return features\n", 186 | "\n", 187 | "\n", 188 | "def sent2features(sent):\n", 189 | " return [word2features(sent, i) for 
i in range(len(sent))]\n", 190 | "\n", 191 | "def sent2labels(sent):\n", 192 | " return [label for token, postag, label in sent]\n", 193 | "\n", 194 | "def sent2tokens(sent):\n", 195 | " return [token for token, postag, label in sent] " 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "This is what word2features extracts:" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 6, 208 | "metadata": { 209 | "collapsed": false 210 | }, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/plain": [ 215 | "['bias',\n", 216 | " u'word.lower=melbourne',\n", 217 | " u'word[-3:]=rne',\n", 218 | " u'word[-2:]=ne',\n", 219 | " 'word.isupper=False',\n", 220 | " 'word.istitle=True',\n", 221 | " 'word.isdigit=False',\n", 222 | " u'postag=NP',\n", 223 | " u'postag[:2]=NP',\n", 224 | " 'BOS',\n", 225 | " u'+1:word.lower=(',\n", 226 | " '+1:word.istitle=False',\n", 227 | " '+1:word.isupper=False',\n", 228 | " u'+1:postag=Fpa',\n", 229 | " u'+1:postag[:2]=Fp']" 230 | ] 231 | }, 232 | "execution_count": 6, 233 | "metadata": {}, 234 | "output_type": "execute_result" 235 | } 236 | ], 237 | "source": [ 238 | "sent2features(train_sents[0])[0]" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "Extract the features from the data:" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 7, 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [ 255 | { 256 | "name": "stdout", 257 | "output_type": "stream", 258 | "text": [ 259 | "CPU times: user 2.24 s, sys: 287 ms, total: 2.53 s\n", 260 | "Wall time: 2.53 s\n" 261 | ] 262 | } 263 | ], 264 | "source": [ 265 | "%%time\n", 266 | "X_train = [sent2features(s) for s in train_sents]\n", 267 | "y_train = [sent2labels(s) for s in train_sents]\n", 268 | "\n", 269 | "X_test = [sent2features(s) for s in test_sents]\n", 270 | "y_test = [sent2labels(s) for s in test_sents]" 271 | ] 272 | 
}, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "## Train the model\n", 278 | "\n", 279 | "To train the model, we create pycrfsuite.Trainer, load the training data and call 'train' method. \n", 280 | "First, create pycrfsuite.Trainer and load the training data to CRFsuite:" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 8, 286 | "metadata": { 287 | "collapsed": false 288 | }, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "CPU times: user 3.48 s, sys: 90.2 ms, total: 3.57 s\n", 295 | "Wall time: 3.56 s\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "%%time\n", 301 | "trainer = pycrfsuite.Trainer(verbose=False)\n", 302 | "\n", 303 | "for xseq, yseq in zip(X_train, y_train):\n", 304 | " trainer.append(xseq, yseq)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "Set training parameters. We will use L-BFGS training algorithm (it is default) with Elastic Net (L1 + L2) regularization." 
312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 9, 317 | "metadata": { 318 | "collapsed": false 319 | }, 320 | "outputs": [], 321 | "source": [ 322 | "trainer.set_params({\n", 323 | " 'c1': 1.0, # coefficient for L1 penalty\n", 324 | " 'c2': 1e-3, # coefficient for L2 penalty\n", 325 | " 'max_iterations': 50, # stop earlier\n", 326 | "\n", 327 | " # include transitions that are possible, but not observed\n", 328 | " 'feature.possible_transitions': True\n", 329 | "})" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "Possible parameters for the default training algorithm:" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 10, 342 | "metadata": { 343 | "collapsed": false 344 | }, 345 | "outputs": [ 346 | { 347 | "data": { 348 | "text/plain": [ 349 | "['feature.minfreq',\n", 350 | " 'feature.possible_states',\n", 351 | " 'feature.possible_transitions',\n", 352 | " 'c1',\n", 353 | " 'c2',\n", 354 | " 'max_iterations',\n", 355 | " 'num_memories',\n", 356 | " 'epsilon',\n", 357 | " 'period',\n", 358 | " 'delta',\n", 359 | " 'linesearch',\n", 360 | " 'max_linesearch']" 361 | ] 362 | }, 363 | "execution_count": 10, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "trainer.params()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "Train the model:" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 11, 382 | "metadata": { 383 | "collapsed": false 384 | }, 385 | "outputs": [ 386 | { 387 | "name": "stdout", 388 | "output_type": "stream", 389 | "text": [ 390 | "CPU times: user 18.8 s, sys: 102 ms, total: 18.9 s\n", 391 | "Wall time: 19.2 s\n" 392 | ] 393 | } 394 | ], 395 | "source": [ 396 | "%%time\n", 397 | "trainer.train('conll2002-esp.crfsuite')" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | 
"source": [ 404 | "trainer.train saves model to a file:" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 12, 410 | "metadata": { 411 | "collapsed": false 412 | }, 413 | "outputs": [ 414 | { 415 | "name": "stdout", 416 | "output_type": "stream", 417 | "text": [ 418 | "-rw-r--r-- 1 gsh25 staff 600K Jun 22 14:56 ./conll2002-esp.crfsuite\r\n" 419 | ] 420 | } 421 | ], 422 | "source": [ 423 | "!ls -lh ./conll2002-esp.crfsuite" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "We can also get information about the final state of the model by looking at the trainer's logparser. If we had tagged our input data using the optional group argument in add, and had used the optional holdout argument during train, there would be information about the trainer's performance on the holdout set as well. " 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 13, 436 | "metadata": { 437 | "collapsed": false 438 | }, 439 | "outputs": [ 440 | { 441 | "data": { 442 | "text/plain": [ 443 | "{'active_features': 11346,\n", 444 | " 'error_norm': 1262.912078,\n", 445 | " 'feature_norm': 79.110017,\n", 446 | " 'linesearch_step': 1.0,\n", 447 | " 'linesearch_trials': 1,\n", 448 | " 'loss': 14807.577946,\n", 449 | " 'num': 50,\n", 450 | " 'scores': {},\n", 451 | " 'time': 0.342}" 452 | ] 453 | }, 454 | "execution_count": 13, 455 | "metadata": {}, 456 | "output_type": "execute_result" 457 | } 458 | ], 459 | "source": [ 460 | "trainer.logparser.last_iteration" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "We can also get this information for every step using trainer.logparser.iterations" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 15, 473 | "metadata": { 474 | "collapsed": false 475 | }, 476 | "outputs": [ 477 | { 478 | "name": "stdout", 479 | "output_type": "stream", 480 | "text": [ 481 | "50 {'loss': 
14807.577946, 'error_norm': 1262.912078, 'linesearch_trials': 1, 'active_features': 11346, 'num': 50, 'time': 0.342, 'scores': {}, 'linesearch_step': 1.0, 'feature_norm': 79.110017}\n" 482 | ] 483 | } 484 | ], 485 | "source": [ 486 | "print len(trainer.logparser.iterations), trainer.logparser.iterations[-1]" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "## Make predictions\n", 494 | "\n", 495 | "To use the trained model, create pycrfsuite.Tagger, open the model and use \"tag\" method:" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 13, 501 | "metadata": { 502 | "collapsed": false 503 | }, 504 | "outputs": [ 505 | { 506 | "data": { 507 | "text/plain": [ 508 | "" 509 | ] 510 | }, 511 | "execution_count": 13, 512 | "metadata": {}, 513 | "output_type": "execute_result" 514 | } 515 | ], 516 | "source": [ 517 | "tagger = pycrfsuite.Tagger()\n", 518 | "tagger.open('conll2002-esp.crfsuite')" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "Let's tag a sentence to see how it works:" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 14, 531 | "metadata": { 532 | "collapsed": false 533 | }, 534 | "outputs": [ 535 | { 536 | "name": "stdout", 537 | "output_type": "stream", 538 | "text": [ 539 | "La Coruña , 23 may ( EFECOM ) .\n", 540 | "\n", 541 | "Predicted: B-LOC I-LOC O O O O B-ORG O O\n", 542 | "Correct: B-LOC I-LOC O O O O B-ORG O O\n" 543 | ] 544 | } 545 | ], 546 | "source": [ 547 | "example_sent = test_sents[0]\n", 548 | "print(' '.join(sent2tokens(example_sent)), end='\\n\\n')\n", 549 | "\n", 550 | "print(\"Predicted:\", ' '.join(tagger.tag(sent2features(example_sent))))\n", 551 | "print(\"Correct: \", ' '.join(sent2labels(example_sent)))" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "## Evaluate the model" 559 | ] 560 | }, 561 | { 562 | 
"cell_type": "code", 563 | "execution_count": 15, 564 | "metadata": { 565 | "collapsed": false 566 | }, 567 | "outputs": [], 568 | "source": [ 569 | "def bio_classification_report(y_true, y_pred):\n", 570 | " \"\"\"\n", 571 | " Classification report for a list of BIO-encoded sequences.\n", 572 | " It computes token-level metrics and discards \"O\" labels.\n", 573 | " \n", 574 | " Note that it requires scikit-learn 0.15+ (or a version from github master)\n", 575 | " to calculate averages properly!\n", 576 | " \"\"\"\n", 577 | " lb = LabelBinarizer()\n", 578 | " y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))\n", 579 | " y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))\n", 580 | " \n", 581 | " tagset = set(lb.classes_) - {'O'}\n", 582 | " tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])\n", 583 | " class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}\n", 584 | " \n", 585 | " return classification_report(\n", 586 | " y_true_combined,\n", 587 | " y_pred_combined,\n", 588 | " labels = [class_indices[cls] for cls in tagset],\n", 589 | " target_names = tagset,\n", 590 | " )" 591 | ] 592 | }, 593 | { 594 | "cell_type": "markdown", 595 | "metadata": {}, 596 | "source": [ 597 | "Predict entity labels for all sentences in our testing set ('testb' Spanish data):" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 16, 603 | "metadata": { 604 | "collapsed": false 605 | }, 606 | "outputs": [ 607 | { 608 | "name": "stdout", 609 | "output_type": "stream", 610 | "text": [ 611 | "CPU times: user 598 ms, sys: 17.4 ms, total: 616 ms\n", 612 | "Wall time: 615 ms\n" 613 | ] 614 | } 615 | ], 616 | "source": [ 617 | "%%time\n", 618 | "y_pred = [tagger.tag(xseq) for xseq in X_test]" 619 | ] 620 | }, 621 | { 622 | "cell_type": "markdown", 623 | "metadata": {}, 624 | "source": [ 625 | "..and check the result. 
Note this report is not comparable to results in CONLL2002 papers because here we check per-token results (not per-entity). Per-entity numbers will be worse. " 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 17, 631 | "metadata": { 632 | "collapsed": false 633 | }, 634 | "outputs": [ 635 | { 636 | "name": "stdout", 637 | "output_type": "stream", 638 | "text": [ 639 | " precision recall f1-score support\n", 640 | "\n", 641 | " B-LOC 0.78 0.75 0.76 1084\n", 642 | " I-LOC 0.87 0.93 0.90 634\n", 643 | " B-MISC 0.69 0.47 0.56 339\n", 644 | " I-MISC 0.87 0.93 0.90 634\n", 645 | " B-ORG 0.82 0.87 0.84 735\n", 646 | " I-ORG 0.87 0.93 0.90 634\n", 647 | " B-PER 0.61 0.49 0.54 557\n", 648 | " I-PER 0.87 0.93 0.90 634\n", 649 | "\n", 650 | "avg / total 0.81 0.81 0.80 5251\n", 651 | "\n" 652 | ] 653 | } 654 | ], 655 | "source": [ 656 | "print(bio_classification_report(y_test, y_pred))" 657 | ] 658 | }, 659 | { 660 | "cell_type": "markdown", 661 | "metadata": {}, 662 | "source": [ 663 | "## Let's check what classifier learned" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": 18, 669 | "metadata": { 670 | "collapsed": false 671 | }, 672 | "outputs": [ 673 | { 674 | "name": "stdout", 675 | "output_type": "stream", 676 | "text": [ 677 | "Top likely transitions:\n", 678 | "B-ORG -> I-ORG 8.631963\n", 679 | "I-ORG -> I-ORG 7.833706\n", 680 | "B-PER -> I-PER 6.998706\n", 681 | "B-LOC -> I-LOC 6.913675\n", 682 | "I-MISC -> I-MISC 6.129735\n", 683 | "B-MISC -> I-MISC 5.538291\n", 684 | "I-LOC -> I-LOC 4.983567\n", 685 | "I-PER -> I-PER 3.748358\n", 686 | "B-ORG -> B-LOC 1.727090\n", 687 | "B-PER -> B-LOC 1.388267\n", 688 | "B-LOC -> B-LOC 1.240278\n", 689 | "O -> O 1.197929\n", 690 | "O -> B-ORG 1.097062\n", 691 | "I-PER -> B-LOC 1.083332\n", 692 | "O -> B-MISC 1.046113\n", 693 | "\n", 694 | "Top unlikely transitions:\n", 695 | "I-PER -> B-ORG -2.056130\n", 696 | "I-LOC -> I-ORG -2.143940\n", 697 | "B-ORG -> I-MISC -2.167501\n", 
698 | "I-PER -> I-ORG -2.369380\n", 699 | "B-ORG -> I-PER -2.378110\n", 700 | "I-MISC -> I-PER -2.458788\n", 701 | "B-LOC -> I-PER -2.516414\n", 702 | "I-ORG -> I-MISC -2.571973\n", 703 | "I-LOC -> B-PER -2.697791\n", 704 | "I-LOC -> I-PER -3.065950\n", 705 | "I-ORG -> I-PER -3.364434\n", 706 | "O -> I-PER -7.322841\n", 707 | "O -> I-MISC -7.648246\n", 708 | "O -> I-ORG -8.024126\n", 709 | "O -> I-LOC -8.333815\n" 710 | ] 711 | } 712 | ], 713 | "source": [ 714 | "from collections import Counter\n", 715 | "info = tagger.info()\n", 716 | "\n", 717 | "def print_transitions(trans_features):\n", 718 | " for (label_from, label_to), weight in trans_features:\n", 719 | " print(\"%-6s -> %-7s %0.6f\" % (label_from, label_to, weight))\n", 720 | "\n", 721 | "print(\"Top likely transitions:\")\n", 722 | "print_transitions(Counter(info.transitions).most_common(15))\n", 723 | "\n", 724 | "print(\"\\nTop unlikely transitions:\")\n", 725 | "print_transitions(Counter(info.transitions).most_common()[-15:])" 726 | ] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "metadata": {}, 731 | "source": [ 732 | "We can see that, for example, it is very likely that the beginning of an organization name (B-ORG) will be followed by a token inside organization name (I-ORG), but transitions to I-ORG from tokens with other labels are penalized. 
Also note I-PER -> B-LOC transition: a positive weight means that model thinks that a person name is often followed by a location.\n", 733 | "\n", 734 | "Check the state features:" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": 19, 740 | "metadata": { 741 | "collapsed": false 742 | }, 743 | "outputs": [ 744 | { 745 | "name": "stdout", 746 | "output_type": "stream", 747 | "text": [ 748 | "Top positive:\n", 749 | "8.886516 B-ORG word.lower=efe-cantabria\n", 750 | "8.743642 B-ORG word.lower=psoe-progresistas\n", 751 | "5.769032 B-LOC -1:word.lower=cantabria\n", 752 | "5.195429 I-LOC -1:word.lower=calle\n", 753 | "5.116821 O word.lower=mayo\n", 754 | "4.990871 O -1:word.lower=día\n", 755 | "4.910915 I-ORG -1:word.lower=l\n", 756 | "4.721572 B-MISC word.lower=diversia\n", 757 | "4.676259 B-ORG word.lower=telefónica\n", 758 | "4.334354 B-ORG word[-2:]=-e\n", 759 | "4.149862 B-ORG word.lower=amena\n", 760 | "4.141370 B-ORG word.lower=terra\n", 761 | "3.942852 O word.istitle=False\n", 762 | "3.926397 B-ORG word.lower=continente\n", 763 | "3.924672 B-ORG word.lower=acesa\n", 764 | "3.888706 O word.lower=euro\n", 765 | "3.856445 B-PER -1:word.lower=según\n", 766 | "3.812373 B-MISC word.lower=exteriores\n", 767 | "3.807582 I-MISC -1:word.lower=1.9\n", 768 | "3.807098 B-MISC word.lower=sanidad\n", 769 | "\n", 770 | "Top negative:\n", 771 | "-1.965379 O word.lower=fundación\n", 772 | "-1.981541 O -1:word.lower=británica\n", 773 | "-2.118347 O word.lower=061\n", 774 | "-2.190653 B-PER word[-3:]=nes\n", 775 | "-2.226373 B-ORG postag=SP\n", 776 | "-2.226373 B-ORG postag[:2]=SP\n", 777 | "-2.260972 O word[-3:]=uia\n", 778 | "-2.384920 O -1:word.lower=sección\n", 779 | "-2.483009 O word[-2:]=s.\n", 780 | "-2.535050 I-LOC BOS\n", 781 | "-2.583123 O -1:word.lower=sánchez\n", 782 | "-2.585756 O postag[:2]=NP\n", 783 | "-2.585756 O postag=NP\n", 784 | "-2.588899 O word[-2:]=om\n", 785 | "-2.738583 O -1:word.lower=carretera\n", 786 | "-2.913103 O 
word.istitle=True\n", 787 | "-2.926560 O word[-2:]=nd\n", 788 | "-2.946862 I-PER -1:word.lower=san\n", 789 | "-2.954094 B-PER -1:word.lower=del\n", 790 | "-3.529449 O word.isupper=True\n" 791 | ] 792 | } 793 | ], 794 | "source": [ 795 | "def print_state_features(state_features):\n", 796 | " for (attr, label), weight in state_features:\n", 797 | " print(\"%0.6f %-6s %s\" % (weight, label, attr)) \n", 798 | "\n", 799 | "print(\"Top positive:\")\n", 800 | "print_state_features(Counter(info.state_features).most_common(20))\n", 801 | "\n", 802 | "print(\"\\nTop negative:\")\n", 803 | "print_state_features(Counter(info.state_features).most_common()[-20:])" 804 | ] 805 | }, 806 | { 807 | "cell_type": "markdown", 808 | "metadata": {}, 809 | "source": [ 810 | "Some observations:\n", 811 | "\n", 812 | "* **8.743642 B-ORG word.lower=psoe-progresistas** - the model remembered names of some entities - maybe it is overfit, or maybe our features are not adequate, or maybe remembering is indeed helpful;\n", 813 | "* **5.195429 I-LOC -1:word.lower=calle**: \"calle\" is a street in Spanish; model learns that if a previous word was \"calle\" then the token is likely a part of location;\n", 814 | "* **-3.529449 O word.isupper=True**, ** -2.913103 O word.istitle=True **: UPPERCASED or TitleCased words are likely entities of some kind;\n", 815 | "* **-2.585756 O postag=NP** - proper nouns (NP is a proper noun in the Spanish tagset) are often entities." 816 | ] 817 | }, 818 | { 819 | "cell_type": "markdown", 820 | "metadata": {}, 821 | "source": [ 822 | "## What to do next\n", 823 | "\n", 824 | "1. Load 'testa' Spanish data.\n", 825 | "2. Use it to develop better features and to find best model parameters.\n", 826 | "3. Apply the model to 'testb' data again.\n", 827 | "\n", 828 | "The model in this notebook is just a starting point; you certainly can do better!" 
829 | ] 830 | } 831 | ], 832 | "metadata": { 833 | "kernelspec": { 834 | "display_name": "Python 2", 835 | "language": "python", 836 | "name": "python2" 837 | }, 838 | "language_info": { 839 | "codemirror_mode": { 840 | "name": "ipython", 841 | "version": 2 842 | }, 843 | "file_extension": ".py", 844 | "mimetype": "text/x-python", 845 | "name": "python", 846 | "nbconvert_exporter": "python", 847 | "pygments_lexer": "ipython2", 848 | "version": "2.7.11" 849 | } 850 | }, 851 | "nbformat": 4, 852 | "nbformat_minor": 0 853 | } 854 | -------------------------------------------------------------------------------- /tests/test_logparser.py: -------------------------------------------------------------------------------- 1 | from pycrfsuite._logparser import TrainLogParser # noqa: F401 2 | 3 | 4 | def _apply_parser(parser, log): 5 | for line in log: 6 | event = parser.feed(line) 7 | if event and event != "featgen_progress": 8 | print(parser.last_log, end="") 9 | print("============== " + event) 10 | 11 | 12 | log1 = [ 13 | "Holdout group: 2\n", 14 | "\n", 15 | "Feature generation\n", 16 | "type: CRF1d\n", 17 | "feature.minfreq: 0.000000\n", 18 | "feature.possible_states: 0\n", 19 | "feature.possible_transitions: 1\n", 20 | "0", 21 | ".", 22 | ".", 23 | ".", 24 | ".", 25 | "1", 26 | ".", 27 | ".", 28 | ".", 29 | ".", 30 | "2", 31 | ".", 32 | ".", 33 | ".", 34 | ".", 35 | "3", 36 | ".", 37 | ".", 38 | ".", 39 | ".", 40 | "4", 41 | ".", 42 | ".", 43 | ".", 44 | ".", 45 | "5", 46 | ".", 47 | ".", 48 | ".", 49 | ".", 50 | "6", 51 | ".", 52 | ".", 53 | ".", 54 | ".", 55 | "7", 56 | ".", 57 | ".", 58 | ".", 59 | ".", 60 | "8", 61 | ".", 62 | ".", 63 | ".", 64 | ".", 65 | "9", 66 | ".", 67 | ".", 68 | ".", 69 | ".", 70 | "10", 71 | "\n", 72 | "Number of features: 3948\n", 73 | "Seconds required: 0.022\n", 74 | "\n", 75 | "L-BFGS optimization\n", 76 | "c1: 1.000000\n", 77 | "c2: 0.001000\n", 78 | "num_memories: 6\n", 79 | "max_iterations: 5\n", 80 | "epsilon: 0.000010\n", 81 | 
"stop: 10\n", 82 | "delta: 0.000010\n", 83 | "linesearch: MoreThuente\n", 84 | "linesearch.max_iterations: 20\n", 85 | "\n", 86 | "***** Iteration #1 *****\n", 87 | "Loss: 1450.519004\n", 88 | "Feature norm: 1.000000\n", 89 | "Error norm: 713.784994\n", 90 | "Active features: 1794\n", 91 | "Line search trials: 1\n", 92 | "Line search step: 0.000228\n", 93 | "Seconds required for this iteration: 0.008\n", 94 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 95 | " B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000)\n", 96 | " O: (306, 339, 306) (0.9027, 1.0000, 0.9488)\n", 97 | " B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000)\n", 98 | " B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000)\n", 99 | " I-PER: (0, 0, 4) (0.0000, 0.0000, 0.0000)\n", 100 | " B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 101 | " I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 102 | " I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000)\n", 103 | " I-MISC: (0, 0, 0) (******, ******, ******)\n", 104 | "Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426)\n", 105 | "Item accuracy: 306 / 339 (0.9027)\n", 106 | "Instance accuracy: 3 / 10 (0.3000)\n", 107 | "\n", 108 | "***** Iteration #2 *****\n", 109 | "Loss: 1363.687719\n", 110 | "Feature norm: 1.178396\n", 111 | "Error norm: 370.827506\n", 112 | "Active features: 1540\n", 113 | "Line search trials: 1\n", 114 | "Line search step: 1.000000\n", 115 | "Seconds required for this iteration: 0.004\n", 116 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 117 | " B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000)\n", 118 | " O: (306, 339, 306) (0.9027, 1.0000, 0.9488)\n", 119 | " B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000)\n", 120 | " B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000)\n", 121 | " I-PER: (0, 0, 4) (0.0000, 0.0000, 0.0000)\n", 122 | " B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 123 | " I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 124 | " I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000)\n", 125 | " I-MISC: (0, 0, 0) (******, 
******, ******)\n", 126 | "Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426)\n", 127 | "Item accuracy: 306 / 339 (0.9027)\n", 128 | "Instance accuracy: 3 / 10 (0.3000)\n", 129 | "\n", 130 | "***** Iteration #3 *****\n", 131 | "Loss: 1309.171814\n", 132 | "Feature norm: 1.266322\n", 133 | "Error norm: 368.739493\n", 134 | "Active features: 1308\n", 135 | "Line search trials: 1\n", 136 | "Line search step: 1.000000\n", 137 | "Seconds required for this iteration: 0.003\n", 138 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 139 | " B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000)\n", 140 | " O: (306, 339, 306) (0.9027, 1.0000, 0.9488)\n", 141 | " B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000)\n", 142 | " B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000)\n", 143 | " I-PER: (0, 0, 4) (0.0000, 0.0000, 0.0000)\n", 144 | " B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 145 | " I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 146 | " I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000)\n", 147 | " I-MISC: (0, 0, 0) (******, ******, ******)\n", 148 | "Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426)\n", 149 | "Item accuracy: 306 / 339 (0.9027)\n", 150 | "Instance accuracy: 3 / 10 (0.3000)\n", 151 | "\n", 152 | "***** Iteration #4 *****\n", 153 | "Loss: 1019.561634\n", 154 | "Feature norm: 1.929814\n", 155 | "Error norm: 202.976154\n", 156 | "Active features: 1127\n", 157 | "Line search trials: 1\n", 158 | "Line search step: 1.000000\n", 159 | "Seconds required for this iteration: 0.003\n", 160 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 161 | " B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000)\n", 162 | " O: (306, 339, 306) (0.9027, 1.0000, 0.9488)\n", 163 | " B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000)\n", 164 | " B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000)\n", 165 | " I-PER: (0, 0, 4) (0.0000, 0.0000, 0.0000)\n", 166 | " B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 167 | " I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 
168 | " I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000)\n", 169 | " I-MISC: (0, 0, 0) (******, ******, ******)\n", 170 | "Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426)\n", 171 | "Item accuracy: 306 / 339 (0.9027)\n", 172 | "Instance accuracy: 3 / 10 (0.3000)\n", 173 | "\n", 174 | "***** Iteration #5 *****\n", 175 | "Loss: 782.637378\n", 176 | "Feature norm: 3.539391\n", 177 | "Error norm: 121.725020\n", 178 | "Active features: 1035\n", 179 | "Line search trials: 1\n", 180 | "Line search step: 1.000000\n", 181 | "Seconds required for this iteration: 0.003\n", 182 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 183 | " B-LOC: (2, 5, 6) (0.4000, 0.3333, 0.3636)\n", 184 | " O: (305, 318, 306) (0.9591, 0.9967, 0.9776)\n", 185 | " B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000)\n", 186 | " B-PER: (2, 4, 3) (0.5000, 0.6667, 0.5714)\n", 187 | " I-PER: (4, 12, 4) (0.3333, 1.0000, 0.5000)\n", 188 | " B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 189 | " I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 190 | " I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000)\n", 191 | " I-MISC: (0, 0, 0) (******, ******, ******)\n", 192 | "Macro-average precision, recall, F1: (0.243606, 0.332970, 0.268070)\n", 193 | "Item accuracy: 313 / 339 (0.9233)\n", 194 | "Instance accuracy: 3 / 10 (0.3000)\n", 195 | "\n", 196 | "L-BFGS terminated with the maximum number of iterations\n", 197 | "Total seconds required for training: 0.022\n", 198 | "\n", 199 | "Storing the model\n", 200 | "Number of active features: 1035 (3948)\n", 201 | "Number of active attributes: 507 (3350)\n", 202 | "Number of active labels: 9 (9)\n", 203 | "Writing labels\n", 204 | "Writing attributes\n", 205 | "Writing feature references for transitions\n", 206 | "Writing feature references for attributes\n", 207 | "Seconds required: 0.003\n", 208 | "\n", 209 | ] 210 | 211 | log2 = [ 212 | "Feature generation\n", # featgen_start 213 | "type: CRF1d\n", 214 | "feature.minfreq: 0.000000\n", 215 | 
"feature.possible_states: 0\n", 216 | "feature.possible_transitions: 1\n", 217 | "0", 218 | ".", 219 | ".", 220 | ".", 221 | ".", # featgen_progress 222 | "1", 223 | ".", 224 | ".", 225 | ".", 226 | ".", 227 | "2", 228 | ".", 229 | ".", 230 | ".", 231 | ".", 232 | "3", 233 | ".", 234 | ".", 235 | ".", 236 | ".", 237 | "4", 238 | ".", 239 | ".", 240 | ".", 241 | ".", 242 | "5", 243 | ".", 244 | ".", 245 | ".", 246 | ".", 247 | "6", 248 | ".", 249 | ".", 250 | ".", 251 | ".", 252 | "7", 253 | ".", 254 | ".", 255 | ".", 256 | ".", 257 | "8", 258 | ".", 259 | ".", 260 | ".", 261 | ".", 262 | "9", 263 | ".", 264 | ".", 265 | ".", 266 | ".", 267 | "10", 268 | "\n", 269 | "Number of features: 4379\n", 270 | "Seconds required: 0.021\n", # featgen_end 271 | "\n", 272 | "Averaged perceptron\n", 273 | "max_iterations: 5\n", 274 | "epsilon: 0.000000\n", 275 | "\n", 276 | "***** Iteration #1 *****\n", # iteration 277 | "Loss: 16.359638\n", 278 | "Feature norm: 112.848688\n", 279 | "Seconds required for this iteration: 0.005\n", # iteration end 280 | "\n", 281 | "***** Iteration #2 *****\n", 282 | "Loss: 12.449970\n", 283 | "Feature norm: 126.174821\n", 284 | "Seconds required for this iteration: 0.004\n", 285 | "\n", 286 | "***** Iteration #3 *****\n", 287 | "Loss: 9.451751\n", 288 | "Feature norm: 145.482678\n", 289 | "Seconds required for this iteration: 0.003\n", 290 | "\n", 291 | "***** Iteration #4 *****\n", 292 | "Loss: 8.652287\n", 293 | "Feature norm: 155.495167\n", 294 | "Seconds required for this iteration: 0.003\n", 295 | "\n", 296 | "***** Iteration #5 *****\n", 297 | "Loss: 7.442703\n", 298 | "Feature norm: 166.818487\n", 299 | "Seconds required for this iteration: 0.002\n", 300 | "\n", 301 | "Total seconds required for training: 0.017\n", # optimization_end 302 | "\n", 303 | "Storing the model\n", # storing_start 304 | "Number of active features: 2265 (4379)\n", 305 | "Number of active attributes: 1299 (3350)\n", 306 | "Number of active labels: 9 (9)\n", 307 | 
"Writing labels\n", 308 | "Writing attributes\n", 309 | "Writing feature references for transitions\n", 310 | "Writing feature references for attributes\n", 311 | "Seconds required: 0.007\n", # storing_end 312 | "\n", # end 313 | ] 314 | 315 | log3 = [ 316 | "Holdout group: 2\n", 317 | "\n", 318 | "Feature generation\n", 319 | "type: CRF1d\n", 320 | "feature.minfreq: 0.000000\n", 321 | "feature.possible_states: 0\n", 322 | "feature.possible_transitions: 1\n", 323 | "0", 324 | ".", 325 | ".", 326 | ".", 327 | ".", 328 | "1", 329 | ".", 330 | ".", 331 | ".", 332 | ".", 333 | "2", 334 | ".", 335 | ".", 336 | ".", 337 | ".", 338 | "3", 339 | ".", 340 | ".", 341 | ".", 342 | ".", 343 | "4", 344 | ".", 345 | ".", 346 | ".", 347 | ".", 348 | "5", 349 | ".", 350 | ".", 351 | ".", 352 | ".", 353 | "6", 354 | ".", 355 | ".", 356 | ".", 357 | ".", 358 | "7", 359 | ".", 360 | ".", 361 | ".", 362 | ".", 363 | "8", 364 | ".", 365 | ".", 366 | ".", 367 | ".", 368 | "9", 369 | ".", 370 | ".", 371 | ".", 372 | ".", 373 | "10", 374 | "\n", 375 | "Number of features: 96180\n", 376 | "Seconds required: 1.263\n", 377 | "\n", 378 | "Stochastic Gradient Descent (SGD)\n", 379 | "c2: 1.000000\n", 380 | "max_iterations: 5\n", 381 | "period: 10\n", 382 | "delta: 0.000001\n", 383 | "\n", 384 | "Calibrating the learning rate (eta)\n", 385 | "calibration.eta: 0.100000\n", 386 | "calibration.rate: 2.000000\n", 387 | "calibration.samples: 1000\n", 388 | "calibration.candidates: 10\n", 389 | "calibration.max_trials: 20\n", 390 | "Initial loss: 69781.655352\n", 391 | "Trial #1 (eta = 0.100000): ", 392 | "12808.890280\n", 393 | "Trial #2 (eta = 0.200000): ", 394 | "26716.801091\n", 395 | "Trial #3 (eta = 0.400000): ", 396 | "51219.321368\n", 397 | "Trial #4 (eta = 0.800000): ", 398 | "104398.795416 (worse)\n", 399 | "Trial #5 (eta = 0.050000): ", 400 | "7804.492475\n", 401 | "Trial #6 (eta = 0.025000): ", 402 | "6419.964967\n", 403 | "Trial #7 (eta = 0.012500): ", 404 | "6989.552193\n", 405 | "Trial 
#8 (eta = 0.006250): ", 406 | "8303.107921\n", 407 | "Trial #9 (eta = 0.003125): ", 408 | "9934.052819\n", 409 | "Trial #10 (eta = 0.001563): ", 410 | "11782.234687\n", 411 | "Trial #11 (eta = 0.000781): ", 412 | "13777.708878\n", 413 | "Trial #12 (eta = 0.000391): ", 414 | "15891.422697\n", 415 | "Trial #13 (eta = 0.000195): ", 416 | "18174.499245\n", 417 | "Trial #14 (eta = 0.000098): ", 418 | "20955.855446\n", 419 | "Best learning rate (eta): 0.025000\n", 420 | "Seconds required: 0.858\n", 421 | "\n", 422 | "***** Epoch #1 *****\n", 423 | "Loss: 36862.915596\n", 424 | "Feature L2-norm: 24.717729\n", 425 | "Learning rate (eta): 0.023810\n", 426 | "Total number of feature updates: 8323\n", 427 | "Seconds required for this iteration: 0.462\n", 428 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 429 | " B-LOC: (778, 1193, 1084) (0.6521, 0.7177, 0.6834)\n", 430 | " O: (45103, 45519, 45355) (0.9909, 0.9944, 0.9926)\n", 431 | " B-ORG: (1003, 1326, 1400) (0.7564, 0.7164, 0.7359)\n", 432 | " B-PER: (583, 764, 735) (0.7631, 0.7932, 0.7779)\n", 433 | " I-PER: (565, 681, 634) (0.8297, 0.8912, 0.8593)\n", 434 | " B-MISC: (76, 181, 339) (0.4199, 0.2242, 0.2923)\n", 435 | " I-ORG: (735, 933, 1104) (0.7878, 0.6658, 0.7216)\n", 436 | " I-LOC: (191, 455, 325) (0.4198, 0.5877, 0.4897)\n", 437 | " I-MISC: (204, 481, 557) (0.4241, 0.3662, 0.3931)\n", 438 | "Macro-average precision, recall, F1: (0.671525, 0.661871, 0.660646)\n", 439 | "Item accuracy: 49238 / 51533 (0.9555)\n", 440 | "Instance accuracy: 852 / 1517 (0.5616)\n", 441 | "\n", 442 | "***** Epoch #2 *****\n", 443 | "Loss: 31176.026308\n", 444 | "Feature L2-norm: 32.274598\n", 445 | "Learning rate (eta): 0.022727\n", 446 | "Total number of feature updates: 16646\n", 447 | "Seconds required for this iteration: 0.466\n", 448 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 449 | " B-LOC: (708, 1018, 1084) (0.6955, 0.6531, 0.6736)\n", 450 | " O: (45101, 45611, 45355) 
(0.9888, 0.9944, 0.9916)\n", 451 | " B-ORG: (1053, 1711, 1400) (0.6154, 0.7521, 0.6770)\n", 452 | " B-PER: (594, 777, 735) (0.7645, 0.8082, 0.7857)\n", 453 | " I-PER: (589, 778, 634) (0.7571, 0.9290, 0.8343)\n", 454 | " B-MISC: (94, 264, 339) (0.3561, 0.2773, 0.3118)\n", 455 | " I-ORG: (384, 468, 1104) (0.8205, 0.3478, 0.4885)\n", 456 | " I-LOC: (166, 285, 325) (0.5825, 0.5108, 0.5443)\n", 457 | " I-MISC: (210, 621, 557) (0.3382, 0.3770, 0.3565)\n", 458 | "Macro-average precision, recall, F1: (0.657608, 0.627752, 0.629257)\n", 459 | "Item accuracy: 48899 / 51533 (0.9489)\n", 460 | "Instance accuracy: 813 / 1517 (0.5359)\n", 461 | "\n", 462 | "***** Epoch #3 *****\n", 463 | "Loss: 23705.719839\n", 464 | "Feature L2-norm: 35.255014\n", 465 | "Learning rate (eta): 0.021739\n", 466 | "Total number of feature updates: 24969\n", 467 | "Seconds required for this iteration: 0.472\n", 468 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 469 | " B-LOC: (808, 1210, 1084) (0.6678, 0.7454, 0.7044)\n", 470 | " O: (45244, 45771, 45355) (0.9885, 0.9976, 0.9930)\n", 471 | " B-ORG: (1061, 1403, 1400) (0.7562, 0.7579, 0.7570)\n", 472 | " B-PER: (588, 728, 735) (0.8077, 0.8000, 0.8038)\n", 473 | " I-PER: (565, 640, 634) (0.8828, 0.8912, 0.8870)\n", 474 | " B-MISC: (86, 130, 339) (0.6615, 0.2537, 0.3667)\n", 475 | " I-ORG: (857, 1148, 1104) (0.7465, 0.7763, 0.7611)\n", 476 | " I-LOC: (152, 282, 325) (0.5390, 0.4677, 0.5008)\n", 477 | " I-MISC: (170, 221, 557) (0.7692, 0.3052, 0.4370)\n", 478 | "Macro-average precision, recall, F1: (0.757699, 0.666091, 0.690108)\n", 479 | "Item accuracy: 49531 / 51533 (0.9612)\n", 480 | "Instance accuracy: 889 / 1517 (0.5860)\n", 481 | "\n", 482 | "***** Epoch #4 *****\n", 483 | "Loss: 21273.137466\n", 484 | "Feature L2-norm: 37.985723\n", 485 | "Learning rate (eta): 0.020833\n", 486 | "Total number of feature updates: 33292\n", 487 | "Seconds required for this iteration: 0.468\n", 488 | "Performance by label (#match, #model, 
#ref) (precision, recall, F1):\n", 489 | " B-LOC: (848, 1276, 1084) (0.6646, 0.7823, 0.7186)\n", 490 | " O: (44212, 44389, 45355) (0.9960, 0.9748, 0.9853)\n", 491 | " B-ORG: (784, 896, 1400) (0.8750, 0.5600, 0.6829)\n", 492 | " B-PER: (582, 686, 735) (0.8484, 0.7918, 0.8191)\n", 493 | " I-PER: (570, 647, 634) (0.8810, 0.8991, 0.8899)\n", 494 | " B-MISC: (166, 619, 339) (0.2682, 0.4897, 0.3466)\n", 495 | " I-ORG: (152, 155, 1104) (0.9806, 0.1377, 0.2415)\n", 496 | " I-LOC: (138, 219, 325) (0.6301, 0.4246, 0.5074)\n", 497 | " I-MISC: (467, 2646, 557) (0.1765, 0.8384, 0.2916)\n", 498 | "Macro-average precision, recall, F1: (0.702269, 0.655374, 0.609212)\n", 499 | "Item accuracy: 47919 / 51533 (0.9299)\n", 500 | "Instance accuracy: 793 / 1517 (0.5227)\n", 501 | "\n", 502 | "***** Epoch #5 *****\n", 503 | "Loss: 20806.661564\n", 504 | "Feature L2-norm: 40.673070\n", 505 | "Learning rate (eta): 0.020000\n", 506 | "Total number of feature updates: 41615\n", 507 | "Seconds required for this iteration: 0.460\n", 508 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 509 | " B-LOC: (689, 892, 1084) (0.7724, 0.6356, 0.6974)\n", 510 | " O: (45171, 45556, 45355) (0.9915, 0.9959, 0.9937)\n", 511 | " B-ORG: (1214, 1931, 1400) (0.6287, 0.8671, 0.7289)\n", 512 | " B-PER: (529, 574, 735) (0.9216, 0.7197, 0.8083)\n", 513 | " I-PER: (520, 553, 634) (0.9403, 0.8202, 0.8762)\n", 514 | " B-MISC: (77, 96, 339) (0.8021, 0.2271, 0.3540)\n", 515 | " I-ORG: (1009, 1678, 1104) (0.6013, 0.9139, 0.7254)\n", 516 | " I-LOC: (126, 182, 325) (0.6923, 0.3877, 0.4970)\n", 517 | " I-MISC: (57, 71, 557) (0.8028, 0.1023, 0.1815)\n", 518 | "Macro-average precision, recall, F1: (0.794790, 0.629970, 0.651378)\n", 519 | "Item accuracy: 49392 / 51533 (0.9585)\n", 520 | "Instance accuracy: 885 / 1517 (0.5834)\n", 521 | "\n", 522 | "SGD terminated with the maximum number of iterations\n", 523 | "Loss: 20806.661564\n", 524 | "Total seconds required for training: 3.350\n", 525 | "\n", 526 
| "Storing the model\n", 527 | "Number of active features: 96180 (96180)\n", 528 | "Number of active attributes: 76691 (83593)\n", 529 | "Number of active labels: 9 (9)\n", 530 | "Writing labels\n", 531 | "Writing attributes\n", 532 | "Writing feature references for transitions\n", 533 | "Writing feature references for attributes\n", 534 | "Seconds required: 0.329\n", 535 | "\n", 536 | ] 537 | 538 | log4 = [ 539 | "Feature generation\n", 540 | "type: CRF1d\n", 541 | "feature.minfreq: 0.000000\n", 542 | "feature.possible_states: 0\n", 543 | "feature.possible_transitions: 0\n", 544 | "0", 545 | ".", 546 | ".", 547 | ".", 548 | ".", 549 | "1", 550 | ".", 551 | ".", 552 | ".", 553 | ".", 554 | "2", 555 | ".", 556 | ".", 557 | ".", 558 | ".", 559 | "3", 560 | ".", 561 | ".", 562 | ".", 563 | ".", 564 | "4", 565 | ".", 566 | ".", 567 | ".", 568 | ".", 569 | "5", 570 | ".", 571 | ".", 572 | ".", 573 | ".", 574 | "6", 575 | ".", 576 | ".", 577 | ".", 578 | ".", 579 | "7", 580 | ".", 581 | ".", 582 | ".", 583 | ".", 584 | "8", 585 | ".", 586 | ".", 587 | ".", 588 | ".", 589 | "9", 590 | ".", 591 | ".", 592 | ".", 593 | ".", 594 | "10", 595 | "\n", 596 | "Number of features: 0\n", 597 | "Seconds required: 0.001\n", 598 | "\n", 599 | "L-BFGS optimization\n", 600 | "c1: 0.000000\n", 601 | "c2: 1.000000\n", 602 | "num_memories: 6\n", 603 | "max_iterations: 2147483647\n", 604 | "epsilon: 0.000010\n", 605 | "stop: 10\n", 606 | "delta: 0.000010\n", 607 | "linesearch: MoreThuente\n", 608 | "linesearch.max_iterations: 20\n", 609 | "\n", 610 | "L-BFGS terminated with error code (-1020)\n", 611 | "Total seconds required for training: 0.000\n", 612 | "\n", 613 | "Storing the model\n", 614 | "Number of active features: 0 (0)\n", 615 | "Number of active attributes: 0 (0)\n", 616 | "Number of active labels: 0 (0)\n", 617 | "Writing labels\n", 618 | "Writing attributes\n", 619 | "Writing feature references for transitions\n", 620 | "Writing feature references for attributes\n", 621 | 
"Seconds required: 0.000\n", 622 | "\n", 623 | ] 624 | 625 | 626 | def test_parser_log1(): 627 | """ 628 | >>> parser = TrainLogParser() 629 | >>> _apply_parser(parser, log1) 630 | Holdout group: 2 631 | ============== start 632 | 633 | Number of features: 3948 634 | Seconds required: 0.022 635 | ============== featgen_end 636 | 637 | L-BFGS optimization 638 | c1: 1.000000 639 | c2: 0.001000 640 | num_memories: 6 641 | max_iterations: 5 642 | epsilon: 0.000010 643 | stop: 10 644 | delta: 0.000010 645 | linesearch: MoreThuente 646 | linesearch.max_iterations: 20 647 | 648 | ============== prepared 649 | ***** Iteration #1 ***** 650 | Loss: 1450.519004 651 | Feature norm: 1.000000 652 | Error norm: 713.784994 653 | Active features: 1794 654 | Line search trials: 1 655 | Line search step: 0.000228 656 | Seconds required for this iteration: 0.008 657 | Performance by label (#match, #model, #ref) (precision, recall, F1): 658 | B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000) 659 | O: (306, 339, 306) (0.9027, 1.0000, 0.9488) 660 | B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000) 661 | B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000) 662 | I-PER: (0, 0, 4) (0.0000, 0.0000, 0.0000) 663 | B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000) 664 | I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000) 665 | I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000) 666 | I-MISC: (0, 0, 0) (******, ******, ******) 667 | Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426) 668 | Item accuracy: 306 / 339 (0.9027) 669 | Instance accuracy: 3 / 10 (0.3000) 670 | 671 | ============== iteration 672 | ***** Iteration #2 ***** 673 | Loss: 1363.687719 674 | Feature norm: 1.178396 675 | Error norm: 370.827506 676 | Active features: 1540 677 | Line search trials: 1 678 | Line search step: 1.000000 679 | Seconds required for this iteration: 0.004 680 | Performance by label (#match, #model, #ref) (precision, recall, F1): 681 | B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000) 682 | O: (306, 339, 306) (0.9027, 1.0000, 0.9488) 683 | B-ORG: (0, 
0, 9) (0.0000, 0.0000, 0.0000) 684 | B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000) 685 | I-PER: (0, 0, 4) (0.0000, 0.0000, 0.0000) 686 | B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000) 687 | I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000) 688 | I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000) 689 | I-MISC: (0, 0, 0) (******, ******, ******) 690 | Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426) 691 | Item accuracy: 306 / 339 (0.9027) 692 | Instance accuracy: 3 / 10 (0.3000) 693 | 694 | ============== iteration 695 | ***** Iteration #3 ***** 696 | Loss: 1309.171814 697 | Feature norm: 1.266322 698 | Error norm: 368.739493 699 | Active features: 1308 700 | Line search trials: 1 701 | Line search step: 1.000000 702 | Seconds required for this iteration: 0.003 703 | Performance by label (#match, #model, #ref) (precision, recall, F1): 704 | B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000) 705 | O: (306, 339, 306) (0.9027, 1.0000, 0.9488) 706 | B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000) 707 | B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000) 708 | I-PER: (0, 0, 4) (0.0000, 0.0000, 0.0000) 709 | B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000) 710 | I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000) 711 | I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000) 712 | I-MISC: (0, 0, 0) (******, ******, ******) 713 | Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426) 714 | Item accuracy: 306 / 339 (0.9027) 715 | Instance accuracy: 3 / 10 (0.3000) 716 | 717 | ============== iteration 718 | ***** Iteration #4 ***** 719 | Loss: 1019.561634 720 | Feature norm: 1.929814 721 | Error norm: 202.976154 722 | Active features: 1127 723 | Line search trials: 1 724 | Line search step: 1.000000 725 | Seconds required for this iteration: 0.003 726 | Performance by label (#match, #model, #ref) (precision, recall, F1): 727 | B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000) 728 | O: (306, 339, 306) (0.9027, 1.0000, 0.9488) 729 | B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000) 730 | B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000) 731 | I-PER: 
(0, 0, 4) (0.0000, 0.0000, 0.0000) 732 | B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000) 733 | I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000) 734 | I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000) 735 | I-MISC: (0, 0, 0) (******, ******, ******) 736 | Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426) 737 | Item accuracy: 306 / 339 (0.9027) 738 | Instance accuracy: 3 / 10 (0.3000) 739 | 740 | ============== iteration 741 | ***** Iteration #5 ***** 742 | Loss: 782.637378 743 | Feature norm: 3.539391 744 | Error norm: 121.725020 745 | Active features: 1035 746 | Line search trials: 1 747 | Line search step: 1.000000 748 | Seconds required for this iteration: 0.003 749 | Performance by label (#match, #model, #ref) (precision, recall, F1): 750 | B-LOC: (2, 5, 6) (0.4000, 0.3333, 0.3636) 751 | O: (305, 318, 306) (0.9591, 0.9967, 0.9776) 752 | B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000) 753 | B-PER: (2, 4, 3) (0.5000, 0.6667, 0.5714) 754 | I-PER: (4, 12, 4) (0.3333, 1.0000, 0.5000) 755 | B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000) 756 | I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000) 757 | I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000) 758 | I-MISC: (0, 0, 0) (******, ******, ******) 759 | Macro-average precision, recall, F1: (0.243606, 0.332970, 0.268070) 760 | Item accuracy: 313 / 339 (0.9233) 761 | Instance accuracy: 3 / 10 (0.3000) 762 | 763 | ============== iteration 764 | L-BFGS terminated with the maximum number of iterations 765 | Total seconds required for training: 0.022 766 | 767 | ============== optimization_end 768 | Storing the model 769 | Number of active features: 1035 (3948) 770 | Number of active attributes: 507 (3350) 771 | Number of active labels: 9 (9) 772 | Writing labels 773 | Writing attributes 774 | Writing feature references for transitions 775 | Writing feature references for attributes 776 | Seconds required: 0.003 777 | 778 | ============== end 779 | 780 | >>> len(parser.iterations) 781 | 5 782 | >>> parser.iterations[3]['active_features'] 783 | 1127 784 | 
""" 785 | pass 786 | 787 | 788 | def test_parser_log2(): 789 | """ 790 | >>> parser = TrainLogParser() 791 | >>> _apply_parser(parser, log2) 792 | Feature generation 793 | ============== start 794 | 795 | Number of features: 4379 796 | Seconds required: 0.021 797 | ============== featgen_end 798 | 799 | Averaged perceptron 800 | max_iterations: 5 801 | epsilon: 0.000000 802 | 803 | ============== prepared 804 | ***** Iteration #1 ***** 805 | Loss: 16.359638 806 | Feature norm: 112.848688 807 | Seconds required for this iteration: 0.005 808 | 809 | ============== iteration 810 | ***** Iteration #2 ***** 811 | Loss: 12.449970 812 | Feature norm: 126.174821 813 | Seconds required for this iteration: 0.004 814 | 815 | ============== iteration 816 | ***** Iteration #3 ***** 817 | Loss: 9.451751 818 | Feature norm: 145.482678 819 | Seconds required for this iteration: 0.003 820 | 821 | ============== iteration 822 | ***** Iteration #4 ***** 823 | Loss: 8.652287 824 | Feature norm: 155.495167 825 | Seconds required for this iteration: 0.003 826 | 827 | ============== iteration 828 | ***** Iteration #5 ***** 829 | Loss: 7.442703 830 | Feature norm: 166.818487 831 | Seconds required for this iteration: 0.002 832 | 833 | ============== iteration 834 | Total seconds required for training: 0.017 835 | 836 | ============== optimization_end 837 | Storing the model 838 | Number of active features: 2265 (4379) 839 | Number of active attributes: 1299 (3350) 840 | Number of active labels: 9 (9) 841 | Writing labels 842 | Writing attributes 843 | Writing feature references for transitions 844 | Writing feature references for attributes 845 | Seconds required: 0.007 846 | 847 | ============== end 848 | """ 849 | pass 850 | 851 | 852 | def test_parser_log3(): 853 | """ 854 | >>> parser = TrainLogParser() 855 | >>> _apply_parser(parser, log3) 856 | Holdout group: 2 857 | ============== start 858 | 859 | Number of features: 96180 860 | Seconds required: 1.263 861 | ============== 
featgen_end 862 | 863 | Stochastic Gradient Descent (SGD) 864 | c2: 1.000000 865 | max_iterations: 5 866 | period: 10 867 | delta: 0.000001 868 | 869 | Calibrating the learning rate (eta) 870 | calibration.eta: 0.100000 871 | calibration.rate: 2.000000 872 | calibration.samples: 1000 873 | calibration.candidates: 10 874 | calibration.max_trials: 20 875 | Initial loss: 69781.655352 876 | Trial #1 (eta = 0.100000): 12808.890280 877 | Trial #2 (eta = 0.200000): 26716.801091 878 | Trial #3 (eta = 0.400000): 51219.321368 879 | Trial #4 (eta = 0.800000): 104398.795416 (worse) 880 | Trial #5 (eta = 0.050000): 7804.492475 881 | Trial #6 (eta = 0.025000): 6419.964967 882 | Trial #7 (eta = 0.012500): 6989.552193 883 | Trial #8 (eta = 0.006250): 8303.107921 884 | Trial #9 (eta = 0.003125): 9934.052819 885 | Trial #10 (eta = 0.001563): 11782.234687 886 | Trial #11 (eta = 0.000781): 13777.708878 887 | Trial #12 (eta = 0.000391): 15891.422697 888 | Trial #13 (eta = 0.000195): 18174.499245 889 | Trial #14 (eta = 0.000098): 20955.855446 890 | Best learning rate (eta): 0.025000 891 | Seconds required: 0.858 892 | 893 | ============== prepared 894 | ***** Epoch #1 ***** 895 | Loss: 36862.915596 896 | Feature L2-norm: 24.717729 897 | Learning rate (eta): 0.023810 898 | Total number of feature updates: 8323 899 | Seconds required for this iteration: 0.462 900 | Performance by label (#match, #model, #ref) (precision, recall, F1): 901 | B-LOC: (778, 1193, 1084) (0.6521, 0.7177, 0.6834) 902 | O: (45103, 45519, 45355) (0.9909, 0.9944, 0.9926) 903 | B-ORG: (1003, 1326, 1400) (0.7564, 0.7164, 0.7359) 904 | B-PER: (583, 764, 735) (0.7631, 0.7932, 0.7779) 905 | I-PER: (565, 681, 634) (0.8297, 0.8912, 0.8593) 906 | B-MISC: (76, 181, 339) (0.4199, 0.2242, 0.2923) 907 | I-ORG: (735, 933, 1104) (0.7878, 0.6658, 0.7216) 908 | I-LOC: (191, 455, 325) (0.4198, 0.5877, 0.4897) 909 | I-MISC: (204, 481, 557) (0.4241, 0.3662, 0.3931) 910 | Macro-average precision, recall, F1: (0.671525, 0.661871, 
0.660646) 911 | Item accuracy: 49238 / 51533 (0.9555) 912 | Instance accuracy: 852 / 1517 (0.5616) 913 | 914 | ============== iteration 915 | ***** Epoch #2 ***** 916 | Loss: 31176.026308 917 | Feature L2-norm: 32.274598 918 | Learning rate (eta): 0.022727 919 | Total number of feature updates: 16646 920 | Seconds required for this iteration: 0.466 921 | Performance by label (#match, #model, #ref) (precision, recall, F1): 922 | B-LOC: (708, 1018, 1084) (0.6955, 0.6531, 0.6736) 923 | O: (45101, 45611, 45355) (0.9888, 0.9944, 0.9916) 924 | B-ORG: (1053, 1711, 1400) (0.6154, 0.7521, 0.6770) 925 | B-PER: (594, 777, 735) (0.7645, 0.8082, 0.7857) 926 | I-PER: (589, 778, 634) (0.7571, 0.9290, 0.8343) 927 | B-MISC: (94, 264, 339) (0.3561, 0.2773, 0.3118) 928 | I-ORG: (384, 468, 1104) (0.8205, 0.3478, 0.4885) 929 | I-LOC: (166, 285, 325) (0.5825, 0.5108, 0.5443) 930 | I-MISC: (210, 621, 557) (0.3382, 0.3770, 0.3565) 931 | Macro-average precision, recall, F1: (0.657608, 0.627752, 0.629257) 932 | Item accuracy: 48899 / 51533 (0.9489) 933 | Instance accuracy: 813 / 1517 (0.5359) 934 | 935 | ============== iteration 936 | ***** Epoch #3 ***** 937 | Loss: 23705.719839 938 | Feature L2-norm: 35.255014 939 | Learning rate (eta): 0.021739 940 | Total number of feature updates: 24969 941 | Seconds required for this iteration: 0.472 942 | Performance by label (#match, #model, #ref) (precision, recall, F1): 943 | B-LOC: (808, 1210, 1084) (0.6678, 0.7454, 0.7044) 944 | O: (45244, 45771, 45355) (0.9885, 0.9976, 0.9930) 945 | B-ORG: (1061, 1403, 1400) (0.7562, 0.7579, 0.7570) 946 | B-PER: (588, 728, 735) (0.8077, 0.8000, 0.8038) 947 | I-PER: (565, 640, 634) (0.8828, 0.8912, 0.8870) 948 | B-MISC: (86, 130, 339) (0.6615, 0.2537, 0.3667) 949 | I-ORG: (857, 1148, 1104) (0.7465, 0.7763, 0.7611) 950 | I-LOC: (152, 282, 325) (0.5390, 0.4677, 0.5008) 951 | I-MISC: (170, 221, 557) (0.7692, 0.3052, 0.4370) 952 | Macro-average precision, recall, F1: (0.757699, 0.666091, 0.690108) 953 | Item 
accuracy: 49531 / 51533 (0.9612) 954 | Instance accuracy: 889 / 1517 (0.5860) 955 | 956 | ============== iteration 957 | ***** Epoch #4 ***** 958 | Loss: 21273.137466 959 | Feature L2-norm: 37.985723 960 | Learning rate (eta): 0.020833 961 | Total number of feature updates: 33292 962 | Seconds required for this iteration: 0.468 963 | Performance by label (#match, #model, #ref) (precision, recall, F1): 964 | B-LOC: (848, 1276, 1084) (0.6646, 0.7823, 0.7186) 965 | O: (44212, 44389, 45355) (0.9960, 0.9748, 0.9853) 966 | B-ORG: (784, 896, 1400) (0.8750, 0.5600, 0.6829) 967 | B-PER: (582, 686, 735) (0.8484, 0.7918, 0.8191) 968 | I-PER: (570, 647, 634) (0.8810, 0.8991, 0.8899) 969 | B-MISC: (166, 619, 339) (0.2682, 0.4897, 0.3466) 970 | I-ORG: (152, 155, 1104) (0.9806, 0.1377, 0.2415) 971 | I-LOC: (138, 219, 325) (0.6301, 0.4246, 0.5074) 972 | I-MISC: (467, 2646, 557) (0.1765, 0.8384, 0.2916) 973 | Macro-average precision, recall, F1: (0.702269, 0.655374, 0.609212) 974 | Item accuracy: 47919 / 51533 (0.9299) 975 | Instance accuracy: 793 / 1517 (0.5227) 976 | 977 | ============== iteration 978 | ***** Epoch #5 ***** 979 | Loss: 20806.661564 980 | Feature L2-norm: 40.673070 981 | Learning rate (eta): 0.020000 982 | Total number of feature updates: 41615 983 | Seconds required for this iteration: 0.460 984 | Performance by label (#match, #model, #ref) (precision, recall, F1): 985 | B-LOC: (689, 892, 1084) (0.7724, 0.6356, 0.6974) 986 | O: (45171, 45556, 45355) (0.9915, 0.9959, 0.9937) 987 | B-ORG: (1214, 1931, 1400) (0.6287, 0.8671, 0.7289) 988 | B-PER: (529, 574, 735) (0.9216, 0.7197, 0.8083) 989 | I-PER: (520, 553, 634) (0.9403, 0.8202, 0.8762) 990 | B-MISC: (77, 96, 339) (0.8021, 0.2271, 0.3540) 991 | I-ORG: (1009, 1678, 1104) (0.6013, 0.9139, 0.7254) 992 | I-LOC: (126, 182, 325) (0.6923, 0.3877, 0.4970) 993 | I-MISC: (57, 71, 557) (0.8028, 0.1023, 0.1815) 994 | Macro-average precision, recall, F1: (0.794790, 0.629970, 0.651378) 995 | Item accuracy: 49392 / 51533 
(0.9585) 996 | Instance accuracy: 885 / 1517 (0.5834) 997 | 998 | ============== iteration 999 | SGD terminated with the maximum number of iterations 1000 | Loss: 20806.661564 1001 | Total seconds required for training: 3.350 1002 | 1003 | ============== optimization_end 1004 | Storing the model 1005 | Number of active features: 96180 (96180) 1006 | Number of active attributes: 76691 (83593) 1007 | Number of active labels: 9 (9) 1008 | Writing labels 1009 | Writing attributes 1010 | Writing feature references for transitions 1011 | Writing feature references for attributes 1012 | Seconds required: 0.329 1013 | 1014 | ============== end 1015 | 1016 | """ 1017 | pass 1018 | 1019 | 1020 | def test_parser_log4(): 1021 | """ 1022 | >>> parser = TrainLogParser() 1023 | >>> _apply_parser(parser, log4) 1024 | Feature generation 1025 | ============== start 1026 | 1027 | Number of features: 0 1028 | Seconds required: 0.001 1029 | ============== featgen_end 1030 | 1031 | L-BFGS optimization 1032 | c1: 0.000000 1033 | c2: 1.000000 1034 | num_memories: 6 1035 | max_iterations: 2147483647 1036 | epsilon: 0.000010 1037 | stop: 10 1038 | delta: 0.000010 1039 | linesearch: MoreThuente 1040 | linesearch.max_iterations: 20 1041 | 1042 | L-BFGS terminated with error code (-1020) 1043 | ============== prepare_error 1044 | Total seconds required for training: 0.000 1045 | 1046 | ============== optimization_end 1047 | Storing the model 1048 | Number of active features: 0 (0) 1049 | Number of active attributes: 0 (0) 1050 | Number of active labels: 0 (0) 1051 | Writing labels 1052 | Writing attributes 1053 | Writing feature references for transitions 1054 | Writing feature references for attributes 1055 | Seconds required: 0.000 1056 | 1057 | ============== end 1058 | 1059 | """ 1060 | pass 1061 | --------------------------------------------------------------------------------