├── requirements-doc.txt ├── pycrfsuite ├── __init__.py ├── trainer_wrapper.hpp ├── tagger_wrapper.hpp ├── trainer_wrapper.cpp ├── crfsuite_api.pxd ├── _dumpparser.py ├── _logparser.py └── _pycrfsuite.pyx ├── .flake8 ├── tests ├── test_misc.py ├── conftest.py ├── test_itemsequence.py ├── test_trainer.py ├── test_tagger.py └── test_logparser.py ├── .gitignore ├── .gitmodules ├── MANIFEST.in ├── tox.ini ├── docs ├── pycrfsuite.rst ├── index.rst ├── Makefile ├── make.bat └── conf.py ├── .github └── workflows │ ├── tests.yml │ └── build_and_upload.yml ├── LICENSE.txt ├── pyproject.toml ├── setup.py ├── README.rst ├── CHANGES.rst └── examples └── CoNLL 2002.ipynb /requirements-doc.txt: -------------------------------------------------------------------------------- 1 | numpydoc 2 | -------------------------------------------------------------------------------- /pycrfsuite/__init__.py: -------------------------------------------------------------------------------- 1 | from ._pycrfsuite import * 2 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=160 3 | extend-ignore = E203 -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | def test_version(): 2 | from pycrfsuite import CRFSUITE_VERSION 3 | 4 | assert bool(CRFSUITE_VERSION), CRFSUITE_VERSION 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *.html 4 | .tox 5 | dist 6 | build 7 | _build 8 | MANIFEST 9 | .ipynb_checkpoints 10 | conll2002-esp.crfsuite 11 | *.egg-info/ 12 | .cache -------------------------------------------------------------------------------- /.gitmodules: 
-------------------------------------------------------------------------------- 1 | [submodule "crfsuite"] 2 | path = crfsuite 3 | url = https://github.com/chokkan/crfsuite.git 4 | [submodule "liblbfgs"] 5 | path = liblbfgs 6 | url = https://github.com/chokkan/liblbfgs.git 7 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.txt 2 | include README.rst 3 | include CHANGES.rst 4 | include update_cpp.sh 5 | 6 | recursive-include crfsuite * 7 | recursive-include liblbfgs * 8 | recursive-include tests *.py 9 | recursive-include pycrfsuite *.py *.pxd *.pyx *.cpp *.hpp 10 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36,py37,py38,py39,py310,py311,py312 3 | 4 | [gh-actions] 5 | python = 6 | 3.6: py36 7 | 3.7: py37 8 | 3.8: py38 9 | 3.9: py39 10 | 3.10: py310 11 | 3.11: py311 12 | 3.12: py312 13 | 14 | [testenv] 15 | changedir = {envtmpdir} 16 | deps = 17 | pytest 18 | commands = 19 | py.test {toxinidir}/tests --doctest-modules {posargs} 20 | 21 | [testenv:manylinux] 22 | changedir = {envtmpdir} 23 | deps = 24 | pytest 25 | commands = 26 | py.test {toxinidir}/tests --doctest-modules {posargs} 27 | -------------------------------------------------------------------------------- /docs/pycrfsuite.rst: -------------------------------------------------------------------------------- 1 | .. _api-reference: 2 | 3 | API Reference 4 | ============= 5 | 6 | .. automodule:: pycrfsuite 7 | 8 | .. autoclass:: ItemSequence 9 | :members: 10 | :undoc-members: 11 | 12 | Training 13 | -------- 14 | 15 | .. autoclass:: Trainer 16 | :members: 17 | :undoc-members: 18 | :inherited-members: 19 | :show-inheritance: 20 | 21 | Tagging 22 | ------- 23 | 24 | .. 
autoclass:: Tagger 25 | :members: 26 | :undoc-members: 27 | 28 | Debugging 29 | --------- 30 | 31 | .. automodule:: pycrfsuite._dumpparser 32 | :members: ParsedDump 33 | :undoc-members: 34 | -------------------------------------------------------------------------------- /pycrfsuite/trainer_wrapper.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TRAINER_WRAPPER_H 2 | #define TRAINER_WRAPPER_H 1 3 | 4 | #include 5 | #include "crfsuite_api.hpp" 6 | 7 | struct _object; 8 | typedef _object PyObject; 9 | 10 | namespace CRFSuiteWrapper 11 | { 12 | 13 | typedef PyObject* (*messagefunc)(PyObject *self, std::string message); 14 | 15 | /** 16 | * A wrapper around CRFSuite::Trainer that allows overriding 17 | * 'message' method from Python. 18 | */ 19 | class Trainer : public CRFSuite::Trainer 20 | { 21 | protected: 22 | PyObject *m_obj; 23 | messagefunc handler; 24 | 25 | public: 26 | void set_handler(PyObject *obj, messagefunc handler); 27 | virtual void message(const std::string& msg); 28 | void _init_hack(); 29 | }; 30 | 31 | } 32 | #endif 33 | -------------------------------------------------------------------------------- /pycrfsuite/tagger_wrapper.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TAGGER_WRAPPER_H 2 | #define TAGGER_WRAPPER_H 1 3 | 4 | #include 5 | #include 6 | #include 7 | #include "crfsuite_api.hpp" 8 | 9 | 10 | namespace CRFSuiteWrapper 11 | { 12 | 13 | /** 14 | * A wrapper around CRFSuite::Tagger that allows to call 'dump' method 15 | * from Python. 
16 | */ 17 | class Tagger : public CRFSuite::Tagger 18 | { 19 | public: 20 | void dump(int fileno) 21 | { 22 | if (model == NULL) { 23 | throw std::runtime_error("Tagger is closed"); 24 | } 25 | 26 | FILE* file = fdopen(fileno, "w"); 27 | if (!file){ 28 | throw std::runtime_error("Can't open file"); 29 | } 30 | 31 | model->dump(model, file); 32 | 33 | if (fclose(file)){ 34 | throw std::runtime_error("Can't close file"); 35 | }; 36 | } 37 | }; 38 | 39 | } 40 | #endif 41 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: ["master"] 6 | pull_request: 7 | branches: ["master"] 8 | 9 | jobs: 10 | tests: 11 | name: "Python ${{ matrix.python-version }} ${{ matrix.os }}" 12 | runs-on: ${{ matrix.os }} 13 | 14 | strategy: 15 | matrix: 16 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 17 | os: [ubuntu-latest, macos-latest, windows-latest] 18 | exclude: 19 | - os: ubuntu-latest 20 | python-version: "3.6" 21 | fail-fast: false 22 | 23 | steps: 24 | - uses: "actions/checkout@v4" 25 | with: 26 | submodules: true 27 | - uses: "actions/setup-python@v4" 28 | with: 29 | python-version: "${{ matrix.python-version }}" 30 | - name: "Install dependencies" 31 | run: | 32 | python -VV 33 | python -m site 34 | python -m pip install --upgrade pip setuptools wheel 35 | python -m pip install --upgrade virtualenv tox tox-gh-actions 36 | 37 | - name: "Run tox targets for ${{ matrix.python-version }}" 38 | run: "python -m tox" 39 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2014-2017 ScrapingHub Inc. and contributors. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /pycrfsuite/trainer_wrapper.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "Python.h" 3 | #include "trainer_wrapper.hpp" 4 | #include 5 | 6 | namespace CRFSuiteWrapper 7 | { 8 | 9 | 10 | void Trainer::set_handler(PyObject *obj, messagefunc handler) 11 | { 12 | // We are not holding a reference to obj (no PY_INCREF) here 13 | // because doing so prevents __del__ from being called 14 | this->m_obj = obj; 15 | this->handler = handler; 16 | } 17 | 18 | 19 | void Trainer::message(const std::string& msg) 20 | { 21 | if (this->m_obj == NULL) { 22 | std::cerr << "** Trainer invalid state: obj [" << this->m_obj << "]\n"; 23 | return; 24 | } 25 | PyObject* result = handler(this->m_obj, msg); 26 | if (result == NULL){ 27 | // Python exception is raised in the handler. 28 | // Raise a C++ exception to stop training. 29 | // Cython will catch it and re-raise the previous Python exception 30 | // (which is the one raised in a handler). 31 | throw std::runtime_error("You shouldn't have seen this message!"); 32 | } 33 | } 34 | 35 | void Trainer::_init_hack() 36 | { 37 | Trainer::init(); 38 | } 39 | 40 | 41 | } 42 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | python-crfsuite 3 | =============== 4 | 5 | python-crfsuite is a python binding to CRFsuite_. 6 | 7 | .. _CRFsuite: https://github.com/chokkan/crfsuite 8 | 9 | 10 | Installation 11 | ============ 12 | 13 | :: 14 | 15 | pip install python-crfsuite 16 | 17 | Usage 18 | ===== 19 | 20 | * :ref:`api-reference` 21 | * Example_: building a Named Entity Recognition system with python-crfsuite. 22 | 23 | .. 
_Example: https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb 24 | 25 | python-crfsuite is licensed under MIT license. 26 | CRFsuite_ C/C++ library is licensed under BSD license. 27 | 28 | Development happens at github: https://github.com/scrapinghub/python-crfsuite 29 | 30 | .. toctree:: 31 | :hidden: 32 | 33 | pycrfsuite 34 | 35 | See Also 36 | ======== 37 | 38 | sklearn-crfsuite_ is a python-crfsuite wrapper which provides 39 | API similar to scikit-learn. 40 | 41 | .. _sklearn-crfsuite: https://github.com/TeamHG-Memex/sklearn-crfsuite 42 | 43 | 44 | Indices and tables 45 | ================== 46 | 47 | * :ref:`genindex` 48 | * :ref:`modindex` 49 | * :ref:`search` 50 | 51 | 52 | 53 | .. include:: ../CHANGES.rst 54 | 55 | 56 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture() 5 | def xseq(): 6 | return [ 7 | {"walk": 1, "shop": 0.5}, 8 | {"walk": 1}, 9 | {"walk": 1, "clean": 0.5}, 10 | {"shop": 0.5, "clean": 0.5}, 11 | {"walk": 0.5, "clean": 1}, 12 | {"clean": 1, "shop": 0.1}, 13 | {"walk": 1, "shop": 0.5}, 14 | {}, 15 | {"clean": 1}, 16 | {"солнце": "не светит".encode(), "clean": 1}, 17 | {"world": 2}, 18 | ] 19 | 20 | 21 | @pytest.fixture 22 | def yseq(): 23 | return [ 24 | "sunny", 25 | "sunny", 26 | "sunny", 27 | "rainy", 28 | "rainy", 29 | "rainy", 30 | "sunny", 31 | "sunny", 32 | "rainy", 33 | "rainy", 34 | "好", 35 | ] 36 | 37 | 38 | @pytest.fixture 39 | def model_filename(tmpdir, xseq, yseq): 40 | from pycrfsuite import Trainer 41 | 42 | trainer = Trainer("lbfgs", verbose=False) 43 | trainer.append(xseq, yseq) 44 | model_filename = str(tmpdir.join("model.crfsuite")) 45 | trainer.train(model_filename) 46 | return model_filename 47 | 48 | 49 | @pytest.fixture 50 | def model_bytes(model_filename): 51 | with open(model_filename, "rb") as f: 52 | 
return f.read() 53 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "python-crfsuite" 3 | version = "0.9.11" 4 | description = "Python binding for CRFsuite" 5 | authors = [ 6 | {name = "Terry Peng", email = "pengtaoo@gmail.com"}, 7 | {name = "Mikhail Korobov", email = "kmike84@gmail.com"}, 8 | ] 9 | readme = "README.rst" 10 | license = {text = "MIT License", url = "http://www.opensource.org/licenses/mit-license.php"} 11 | requires-python = ">=3.8" 12 | classifiers = [ 13 | "Development Status :: 4 - Beta", 14 | "Intended Audience :: Developers", 15 | "Intended Audience :: Science/Research", 16 | "License :: OSI Approved :: MIT License", 17 | "Programming Language :: Cython", 18 | "Programming Language :: Python", 19 | "Programming Language :: Python :: 3", 20 | "Programming Language :: Python :: 3.6", 21 | "Programming Language :: Python :: 3.7", 22 | "Programming Language :: Python :: 3.8", 23 | "Programming Language :: Python :: 3.9", 24 | "Programming Language :: Python :: 3.10", 25 | "Topic :: Software Development", 26 | "Topic :: Software Development :: Libraries :: Python Modules", 27 | "Topic :: Scientific/Engineering", 28 | "Topic :: Scientific/Engineering :: Information Analysis", 29 | "Topic :: Text Processing :: Linguistic", 30 | ] 31 | 32 | [project.urls] 33 | Homepage = "https://github.com/scrapinghub/python-crfsuite" 34 | 35 | [project.optional-dependencies] 36 | dev = ["tox", 37 | "black", 38 | "isort", 39 | "flake8", 40 | ] 41 | 42 | [build-system] 43 | requires = ["setuptools>=42", "wheel", "cython"] 44 | build-backend = "setuptools.build_meta" 45 | 46 | 47 | [tool.setuptools.packages.find] 48 | include = ["pycrfsuite"] 49 | 50 | 51 | [tool.pytest.ini_options] 52 | addopts = [ 53 | "--import-mode=importlib", 54 | ] 55 | testpaths = [ 56 | "tests", 57 | ] 58 | 59 | [tool.isort] 60 | profile = 
"black" 61 | src_paths = ["usaddress", "tests"] 62 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import glob 3 | import sys 4 | from distutils.command.build_ext import build_ext 5 | 6 | from Cython.Build import cythonize 7 | from setuptools import Extension, setup 8 | 9 | sources = ["pycrfsuite/_pycrfsuite.pyx", "pycrfsuite/trainer_wrapper.cpp"] 10 | 11 | # crfsuite 12 | sources += glob.glob("crfsuite/lib/crf/src/*.c") 13 | sources += glob.glob("crfsuite/swig/*.cpp") 14 | 15 | sources += ["crfsuite/lib/cqdb/src/cqdb.c"] 16 | sources += ["crfsuite/lib/cqdb/src/lookup3.c"] 17 | 18 | # lbfgs 19 | sources += glob.glob("liblbfgs/lib/*.c") 20 | 21 | includes = [ 22 | "crfsuite/include/", 23 | "crfsuite/lib/cqdb/include", 24 | "liblbfgs/include", 25 | "pycrfsuite/", 26 | ] 27 | 28 | 29 | class build_ext_check_gcc(build_ext): 30 | def build_extensions(self): 31 | c = self.compiler 32 | 33 | _compile = c._compile 34 | 35 | def c_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 36 | cc_args = ( 37 | cc_args + ["-D_POSIX_C_SOURCE=200112L"] 38 | if src.startswith("crfsuite/") 39 | else cc_args 40 | ) 41 | cc_args = cc_args + ["-std=c99"] if src.endswith(".c") else cc_args 42 | return _compile(obj, src, ext, cc_args, extra_postargs, pp_opts) 43 | 44 | if c.compiler_type == "unix" and any( 45 | item == "gcc" or item.endswith("-gcc") for item in c.compiler 46 | ): 47 | c._compile = c_compile 48 | 49 | elif self.compiler.compiler_type == "msvc": 50 | if sys.version_info[:2] < (3, 5): 51 | c.include_dirs.extend(["crfsuite/win32"]) 52 | 53 | build_ext.build_extensions(self) 54 | 55 | 56 | setup( 57 | ext_modules=cythonize( 58 | [ 59 | Extension( 60 | "pycrfsuite._pycrfsuite", 61 | include_dirs=includes, 62 | language="c++", 63 | sources=sorted(sources), 64 | ) 65 | ] 66 | ), 67 | cmdclass={"build_ext": 
build_ext_check_gcc}, 68 | ) 69 | -------------------------------------------------------------------------------- /.github/workflows/build_and_upload.yml: -------------------------------------------------------------------------------- 1 | name: Build and upload to PyPI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | release: 7 | types: 8 | - published 9 | 10 | jobs: 11 | build_wheels: 12 | name: "Build wheels on ${{ matrix.os }}" 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | os: [ubuntu-latest, macos-latest, windows-latest] 18 | steps: 19 | - uses: "actions/checkout@v4" 20 | with: 21 | submodules: true 22 | - name: "Set up QEMU" 23 | if: matrix.os == 'ubuntu-latest' 24 | uses: "docker/setup-qemu-action@v3" 25 | with: 26 | platforms: arm64 27 | - name: "Build wheels" 28 | uses: "pypa/cibuildwheel@v2.21.1" 29 | env: 30 | CIBW_SKIP: "pp*" # FIXME 31 | CIBW_ARCHS_LINUX: "auto aarch64" 32 | CIBW_TEST_REQUIRES: "pytest" 33 | CIBW_TEST_COMMAND: "pytest {project}/tests --doctest-modules" 34 | - uses: "actions/upload-artifact@v4" 35 | with: 36 | path: "./wheelhouse/*.whl" 37 | name: wheel-${{ matrix.os }} 38 | 39 | make_sdist: 40 | name: "Build source distribution" 41 | runs-on: ubuntu-latest 42 | steps: 43 | - uses: "actions/checkout@v4" 44 | with: 45 | submodules: true 46 | - name: "Build source distribution" 47 | run: "pipx run build --sdist" 48 | - uses: "actions/upload-artifact@v4" 49 | with: 50 | path: "./dist/*.tar.gz" 51 | name: sdist 52 | 53 | upload_to_pypi: 54 | name: "Upload to PyPI" 55 | runs-on: ubuntu-latest 56 | if: github.event_name == 'release' && github.event.action == 'published' 57 | needs: 58 | - build_wheels 59 | - make_sdist 60 | steps: 61 | - uses: "actions/download-artifact@v4" 62 | with: 63 | path: dist 64 | merge-multiple: true 65 | - uses: "pypa/gh-action-pypi-publish@v1.13.0" 66 | with: 67 | user: __token__ 68 | password: ${{ secrets.PYPI_TOKEN }} 69 | print_hash: true 70 | verbose: true 71 | skip_existing: 
true 72 | -------------------------------------------------------------------------------- /tests/test_itemsequence.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import pycrfsuite 4 | 5 | 6 | def test_basic(): 7 | seq = pycrfsuite.ItemSequence([]) 8 | assert len(seq) == 0 9 | assert seq.items() == [] 10 | 11 | 12 | def test_lists(): 13 | seq = pycrfsuite.ItemSequence([["foo", "bar"], ["bar", "baz"]]) 14 | assert len(seq) == 2 15 | assert seq.items() == [{"foo": 1.0, "bar": 1.0}, {"bar": 1.0, "baz": 1.0}] 16 | assert pycrfsuite.ItemSequence(seq.items()).items() == seq.items() 17 | 18 | 19 | def test_dicts(): 20 | seq = pycrfsuite.ItemSequence( 21 | [ 22 | {"foo": True, "bar": {"foo": -1, "baz": False}}, 23 | ] 24 | ) 25 | assert len(seq) == 1 26 | assert seq.items() == [{"foo": 1.0, "bar:foo": -1, "bar:baz": 0.0}] 27 | 28 | 29 | def test_unicode(): 30 | seq = pycrfsuite.ItemSequence( 31 | [ 32 | {"foo": "привет", "ключ": 1.0, "привет": "мир"}, 33 | ] 34 | ) 35 | assert seq.items() == [{"foo:привет": 1.0, "ключ": 1.0, "привет:мир": 1.0}] 36 | 37 | 38 | @pytest.mark.xfail() 39 | def test_bad(): 40 | with pytest.raises(ValueError): 41 | seq = pycrfsuite.ItemSequence("foo") 42 | print(seq.items()) 43 | 44 | with pytest.raises(ValueError): 45 | seq = pycrfsuite.ItemSequence([[{"foo": "bar"}]]) 46 | print(seq.items()) 47 | 48 | 49 | def test_nested(): 50 | seq = pycrfsuite.ItemSequence( 51 | [ 52 | { 53 | "foo": { 54 | "bar": "baz", 55 | "spam": 0.5, 56 | "egg": ["x", "y"], 57 | "ham": {"x": -0.5, "y": -0.1}, 58 | }, 59 | }, 60 | { 61 | "foo": {"bar": "ham", "spam": -0.5, "ham": {"x", "y"}}, 62 | }, 63 | ] 64 | ) 65 | assert len(seq) == 2 66 | assert seq.items() == [ 67 | { 68 | "foo:bar:baz": 1.0, 69 | "foo:spam": 0.5, 70 | "foo:egg:x": 1.0, 71 | "foo:egg:y": 1.0, 72 | "foo:ham:x": -0.5, 73 | "foo:ham:y": -0.1, 74 | }, 75 | { 76 | "foo:bar:ham": 1.0, 77 | "foo:spam": -0.5, 78 | "foo:ham:x": 1.0, 79 | 
"foo:ham:y": 1.0, 80 | }, 81 | ] 82 | assert pycrfsuite.ItemSequence(seq.items()).items() == seq.items() 83 | -------------------------------------------------------------------------------- /pycrfsuite/crfsuite_api.pxd: -------------------------------------------------------------------------------- 1 | from libcpp.string cimport string 2 | from libcpp.vector cimport vector 3 | 4 | cdef extern from "../crfsuite/include/crfsuite.h": 5 | ctypedef enum: 6 | CRFSUITE_SUCCESS 7 | CRFSUITEERR_UNKNOWN # Unknown error occurred. 8 | CRFSUITEERR_OUTOFMEMORY # Insufficient memory. 9 | CRFSUITEERR_NOTSUPPORTED # Unsupported operation. 10 | CRFSUITEERR_INCOMPATIBLE # Incompatible data. 11 | CRFSUITEERR_INTERNAL_LOGIC # Internal error. 12 | CRFSUITEERR_OVERFLOW # Overflow. 13 | CRFSUITEERR_NOTIMPLEMENTED # Not implemented. 14 | 15 | 16 | cdef extern from "../crfsuite/include/crfsuite_api.hpp" namespace "CRFSuite": 17 | cdef cppclass Attribute: 18 | string attr 19 | double value 20 | 21 | Attribute() 22 | Attribute(string) 23 | Attribute(string, double) 24 | 25 | ctypedef vector[Attribute] Item 26 | ctypedef vector[Item] ItemSequence 27 | ctypedef vector[string] StringList 28 | 29 | cdef string version() 30 | 31 | 32 | cdef extern from "trainer_wrapper.hpp" namespace "CRFSuiteWrapper": 33 | 34 | ctypedef object (*messagefunc)(object self, string message) 35 | 36 | cdef cppclass Trainer: 37 | Trainer() except + 38 | void set_handler(object, messagefunc) except + 39 | void clear() except + 40 | void append(ItemSequence, StringList, int) except + 41 | bint select(string, string) except + 42 | int train(string, int) except + 43 | StringList params() except + 44 | void set(string, string) except + 45 | string get(string) except + 46 | string help(string) except + 47 | void _init_hack() except + 48 | 49 | 50 | cdef extern from "tagger_wrapper.hpp" namespace "CRFSuiteWrapper": 51 | 52 | ctypedef object (*messagefunc)(object self, string message) 53 | 54 | cdef cppclass Tagger: 55 | 
Tagger() except + 56 | int open(string) except + 57 | int open(const void*, size_t) except + 58 | void close() except + 59 | StringList labels() except + 60 | StringList tag(ItemSequence) except + 61 | void set(ItemSequence) except + 62 | StringList viterbi() except + 63 | double probability(StringList) except + 64 | double marginal(string, int) except + 65 | void dump(int) except + 66 | void dump2() except + 67 | -------------------------------------------------------------------------------- /pycrfsuite/_dumpparser.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class ParsedDump: 5 | """ 6 | CRFsuite model parameters. Objects of this type are returned by 7 | :meth:`pycrfsuite.Tagger.info()` method. 8 | 9 | Attributes 10 | ---------- 11 | 12 | transitions : dict 13 | ``{(from_label, to_label): weight}`` dict with learned transition weights 14 | 15 | state_features : dict 16 | ``{(attribute, label): weight}`` dict with learned ``(attribute, label)`` weights 17 | 18 | header : dict 19 | Metadata from the file header 20 | 21 | labels : dict 22 | ``{name: internal_id}`` dict with model labels 23 | 24 | attributes : dict 25 | ``{name: internal_id}`` dict with known attributes 26 | 27 | """ 28 | 29 | def __init__(self): 30 | self.header = {} 31 | self.labels = {} 32 | self.attributes = {} 33 | self.transitions = {} 34 | self.state_features = {} 35 | 36 | 37 | class CRFsuiteDumpParser: 38 | """ 39 | A hack: parser for `crfsuite dump` results. 40 | 41 | Obtaining coefficients "the proper way" is quite hard otherwise 42 | because in CRFsuite they are hidden in private structures. 43 | """ 44 | 45 | def __init__(self): 46 | self.state = None 47 | self.result = ParsedDump() 48 | 49 | def feed(self, line): 50 | # Strip initial ws and line terminator, but allow for ws at the end of feature names. 
51 | line = line.lstrip().rstrip("\r\n") 52 | if not line: 53 | return 54 | 55 | m = re.match( 56 | r"(FILEHEADER|LABELS|ATTRIBUTES|TRANSITIONS|STATE_FEATURES) = {", line 57 | ) 58 | if m: 59 | self.state = m.group(1) 60 | elif line == "}": 61 | self.state = None 62 | else: 63 | getattr(self, "parse_%s" % self.state)(line) 64 | 65 | def parse_FILEHEADER(self, line): 66 | m = re.match(r"(\w+): (.*)", line) 67 | self.result.header[m.group(1)] = m.group(2) 68 | 69 | def parse_LABELS(self, line): 70 | m = re.match(r"(\d+): (.*)", line) 71 | self.result.labels[m.group(2)] = m.group(1) 72 | 73 | def parse_ATTRIBUTES(self, line): 74 | m = re.match(r"(\d+): (.*)", line) 75 | self.result.attributes[m.group(2)] = m.group(1) 76 | 77 | def parse_TRANSITIONS(self, line): 78 | m = re.match(r"\(\d+\) (.+) --> (.+): ([+-]?\d+\.\d+)", line) 79 | from_, to_ = m.group(1), m.group(2) 80 | assert from_ in self.result.labels 81 | assert to_ in self.result.labels 82 | self.result.transitions[(from_, to_)] = float(m.group(3)) 83 | 84 | def parse_STATE_FEATURES(self, line): 85 | m = re.match(r"\(\d+\) (.+) --> (.+): ([+-]?\d+\.\d+)", line) 86 | attr, label = m.group(1), m.group(2) 87 | assert attr in self.result.attributes 88 | assert label in self.result.labels 89 | self.result.state_features[(attr, label)] = float(m.group(3)) 90 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | python-crfsuite 3 | =============== 4 | 5 | .. image:: https://github.com/scrapinghub/python-crfsuite/actions/workflows/tests.yml/badge.svg 6 | :target: https://github.com/scrapinghub/python-crfsuite/actions/workflows/tests.yml 7 | 8 | .. image:: https://img.shields.io/pypi/v/python-crfsuite.svg?style=flat-square 9 | :target: https://pypi.python.org/pypi/python-crfsuite 10 | :alt: pypi Version 11 | 12 | .. 
image:: https://anaconda.org/conda-forge/python-crfsuite/badges/version.svg 13 | :target: https://anaconda.org/conda-forge/python-crfsuite 14 | :alt: conda Version 15 | 16 | python-crfsuite is a python binding to CRFsuite_. 17 | 18 | Installation 19 | ============ 20 | 21 | Using ``pip``:: 22 | 23 | pip install python-crfsuite 24 | 25 | Using ``conda``:: 26 | 27 | conda install -c conda-forge python-crfsuite 28 | 29 | Usage 30 | ===== 31 | 32 | See docs_ and an example_. 33 | 34 | .. _docs: http://python-crfsuite.rtfd.org/ 35 | .. _example: https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb 36 | 37 | See Also 38 | ======== 39 | 40 | sklearn-crfsuite_ is a python-crfsuite wrapper which provides 41 | API similar to scikit-learn. 42 | 43 | .. _sklearn-crfsuite: https://github.com/TeamHG-Memex/sklearn-crfsuite 44 | 45 | Contributing 46 | ============ 47 | 48 | * Source code: https://github.com/scrapinghub/python-crfsuite 49 | * Issue tracker: https://github.com/scrapinghub/python-crfsuite/issues 50 | 51 | Feel free to submit ideas, bugs reports, pull requests or regular patches. 52 | 53 | Please don't commit generated cpp files in the same commit as other files. 54 | 55 | .. _Cython: http://cython.org/ 56 | .. _tox: http://tox.testrun.org 57 | 58 | Authors and Contributors 59 | ======================== 60 | 61 | Original authors are Terry Peng and 62 | Mikhail Korobov . Many other people contributed; 63 | some of them can be found at github Contributors_ page. 64 | 65 | Bundled CRFSuite_ C/C++ library is by Naoaki Okazaki & contributors. 66 | 67 | .. _Contributors: https://github.com/scrapinghub/python-crfsuite/graphs/contributors 68 | 69 | License 70 | ======= 71 | 72 | python-crfsuite is licensed under MIT license. 73 | CRFsuite_ library is licensed under BSD license. 74 | 75 | .. 
_CRFsuite: https://github.com/chokkan/crfsuite 76 | 77 | Alternatives 78 | ============ 79 | 80 | * https://github.com/chokkan/crfsuite/tree/master/swig/python - official 81 | Python wrapper, exposes C++ API using SWIG. 82 | * https://github.com/jakevdp/pyCRFsuite - uses C API instead of C++ API; 83 | allows to use scipy sparse matrices as an input. At the time of writing 84 | it is unmaintained. 85 | * https://github.com/bosondata/crfsuite-rs - uses a Rust wrapper with CFFI instead of C++ API; 86 | allows to tag with GIL released for better performance. 87 | 88 | This package (python-crfsuite) wraps CRFsuite C++ API using Cython. 89 | It is faster than official SWIG wrapper and has a simpler codebase than 90 | a more advanced pyCRFsuite. python-crfsuite works in Python 2 and Python 3, 91 | doesn't have external dependencies (CRFsuite is bundled, numpy/scipy stack 92 | is not needed) and workarounds some of the issues with C++ CRFsuite library. 93 | -------------------------------------------------------------------------------- /tests/test_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from pycrfsuite import Trainer 6 | 7 | 8 | def test_trainer(tmpdir, xseq, yseq): 9 | trainer = Trainer("lbfgs") 10 | trainer.append(xseq, yseq) 11 | 12 | model_filename = str(tmpdir.join("model.crfsuite")) 13 | assert not os.path.isfile(model_filename) 14 | trainer.train(model_filename) 15 | assert os.path.isfile(model_filename) 16 | 17 | 18 | def test_trainer_noselect(tmpdir, xseq, yseq): 19 | # This shouldn't segfault; see https://github.com/chokkan/crfsuite/pull/21 20 | trainer = Trainer() 21 | trainer.append(xseq, yseq) 22 | model_filename = str(tmpdir.join("model.crfsuite")) 23 | trainer.train(model_filename) 24 | 25 | 26 | def test_trainer_noappend(tmpdir): 27 | # This shouldn't segfault; see https://github.com/chokkan/crfsuite/pull/21 28 | trainer = Trainer() 29 | trainer.select("lbfgs") 
30 | model_filename = str(tmpdir.join("model.crfsuite")) 31 | trainer.train(model_filename) 32 | 33 | 34 | def test_trainer_noselect_noappend(tmpdir): 35 | # This shouldn't segfault; see https://github.com/chokkan/crfsuite/pull/21 36 | trainer = Trainer() 37 | model_filename = str(tmpdir.join("model.crfsuite")) 38 | trainer.train(model_filename) 39 | 40 | 41 | def test_training_messages(tmpdir, xseq, yseq): 42 | class CapturingTrainer(Trainer): 43 | def __init__(self): 44 | self.messages = [] 45 | 46 | def message(self, message): 47 | self.messages.append(message) 48 | 49 | trainer = CapturingTrainer() 50 | trainer.select("lbfgs") 51 | trainer.append(xseq, yseq) 52 | assert not trainer.messages 53 | 54 | model_filename = str(tmpdir.join("model.crfsuite")) 55 | trainer.train(model_filename) 56 | assert trainer.messages 57 | assert "type: CRF1d\n" in trainer.messages 58 | # print("".join(trainer.messages)) 59 | 60 | 61 | def test_training_messages_exception(tmpdir, xseq, yseq): 62 | class MyException(Exception): 63 | pass 64 | 65 | class BadTrainer(Trainer): 66 | def message(self, message): 67 | raise MyException("error") 68 | 69 | trainer = BadTrainer() 70 | trainer.select("lbfgs") 71 | trainer.append(xseq, yseq) 72 | 73 | model_filename = str(tmpdir.join("model.crfsuite")) 74 | 75 | with pytest.raises(MyException): 76 | trainer.train(model_filename) 77 | 78 | 79 | def test_trainer_select_raises_error(): 80 | trainer = Trainer() 81 | with pytest.raises(ValueError): 82 | trainer.select("foo") 83 | 84 | 85 | @pytest.mark.parametrize( 86 | "algo", 87 | [ 88 | "lbfgs", 89 | "l2sgd", 90 | "ap", 91 | "averaged-perceptron", 92 | "pa", 93 | "passive-aggressive", 94 | "arow", 95 | ], 96 | ) 97 | def test_algorithm_parameters(algo): 98 | trainer = Trainer(algo) 99 | params = trainer.get_params() 100 | assert params 101 | 102 | # set the same values 103 | trainer.set_params(params) 104 | params2 = trainer.get_params() 105 | assert params2 == params 106 | 107 | # change a value 
108 | trainer.set("feature.possible_states", True) 109 | assert trainer.get_params()["feature.possible_states"] is True 110 | 111 | trainer.set("feature.possible_states", False) 112 | assert trainer.get_params()["feature.possible_states"] is False 113 | 114 | # invalid parameter 115 | params["foo"] = 5 116 | with pytest.raises(ValueError): 117 | trainer.set_params(params) 118 | 119 | 120 | def test_params_and_help(): 121 | trainer = Trainer() 122 | 123 | trainer.select("lbfgs") 124 | assert "c1" in trainer.params() 125 | assert "c2" in trainer.params() 126 | assert "num_memories" in trainer.params() 127 | assert "L1" in trainer.help("c1") 128 | 129 | trainer.select("l2sgd") 130 | assert "c2" in trainer.params() 131 | assert "c1" not in trainer.params() 132 | assert "L2" in trainer.help("c2") 133 | 134 | 135 | def test_help_invalid_parameter(): 136 | trainer = Trainer() 137 | trainer.select("l2sgd") 138 | 139 | # This segfaults without a workaround; 140 | # see https://github.com/chokkan/crfsuite/pull/21 141 | with pytest.raises(ValueError): 142 | trainer.help("foo") 143 | 144 | with pytest.raises(ValueError): 145 | trainer.help("c1") 146 | 147 | 148 | def test_get_parameter(): 149 | trainer = Trainer() 150 | trainer.select("l2sgd") 151 | assert abs(trainer.get("c2") - 0.1) > 1e-6 152 | trainer.set("c2", 0.1) 153 | assert abs(trainer.get("c2") - 0.1) < 1e-6 154 | 155 | 156 | def test_set_parameters_in_constructor(): 157 | trainer = Trainer(params={"c2": 100}) 158 | assert abs(trainer.get("c2") - 100) < 1e-6 159 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | Changes 2 | ======= 3 | 4 | 0.9.9 (2023-02-01) 5 | ------------------ 6 | 7 | * Python 3.11 Support 8 | 9 | 0.9.7 (2020-03-15) 10 | ------------------ 11 | 12 | * Python 3.4 is no longer supported (it may work, but CI is disabled) 13 | * Python 3.8 support 14 | * fixed 
installation issues on OS X (thanks @kvinwang) 15 | * make it easier for distributions to have a reproducible build 16 | (thanks @bmwiedemann) 17 | 18 | 0.9.6 (2018-08-01) 19 | ------------------ 20 | 21 | * Python 3.7 support (thanks @fgregg, @danmacnaughtan and @fuhrysteve). 22 | * Python 3.3 support is dropped. 23 | * new Tagger.open_inmemory method which allows to load tagger data 24 | without having a file on-disk (thanks @lucywang000). 25 | * license information is added to setup.py (thanks @nils-werner). 26 | 27 | 0.9.5 (2017-09-05) 28 | ------------------ 29 | 30 | * Python 3.6 wheels for Windows (thanks @fgregg). 31 | 32 | 0.9.4 (2017-09-04) 33 | ------------------ 34 | 35 | * Packaging fix (thanks @fgregg). 36 | 37 | 0.9.3 (2017-09-03) 38 | ------------------ 39 | 40 | * Fixed compatibility with Python 3.5+ on Windows (thanks @fgregg); 41 | * CRFSuite C++ library is updated to latest version, this fixes several 42 | memory leaks and improves performance (thanks @fgregg); 43 | * extension is rebuilt with Cython 0.26.1. 44 | 45 | 0.9.2 (2017-05-04) 46 | ------------------ 47 | 48 | * binary wheels for OS X and Linux (thanks @jeancochrane). 49 | 50 | 0.9.1 (2016-12-19) 51 | ------------------ 52 | 53 | This is a release without changes in functionality. 54 | 55 | * Repository is moved to https://github.com/scrapinghub/python-crfsuite; 56 | * We're now providing Windows wheels for Python 2.7, 3.3. and 3.4. 57 | 58 | 0.9 (2016-12-08) 59 | ---------------- 60 | 61 | * Python 2.6 support is dropped; 62 | * CRFSuite C++ library is updated to a more recent commit; 63 | * improved Windows support (thanks @fgregg); 64 | * fixed building with gcc < 5.0.0 (thanks @kantan2015); 65 | * extension is rebuilt with Cython 0.25.1; this improves PyPy compatibility 66 | (but we're not quite there yet). 67 | * docs: trainer.logparser example is added to the notebook (thanks @samgalen). 
68 | 69 | 0.8.4 (2015-11-25) 70 | ------------------ 71 | 72 | * the wrapper is rebuilt with Cython 0.23.4; 73 | * declared Python 3.5 compatibility; 74 | * fixed an issue with feature names ending with white spaces. 75 | 76 | 0.8.3 (2015-04-24) 77 | ------------------ 78 | 79 | * fix build on Windows. (thanks @fgregg) 80 | 81 | 0.8.2 (2015-02-04) 82 | ------------------ 83 | 84 | * memory leak is fixed by updating the bundled CRFsuite C++ library; 85 | * the wrapper is rebuilt with Cython 0.21.2. 86 | 87 | 0.8.1 (2014-10-10) 88 | ------------------ 89 | 90 | * fix packaging issues with 0.8 release. 91 | 92 | 0.8 (2014-10-10) 93 | ---------------- 94 | 95 | * :class:`~ItemSequence` wrapper is added; 96 | * tox tests are fixed. 97 | 98 | 0.7 (2014-08-11) 99 | ---------------- 100 | 101 | * More data formats for ``xseq``: ``{"prefix": {feature_dict}}`` and 102 | ``{"key": set(["key1",...])}`` feature dicts are now accepted by 103 | :class:`pycrfsuite.Trainer` and :class:`pycrfsuite.Tagger`; 104 | * feature separator changed from "=" to ":" (it looks better in case of 105 | multi-level features); 106 | * small doc and README fixes. 107 | 108 | 109 | 0.6.1 (2014-06-06) 110 | ------------------ 111 | 112 | * Switch to setuptools; 113 | * wheels are uploaded to pypi for faster installation. 114 | 115 | 0.6 (2014-05-29) 116 | ---------------- 117 | 118 | * More data formats for ``xseq``: ``{"key": "value"}`` and 119 | ``{"key": bool_value}`` feature dicts are now accepted by 120 | :class:`pycrfsuite.Trainer` and :class:`pycrfsuite.Tagger`. 121 | 122 | 0.5 (2014-05-27) 123 | ---------------- 124 | 125 | * Exceptions in logging message handlers are now propagated and raised. This 126 | allows, for example, to stop training earlier by pressing Ctrl-C. 
127 | 128 | * It is now possible to customize :class:`pycrfsuite.Trainer` logging 129 | more easily by overriding the following methods: 130 | :meth:`pycrfsuite.Trainer.on_start`, 131 | :meth:`pycrfsuite.Trainer.on_featgen_progress`, 132 | :meth:`pycrfsuite.Trainer.on_featgen_end`, 133 | :meth:`pycrfsuite.Trainer.on_prepared`, 134 | :meth:`pycrfsuite.Trainer.on_prepare_error`, 135 | :meth:`pycrfsuite.Trainer.on_iteration`, 136 | :meth:`pycrfsuite.Trainer.on_optimization_end` 137 | :meth:`pycrfsuite.Trainer.on_end`. The feature is implemented by parsing 138 | CRFsuite log. There is :class:`pycrfsuite.BaseTrainer` that is not 139 | doing this. 140 | 141 | 0.4.1 (2014-05-18) 142 | ------------------ 143 | 144 | * :meth:`pycrfsuite.Tagger.info()` is fixed. 145 | 146 | 0.4 (2014-05-16) 147 | ---------------- 148 | 149 | * (backwards-incompatible) training parameters are now passed 150 | using ``params`` argument of :class:`pycrfsuite.Trainer` constructor 151 | instead of ``**kwargs``; 152 | * (backwards-incompatible) logging support is dropped; 153 | * `verbose` argument for :class:`pycrfsuite.Trainer` constructor; 154 | * :meth:`pycrfsuite.Trainer.get_params` and 155 | :meth:`pycrfsuite.Trainer.set_params` for getting/setting multiple training 156 | parameters at once; 157 | * string handling in Python 3.x is fixed by rebuilding the wrapper with 158 | Cython 0.21dev; 159 | * algorithm names are normalized to support names used 160 | by crfsuite console utility and documented in crfsuite manual; 161 | * type conversion for training parameters is fixed: ``feature.minfreq`` 162 | now works, and boolean arguments become boolean. 163 | 164 | 0.3 (2014-05-14) 165 | ---------------- 166 | 167 | python-crfsuite now detects the feature format (dict vs list of strings) 168 | automatically - it turns out the performance overhead is negligible. 
169 | 170 | * ``Trainer.append_stringslists`` and ``Trainer.append_dicts`` methods 171 | are replaced with a single :meth:`pycrfsuite.Trainer.append` method; 172 | * ``Tagger.set_stringlists`` and ``Tagger.set_dicts`` methods are 173 | removed in favor of :meth:`pycrfsuite.Tagger.set` method; 174 | * ``feature_format`` arguments in :class:`pycrfsuite.Tagger` methods 175 | and constructor are dropped. 176 | 177 | 0.2 (2014-05-14) 178 | ---------------- 179 | 180 | * :meth:`pycrfsuite.Tagger.dump()` and :meth:`pycrfsuite.Tagger.info()` 181 | methods for model debugging; 182 | * a memory leak in Trainer is fixed (trainer instances were never 183 | garbage collected); 184 | * documentation and testing improvements. 185 | 186 | 0.1 (2014-04-30) 187 | ---------------- 188 | 189 | Many changes; python-crfsuite is almost rewritten. 190 | 191 | 0.0.1 (2014-04-24) 192 | ------------------ 193 | 194 | Initial release. 195 | -------------------------------------------------------------------------------- /pycrfsuite/_logparser.py: -------------------------------------------------------------------------------- 1 | import re 2 | import fractions 3 | from collections import namedtuple 4 | 5 | LabelScore = namedtuple("LabelScore", "match model ref precision recall f1") 6 | 7 | 8 | class TrainLogParser: 9 | def __init__(self): 10 | self.state = None 11 | self.featgen_percent = -2 12 | self.featgen_num_features = None 13 | self.featgen_seconds = None 14 | self.training_seconds = None 15 | self.storing_seconds = None 16 | 17 | self.iterations = [] 18 | self.last_iteration = None 19 | self.log = [] 20 | self.events = [] 21 | 22 | def feed(self, line): 23 | # if line != '\n': 24 | self.log.append(line) 25 | if self.state is None: 26 | self.state = "STARTING" 27 | self.handle_STARTING(line) 28 | self.events.append(("start", 0, len(self.log))) 29 | return "start" 30 | event = getattr(self, "handle_" + self.state)(line) 31 | if event is not None: 32 | start, end = self.events[-1][2], 
len(self.log) 33 | if event in ("prepared", "optimization_end"): 34 | end -= 1 35 | self.events.append((event, start, end)) 36 | return event 37 | 38 | @property 39 | def last_log(self): 40 | event, start, end = self.events[-1] 41 | return "".join(self.log[start:end]) 42 | 43 | def handle_STARTING(self, line): 44 | if line.startswith("Feature generation"): 45 | self.state = "FEATGEN" 46 | 47 | def handle_FEATGEN(self, line): 48 | if line in "0123456789.10": 49 | self.featgen_percent += 2 50 | return "featgen_progress" 51 | 52 | m = re.match(r"Number of features: (\d+)", line) 53 | if m: 54 | self.featgen_num_features = int(m.group(1)) 55 | return None 56 | 57 | if self._seconds(line) is not None: 58 | self.featgen_seconds = self._seconds(line) 59 | self.state = "AFTER_FEATGEN" 60 | return "featgen_end" 61 | 62 | def handle_AFTER_FEATGEN(self, line): 63 | if self._iteration_head(line) is not None: 64 | self.state = "ITERATION" 65 | self.handle_ITERATION(line) 66 | return "prepared" 67 | 68 | if "terminated with error" in line: 69 | self.state = "AFTER_ITERATION" 70 | return "prepare_error" 71 | 72 | def handle_ITERATION(self, line): 73 | if self._iteration_head(line) is not None: 74 | self.last_iteration = { 75 | "num": self._iteration_head(line), 76 | "scores": {}, 77 | } 78 | self.iterations.append(self.last_iteration) 79 | elif line == "\n": 80 | self.state = "AFTER_ITERATION" 81 | return "iteration" 82 | 83 | def add_re(key, pattern, typ): 84 | m = re.match(pattern, line) 85 | if m: 86 | self.last_iteration[key] = typ(m.group(1)) 87 | 88 | add_re("loss", r"Loss: (\d+\.\d+)", float) 89 | add_re("feature_norm", r"Feature norm: (\d+\.\d+)", float) 90 | add_re("error_norm", r"Error norm: (\d+\.\d+)", float) 91 | add_re("active_features", r"Active features: (\d+)", int) 92 | add_re("linesearch_trials", r"Line search trials: (\d+)", int) 93 | add_re("linesearch_step", r"Line search step: (\d+\.\d+)", float) 94 | add_re("time", r"Seconds required for this iteration: 
(\d+\.\d+)", float) 95 | 96 | m = re.match( 97 | r"Macro-average precision, recall, F1: \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)", 98 | line, 99 | ) 100 | if m: 101 | self.last_iteration["avg_precision"] = float(m.group(1)) 102 | self.last_iteration["avg_recall"] = float(m.group(2)) 103 | self.last_iteration["avg_f1"] = float(m.group(3)) 104 | 105 | m = re.match(r"Item accuracy: (\d+) / (\d+)", line) 106 | if m: 107 | acc = fractions.Fraction(int(m.group(1)), int(m.group(2))) 108 | self.last_iteration["item_accuracy"] = acc 109 | self.last_iteration["item_accuracy_float"] = float(acc) 110 | 111 | m = re.match(r"Instance accuracy: (\d+) / (\d+)", line) 112 | if m: 113 | acc = fractions.Fraction(int(m.group(1)), int(m.group(2))) 114 | self.last_iteration["instance_accuracy"] = acc 115 | self.last_iteration["instance_accuracy_float"] = float(acc) 116 | 117 | m = re.match( 118 | r"\s{4}(.+): \((\d+), (\d+), (\d+)\) \((\d\.\d+), (\d\.\d+), (\d\.\d+)\)", 119 | line, 120 | ) 121 | if m: 122 | self.last_iteration["scores"][m.group(1)] = LabelScore( 123 | **{ 124 | "match": int(m.group(2)), 125 | "model": int(m.group(3)), 126 | "ref": int(m.group(4)), 127 | "precision": float(m.group(5)), 128 | "recall": float(m.group(6)), 129 | "f1": float(m.group(7)), 130 | } 131 | ) 132 | 133 | m = re.match(r"\s{4}(.+): \(0, 0, 0\) \(\*{6}, \*{6}, \*{6}\)", line) 134 | if m: 135 | self.last_iteration["scores"][m.group(1)] = LabelScore( 136 | **{ 137 | "match": 0, 138 | "model": 0, 139 | "ref": 0, 140 | "precision": None, 141 | "recall": None, 142 | "f1": None, 143 | } 144 | ) 145 | 146 | def handle_AFTER_ITERATION(self, line): 147 | if self._iteration_head(line) is not None: 148 | self.state = "ITERATION" 149 | return self.handle_ITERATION(line) 150 | 151 | m = re.match(r"Total seconds required for training: (\d+\.\d+)", line) 152 | if m: 153 | self.training_seconds = float(m.group(1)) 154 | 155 | if line.startswith("Storing the model"): 156 | self.state = "STORING" 157 | return 
"optimization_end" 158 | 159 | def handle_STORING(self, line): 160 | if line == "\n": 161 | return "end" 162 | elif self._seconds(line): 163 | self.storing_seconds = self._seconds(line) 164 | 165 | def _iteration_head(self, line): 166 | m = re.match(r"\*{5} (?:Iteration|Epoch) #(\d+) \*{5}\n", line) 167 | if m: 168 | return int(m.group(1)) 169 | 170 | def _seconds(self, line): 171 | m = re.match(r"Seconds required: (\d+\.\d+)", line) 172 | if m: 173 | return float(m.group(1)) 174 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/python-crfsuite.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/python-crfsuite.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/python-crfsuite" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/python-crfsuite" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 
112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 
163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. 
gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 
92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\python-crfsuite.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\python-crfsuite.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 
155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 
221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /tests/test_tagger.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pycrfsuite import ItemSequence, Tagger, Trainer 4 | 5 | 6 | def test_open_close_labels(model_filename, yseq): 7 | tagger = Tagger() 8 | 9 | with pytest.raises(ValueError): 10 | # tagger should be closed, so labels() method should fail here 11 | labels = tagger.labels() 12 | 13 | with tagger.open(model_filename): 14 | labels = tagger.labels() 15 | assert set(labels) == set(yseq) 16 | 17 | with pytest.raises(ValueError): 18 | # tagger should be closed, so labels() method should fail here 19 | labels = tagger.labels() 20 | 21 | 22 | def test_open_non_existing(): 23 | tagger = Tagger() 24 | with pytest.raises(IOError): 25 | tagger.open("foo") 26 | 27 | 28 | def test_open_invalid(): 29 | tagger = Tagger() 30 | with pytest.raises(ValueError): 31 | tagger.open(__file__) 32 | 33 | 34 | def test_open_invalid_small(tmpdir): 35 | tmp = tmpdir.join("tmp.txt") 36 | tmp.write(b"foo") 37 | tagger = Tagger() 38 | with pytest.raises(ValueError): 39 | tagger.open(str(tmp)) 40 | 41 | 42 | def test_open_invalid_small_with_correct_signature(tmpdir): 43 | tmp = tmpdir.join("tmp.txt") 44 | tmp.write(b"lCRFfoo") 45 | tagger = Tagger() 46 | with 
pytest.raises(ValueError): 47 | tagger.open(str(tmp)) 48 | 49 | 50 | @pytest.mark.xfail(reason="see https://github.com/chokkan/crfsuite/pull/24", run=False) 51 | def test_open_invalid_with_correct_signature(tmpdir): 52 | tmp = tmpdir.join("tmp.txt") 53 | tmp.write(b"lCRFfoo" * 100) 54 | tagger = Tagger() 55 | with pytest.raises(ValueError): 56 | tagger.open(str(tmp)) 57 | 58 | 59 | def test_open_inmemory(model_bytes, xseq, yseq): 60 | with Tagger().open_inmemory(model_bytes) as tagger: 61 | assert tagger.tag(xseq) == yseq 62 | 63 | 64 | def test_open_inmemory_invalid(): 65 | tagger = Tagger() 66 | with pytest.raises(ValueError): 67 | tagger.open_inmemory(b"") 68 | 69 | with pytest.raises(ValueError): 70 | tagger.open_inmemory(b"lCRFabc") 71 | 72 | 73 | @pytest.mark.xfail( 74 | reason="see https://github.com/scrapinghub/python-crfsuite/issues/28", run=False 75 | ) 76 | def test_tag_not_opened(xseq): 77 | tagger = Tagger() 78 | with pytest.raises(Exception): 79 | tagger.tag(xseq) 80 | 81 | 82 | def test_tag(model_filename, xseq, yseq): 83 | with Tagger().open(model_filename) as tagger: 84 | assert tagger.tag(xseq) == yseq 85 | 86 | 87 | def test_tag_item_sequence(model_filename, xseq, yseq): 88 | with Tagger().open(model_filename) as tagger: 89 | assert tagger.tag(ItemSequence(xseq)) == yseq 90 | 91 | 92 | def test_tag_string_lists(model_filename, xseq, yseq): 93 | with Tagger().open(model_filename) as tagger: 94 | # Working with lists is supported, 95 | # but if we discard weights the results become different 96 | data = [x.keys() for x in xseq] 97 | assert tagger.tag(data) != yseq 98 | 99 | 100 | def test_tag_bools(model_filename, xseq, yseq): 101 | with Tagger().open(model_filename) as tagger: 102 | # Some values are bools: 103 | # True <=> 1.0; False <=> 0.0 104 | data = [ 105 | {k: bool(v) if v == 0 or v == 1 else v for (k, v) in x.items()} 106 | for x in xseq 107 | ] 108 | assert tagger.tag(data) == yseq 109 | 110 | 111 | def test_tag_formats(tmpdir, xseq, 
yseq): 112 | # make all coefficients 1 and check that results are the same 113 | model_filename = str(tmpdir.join("model.crfsuite")) 114 | xseq = [{key: 1 for key in x} for x in xseq] 115 | 116 | trainer = Trainer() 117 | trainer.set("c2", 1e-6) # make sure model overfits 118 | trainer.append(xseq, yseq) 119 | trainer.train(model_filename) 120 | 121 | with Tagger().open(model_filename) as tagger: 122 | assert tagger.tag(xseq) == yseq 123 | 124 | # strings 125 | with Tagger().open(model_filename) as tagger: 126 | data = [x.keys() for x in xseq] 127 | assert tagger.tag(data) == yseq 128 | 129 | 130 | @pytest.mark.xfail() 131 | @pytest.mark.parametrize( 132 | "bad_seq", 133 | [ 134 | "foo", 135 | ["foo"], # should be a list of lists of strings 136 | [[{"foo": 1.0}]], # should be a list of dicts 137 | ], 138 | ) 139 | def test_tag_invalid_feature_format(model_filename, bad_seq): 140 | with Tagger().open(model_filename) as tagger: 141 | with pytest.raises(ValueError): 142 | tagger.tag(bad_seq) 143 | 144 | 145 | def test_tag_probability(model_filename, xseq, yseq): 146 | with Tagger().open(model_filename) as tagger: 147 | res = tagger.tag(xseq) 148 | prob = tagger.probability(res) 149 | prob2 = tagger.probability([yseq[0]] * len(yseq)) 150 | assert prob > prob2 151 | assert 0 < prob < 1 152 | assert 0 < prob2 < 1 153 | 154 | 155 | def test_dump(tmpdir, model_filename): 156 | with Tagger().open(model_filename) as tagger: 157 | dump_filename = str(tmpdir.join("dump.txt")) 158 | tagger.dump(dump_filename) 159 | 160 | with open(dump_filename, "rb") as f: 161 | res = f.read().decode("utf8") 162 | assert "LABELS = {" in res 163 | assert "солнце:не светит --> rainy:" in res 164 | 165 | # it shouldn't segfault on a closed tagger 166 | with pytest.raises(RuntimeError): 167 | tagger.dump(dump_filename) 168 | 169 | 170 | def test_info(model_filename): 171 | with Tagger().open(model_filename) as tagger: 172 | res = tagger.info() 173 | 174 | assert res.transitions[("sunny", "sunny")] 
> res.transitions[("sunny", "rainy")] 175 | assert ( 176 | res.state_features[("walk", "sunny")] 177 | > res.state_features[("walk", "rainy")] 178 | ) 179 | assert ("солнце:не светит", "rainy") in res.state_features 180 | assert res.header["num_labels"] == "3" 181 | assert set(res.labels.keys()) == set(["sunny", "rainy", "好"]) 182 | assert set(res.attributes.keys()) == set( 183 | ["shop", "walk", "clean", "солнце:не светит", "world"] 184 | ) 185 | 186 | # it shouldn't segfault on a closed tagger 187 | with pytest.raises(RuntimeError): 188 | tagger.info() 189 | 190 | 191 | def test_append_strstr_dicts(tmpdir): 192 | trainer = Trainer() 193 | trainer.append( 194 | [{"foo": "bar"}, {"baz": False}, {"foo": "bar", "baz": True}, {"baz": 0.2}], 195 | ["spam", "egg", "spam", "spam"], 196 | ) 197 | model_filename = str(tmpdir.join("model.crfsuite")) 198 | trainer.train(model_filename) 199 | 200 | with Tagger().open(model_filename) as tagger: 201 | info = tagger.info() 202 | assert set(info.attributes.keys()) == {"foo:bar", "baz"} 203 | assert info.state_features[("foo:bar", "spam")] > 0 204 | 205 | 206 | def test_append_nested_dicts(tmpdir): 207 | trainer = Trainer() 208 | trainer.append( 209 | [ 210 | { 211 | "foo": { 212 | "bar": "baz", 213 | "spam": 0.5, 214 | "egg": ["x", "y"], 215 | "ham": {"x": -0.5, "y": -0.1}, 216 | }, 217 | }, 218 | { 219 | "foo": {"bar": "ham", "spam": -0.5, "ham": {"x", "y"}}, 220 | }, 221 | ], 222 | ["first", "second"], 223 | ) 224 | model_filename = str(tmpdir.join("model.crfsuite")) 225 | trainer.train(model_filename) 226 | 227 | with Tagger().open(model_filename) as tagger: 228 | info = tagger.info() 229 | assert set(info.attributes.keys()) == { 230 | "foo:bar:baz", 231 | "foo:spam", 232 | "foo:egg:x", 233 | "foo:egg:y", 234 | "foo:ham:x", 235 | "foo:ham:y", 236 | "foo:bar:ham", 237 | } 238 | 239 | for feat in ["foo:bar:baz", "foo:spam", "foo:egg:x", "foo:egg:y"]: 240 | assert info.state_features[(feat, "first")] > 0 241 | assert 
info.state_features.get((feat, "second"), 0) <= 0 242 | 243 | for feat in ["foo:bar:ham", "foo:ham:x", "foo:ham:y"]: 244 | assert info.state_features[(feat, "second")] > 0 245 | assert info.state_features.get((feat, "first"), 0) <= 0 246 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # python-crfsuite documentation build configuration file, created by 4 | # sphinx-quickstart on Sun Apr 27 15:19:14 2014. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | #sys.path.insert(0, os.path.abspath('.')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx.ext.autodoc', 33 | 'sphinx.ext.viewcode', 34 | 'numpydoc', 35 | ] 36 | numpydoc_show_class_members = False 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # The suffix of source filenames. 42 | source_suffix = '.rst' 43 | 44 | # The encoding of source files. 
45 | #source_encoding = 'utf-8-sig' 46 | 47 | # The master toctree document. 48 | master_doc = 'index' 49 | 50 | # General information about the project. 51 | project = u'python-crfsuite' 52 | copyright = u'2020, Terry Peng, Mikhail Korobov' 53 | 54 | # The version info for the project you're documenting, acts as replacement for 55 | # |version| and |release|, also used in various other places throughout the 56 | # built documents. 57 | # 58 | # The short X.Y version. 59 | version = '0.9' 60 | # The full version, including alpha/beta/rc tags. 61 | release = '0.9.7' 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | #language = None 66 | 67 | # There are two options for replacing |today|: either, you set today to some 68 | # non-false value, then it is used: 69 | #today = '' 70 | # Else, today_fmt is used as the format for a strftime call. 71 | #today_fmt = '%B %d, %Y' 72 | 73 | # List of patterns, relative to source directory, that match files and 74 | # directories to ignore when looking for source files. 75 | exclude_patterns = ['_build'] 76 | 77 | # The reST default role (used for this markup: `text`) to use for all 78 | # documents. 79 | #default_role = None 80 | 81 | # If true, '()' will be appended to :func: etc. cross-reference text. 82 | #add_function_parentheses = True 83 | 84 | # If true, the current module name will be prepended to all description 85 | # unit titles (such as .. function::). 86 | #add_module_names = True 87 | 88 | # If true, sectionauthor and moduleauthor directives will be shown in the 89 | # output. They are ignored by default. 90 | #show_authors = False 91 | 92 | # The name of the Pygments (syntax highlighting) style to use. 93 | pygments_style = 'sphinx' 94 | 95 | # A list of ignored prefixes for module index sorting. 96 | #modindex_common_prefix = [] 97 | 98 | # If true, keep warnings as "system message" paragraphs in the built documents. 
99 | #keep_warnings = False 100 | 101 | 102 | # -- Options for HTML output ---------------------------------------------- 103 | 104 | # The theme to use for HTML and HTML Help pages. See the documentation for 105 | # a list of builtin themes. 106 | html_theme = 'default' 107 | 108 | # Theme options are theme-specific and customize the look and feel of a theme 109 | # further. For a list of options available for each theme, see the 110 | # documentation. 111 | #html_theme_options = {} 112 | 113 | # Add any paths that contain custom themes here, relative to this directory. 114 | #html_theme_path = [] 115 | 116 | # The name for this set of Sphinx documents. If None, it defaults to 117 | # " v documentation". 118 | #html_title = None 119 | 120 | # A shorter title for the navigation bar. Default is the same as html_title. 121 | #html_short_title = None 122 | 123 | # The name of an image file (relative to this directory) to place at the top 124 | # of the sidebar. 125 | #html_logo = None 126 | 127 | # The name of an image file (within the static path) to use as favicon of the 128 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 129 | # pixels large. 130 | #html_favicon = None 131 | 132 | # Add any paths that contain custom static files (such as style sheets) here, 133 | # relative to this directory. They are copied after the builtin static files, 134 | # so a file named "default.css" will overwrite the builtin "default.css". 135 | html_static_path = ['_static'] 136 | 137 | # Add any extra paths that contain custom files (such as robots.txt or 138 | # .htaccess) here, relative to this directory. These files are copied 139 | # directly to the root of the documentation. 140 | #html_extra_path = [] 141 | 142 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 143 | # using the given strftime format. 
144 | #html_last_updated_fmt = '%b %d, %Y' 145 | 146 | # If true, SmartyPants will be used to convert quotes and dashes to 147 | # typographically correct entities. 148 | #html_use_smartypants = True 149 | 150 | # Custom sidebar templates, maps document names to template names. 151 | #html_sidebars = {} 152 | 153 | # Additional templates that should be rendered to pages, maps page names to 154 | # template names. 155 | #html_additional_pages = {} 156 | 157 | # If false, no module index is generated. 158 | #html_domain_indices = True 159 | 160 | # If false, no index is generated. 161 | #html_use_index = True 162 | 163 | # If true, the index is split into individual pages for each letter. 164 | #html_split_index = False 165 | 166 | # If true, links to the reST sources are added to the pages. 167 | #html_show_sourcelink = True 168 | 169 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 170 | #html_show_sphinx = True 171 | 172 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 173 | #html_show_copyright = True 174 | 175 | # If true, an OpenSearch description file will be output, and all pages will 176 | # contain a tag referring to it. The value of this option must be the 177 | # base URL from which the finished HTML is served. 178 | #html_use_opensearch = '' 179 | 180 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 181 | #html_file_suffix = None 182 | 183 | # Output file base name for HTML help builder. 184 | htmlhelp_basename = 'python-crfsuitedoc' 185 | 186 | 187 | # -- Options for LaTeX output --------------------------------------------- 188 | 189 | latex_elements = { 190 | # The paper size ('letterpaper' or 'a4paper'). 191 | #'papersize': 'letterpaper', 192 | 193 | # The font size ('10pt', '11pt' or '12pt'). 194 | #'pointsize': '10pt', 195 | 196 | # Additional stuff for the LaTeX preamble. 197 | #'preamble': '', 198 | } 199 | 200 | # Grouping the document tree into LaTeX files. 
List of tuples 201 | # (source start file, target name, title, 202 | # author, documentclass [howto, manual, or own class]). 203 | latex_documents = [ 204 | ('index', 'python-crfsuite.tex', u'python-crfsuite Documentation', 205 | u'Terry Peng, Mikhail Korobov', 'manual'), 206 | ] 207 | 208 | # The name of an image file (relative to this directory) to place at the top of 209 | # the title page. 210 | #latex_logo = None 211 | 212 | # For "manual" documents, if this is true, then toplevel headings are parts, 213 | # not chapters. 214 | #latex_use_parts = False 215 | 216 | # If true, show page references after internal links. 217 | #latex_show_pagerefs = False 218 | 219 | # If true, show URL addresses after external links. 220 | #latex_show_urls = False 221 | 222 | # Documents to append as an appendix to all manuals. 223 | #latex_appendices = [] 224 | 225 | # If false, no module index is generated. 226 | #latex_domain_indices = True 227 | 228 | 229 | # -- Options for manual page output --------------------------------------- 230 | 231 | # One entry per manual page. List of tuples 232 | # (source start file, name, description, authors, manual section). 233 | man_pages = [ 234 | ('index', 'python-crfsuite', u'python-crfsuite Documentation', 235 | [u'Terry Peng, Mikhail Korobov'], 1) 236 | ] 237 | 238 | # If true, show URL addresses after external links. 239 | #man_show_urls = False 240 | 241 | 242 | # -- Options for Texinfo output ------------------------------------------- 243 | 244 | # Grouping the document tree into Texinfo files. List of tuples 245 | # (source start file, target name, title, author, 246 | # dir menu entry, description, category) 247 | texinfo_documents = [ 248 | ('index', 'python-crfsuite', u'python-crfsuite Documentation', 249 | u'Terry Peng, Mikhail Korobov', 'python-crfsuite', 'Python CRFSuite wrapper.', 250 | 'Miscellaneous'), 251 | ] 252 | 253 | # Documents to append as an appendix to all manuals. 
254 | #texinfo_appendices = [] 255 | 256 | # If false, no module index is generated. 257 | #texinfo_domain_indices = True 258 | 259 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 260 | #texinfo_show_urls = 'footnote' 261 | 262 | # If true, do not generate a @detailmenu in the "Top" node's menu. 263 | #texinfo_no_detailmenu = False 264 | -------------------------------------------------------------------------------- /pycrfsuite/_pycrfsuite.pyx: -------------------------------------------------------------------------------- 1 | # cython: embedsignature=True 2 | # cython: c_string_type=str 3 | # cython: c_string_encoding=utf-8 4 | # cython: profile=False 5 | # distutils: language=c++ 6 | from . cimport crfsuite_api 7 | from libcpp.string cimport string 8 | 9 | import sys 10 | import os 11 | import contextlib 12 | import tempfile 13 | 14 | from pycrfsuite import _dumpparser 15 | from pycrfsuite import _logparser 16 | 17 | CRFSUITE_VERSION = crfsuite_api.version() 18 | 19 | 20 | class CRFSuiteError(Exception): 21 | 22 | _messages = { 23 | crfsuite_api.CRFSUITEERR_UNKNOWN: "Unknown error occurred", 24 | crfsuite_api.CRFSUITEERR_OUTOFMEMORY: "Insufficient memory", 25 | crfsuite_api.CRFSUITEERR_NOTSUPPORTED: "Unsupported operation", 26 | crfsuite_api.CRFSUITEERR_INCOMPATIBLE: "Incompatible data", 27 | crfsuite_api.CRFSUITEERR_INTERNAL_LOGIC: "Internal error", 28 | crfsuite_api.CRFSUITEERR_OVERFLOW: "Overflow", 29 | crfsuite_api.CRFSUITEERR_NOTIMPLEMENTED: "Not implemented", 30 | } 31 | 32 | def __init__(self, code): 33 | self.code = code 34 | Exception.__init__(self._messages.get(self.code, "Unexpected error")) 35 | 36 | 37 | cdef string _SEP = b':' 38 | 39 | cdef extern crfsuite_api.Item to_item(x) except+: 40 | """ Convert a Python object to an Item. 
""" 41 | cdef crfsuite_api.Item c_item 42 | cdef double c_value 43 | cdef string c_key 44 | cdef bint is_dict, is_nested_value 45 | 46 | is_dict = isinstance(x, dict) 47 | c_item = crfsuite_api.Item() 48 | c_item.reserve(len(x)) # at least this amount is required 49 | for key in x: 50 | if isinstance(key, unicode): 51 | c_key = (key).encode('utf8') 52 | else: 53 | c_key = key 54 | 55 | if not is_dict: 56 | # "string_key" 57 | c_value = 1.0 58 | c_item.push_back(crfsuite_api.Attribute(c_key, c_value)) 59 | else: 60 | value = (x)[key] 61 | 62 | if isinstance(value, (dict, list, set)): 63 | # {"string_prefix": {...}} 64 | for attr in to_item(value): 65 | c_item.push_back( 66 | crfsuite_api.Attribute(c_key + _SEP + attr.attr, attr.value) 67 | ) 68 | else: 69 | if isinstance(value, unicode): 70 | # {"string_key": "string_value"} 71 | c_key += _SEP 72 | c_key += (value).encode('utf8') 73 | c_value = 1.0 74 | elif isinstance(value, bytes): 75 | # {"string_key": "string_value"} 76 | c_key += _SEP 77 | c_key += value 78 | c_value = 1.0 79 | else: 80 | # {"string_key": float_value} 81 | # {"string_key": bool} 82 | c_value = value 83 | 84 | c_item.push_back(crfsuite_api.Attribute(c_key, c_value)) 85 | 86 | return c_item 87 | 88 | 89 | cdef extern crfsuite_api.ItemSequence to_seq(pyseq) except+: 90 | """ 91 | Convert an iterable to an ItemSequence. 92 | Elements of an iterable could be: 93 | 94 | * {"string_key": float_value} dicts; 95 | * {"string_key": bool} dicts: True is converted to 1.0, False - to 0.0; 96 | * {"string_key": "string_value"} dicts: result is {"string_key=string_value": 1.0} 97 | * "string_key": result is {"string_key": 1.0} 98 | * {"string_prefix": {...}} nested dicts: nested dict is processed and 99 | "string_prefix" s prepended to each key. 100 | * {"string_prefix": [...]} dicts: nested list is processed and 101 | "string_prefix" s prepended to each key. 
102 | """ 103 | cdef crfsuite_api.ItemSequence c_seq 104 | 105 | if isinstance(pyseq, ItemSequence): 106 | c_seq = (pyseq).c_seq 107 | else: 108 | for x in pyseq: 109 | c_seq.push_back(to_item(x)) 110 | return c_seq 111 | 112 | 113 | cdef class ItemSequence(object): 114 | """ 115 | A wrapper for crfsuite ItemSequence - a class for storing 116 | features for all items in a single sequence. 117 | 118 | Using this class is an alternative to passing data to :class:`~Trainer` 119 | and :class:`Tagger` directly. By using this class it is possible to 120 | save some time if the same input sequence is passed to trainers/taggers 121 | more than once - features won't be processed multiple times. 122 | It also allows to get "processed" features/attributes that are sent 123 | to CRFsuite - they could be helpful e.g. to check which attributes 124 | (returned by :meth:`~Tagger.info`) are active for a given observation. 125 | 126 | Initialize ItemSequence with a list of item features: 127 | 128 | >>> ItemSequence([{'foo': 1, 'bar': 0}, {'foo': 1.5, 'baz': 2}]) 129 | 130 | 131 | Item features could be in one of the following formats: 132 | 133 | * {"string_key": float_weight, ...} dict where keys are 134 | observed features and values are their weights; 135 | * {"string_key": bool, ...} dict; True is converted to 1.0 weight, 136 | False - to 0.0; 137 | * {"string_key": "string_value", ...} dict; that's the same as 138 | {"string_key=string_value": 1.0, ...} 139 | * ["string_key1", "string_key2", ...] list; that's the same as 140 | {"string_key1": 1.0, "string_key2": 1.0, ...} 141 | * {"string_prefix": {...}} dicts: nested dict is processed and 142 | "string_prefix" s prepended to each key. 143 | * {"string_prefix": [...]} dicts: nested list is processed and 144 | "string_prefix" s prepended to each key. 145 | * {"string_prefix": set([...])} dicts: nested list is processed and 146 | "string_prefix" s prepended to each key. 147 | 148 | Dict-based features can be mixed, i.e. 
this is allowed:: 149 | 150 | {"key1": float_weight, 151 | "key2": "string_value", 152 | "key3": bool_value, 153 | "key4: {"key5": ["x", "y"], "key6": float_value}, 154 | } 155 | 156 | """ 157 | cdef crfsuite_api.ItemSequence c_seq 158 | 159 | def __init__(self, pyseq): 160 | self.c_seq = to_seq(pyseq) 161 | 162 | def items(self): 163 | """ 164 | Return a list of prepared item features: 165 | a list of ``{unicode_key: float_value}`` dicts. 166 | 167 | >>> ItemSequence([["foo"], {"bar": {"baz": 1}}]).items() 168 | [{'foo': 1.0}, {'bar:baz': 1.0}] 169 | 170 | """ 171 | cdef crfsuite_api.Item c_item 172 | cdef crfsuite_api.Attribute c_attr 173 | cdef bytes key 174 | seq = [] 175 | 176 | for c_item in self.c_seq: 177 | x = {} 178 | for c_attr in c_item: 179 | # Always decode keys from utf-8. It means binary keys are 180 | # not supported. I think it is OK because Tagger.info() 181 | # also only supports utf-8. 182 | 183 | # XXX: (c_attr.attr).decode('utf8') doesn't 184 | # work properly in Cython 0.21 185 | key = c_attr.attr.c_str() 186 | x[key.decode('utf8')] = c_attr.value 187 | seq.append(x) 188 | return seq 189 | 190 | def __len__(self): 191 | return self.c_seq.size() 192 | 193 | def __repr__(self): 194 | return "" % len(self) 195 | 196 | 197 | def _intbool(txt): 198 | return bool(int(txt)) 199 | 200 | 201 | cdef class BaseTrainer(object): 202 | """ 203 | The trainer class. 204 | 205 | This class maintains a data set for training, and provides an interface 206 | to various training algorithms. 207 | 208 | Parameters 209 | ---------- 210 | algorithm : {'lbfgs', 'l2sgd', 'ap', 'pa', 'arow'} 211 | The name of the training algorithm. See :meth:`Trainer.select`. 212 | 213 | params : dict, optional 214 | Training parameters. See :meth:`Trainer.set_params` 215 | and :meth:`Trainer.set`. 216 | 217 | verbose : boolean 218 | Whether to print debug messages during training. Default is True. 
219 | 220 | """ 221 | cdef crfsuite_api.Trainer c_trainer 222 | 223 | _PARAMETER_TYPES = { 224 | 'feature.minfreq': float, 225 | 'feature.possible_states': _intbool, 226 | 'feature.possible_transitions': _intbool, 227 | 'c1': float, 228 | 'c2': float, 229 | 'max_iterations': int, 230 | 'num_memories': int, 231 | 'epsilon': float, 232 | 'period': int, # XXX: is it called 'stop' in docs? 233 | 'delta': float, 234 | 'linesearch': str, 235 | 'max_linesearch': int, 236 | 'calibration.eta': float, 237 | 'calibration.rate': float, 238 | 'calibration.samples': float, 239 | 'calibration.candidates': int, 240 | 'calibration.max_trials': int, 241 | 'type': int, 242 | 'c': float, 243 | 'error_sensitive': _intbool, 244 | 'averaging': _intbool, 245 | 'variance': float, 246 | 'gamma': float, 247 | } 248 | 249 | _ALGORITHM_ALIASES = { 250 | 'ap': 'averaged-perceptron', 251 | 'pa': 'passive-aggressive', 252 | } 253 | 254 | cdef public verbose 255 | 256 | def __init__(self, algorithm=None, params=None, verbose=True): 257 | if algorithm is not None: 258 | self.select(algorithm) 259 | if params is not None: 260 | self.set_params(params) 261 | self.verbose = verbose 262 | 263 | def __cinit__(self): 264 | # setup message handler 265 | self.c_trainer.set_handler(self, self._on_message) 266 | 267 | # fix segfaults, see https://github.com/chokkan/crfsuite/pull/21 268 | self.c_trainer.select("lbfgs".encode('ascii'), "crf1d".encode('ascii')) 269 | self.c_trainer._init_hack() 270 | 271 | cdef _on_message(self, string message): 272 | self.message(message) 273 | 274 | def message(self, message): 275 | """ 276 | Receive messages from the training algorithm. 277 | Override this method to receive messages of the training 278 | process. 279 | 280 | By default, this method prints messages 281 | if ``Trainer.verbose`` is True. 
282 | 283 | Parameters 284 | ---------- 285 | message : string 286 | The message 287 | """ 288 | if self.verbose: 289 | print(message, end='') 290 | 291 | def append(self, xseq, yseq, int group=0): 292 | """ 293 | Append an instance (item/label sequence) to the data set. 294 | 295 | Parameters 296 | ---------- 297 | xseq : a sequence of item features 298 | The item sequence of the instance. ``xseq`` should be a list 299 | of item features or an :class:`~ItemSequence` instance. 300 | Allowed item features formats are the same as described 301 | in :class:`~ItemSequence` docs. 302 | 303 | yseq : a sequence of strings 304 | The label sequence of the instance. The number 305 | of elements in yseq must be identical to that 306 | in xseq. 307 | 308 | group : int, optional 309 | The group number of the instance. Group numbers are used to 310 | select subset of data for heldout evaluation. 311 | """ 312 | self.c_trainer.append(to_seq(xseq), yseq, group) 313 | 314 | def select(self, algorithm, type='crf1d'): 315 | """ 316 | Initialize the training algorithm. 317 | 318 | Parameters 319 | ---------- 320 | algorithm : {'lbfgs', 'l2sgd', 'ap', 'pa', 'arow'} 321 | The name of the training algorithm. 322 | 323 | * 'lbfgs' for Gradient descent using the L-BFGS method, 324 | * 'l2sgd' for Stochastic Gradient Descent with L2 regularization term 325 | * 'ap' for Averaged Perceptron 326 | * 'pa' for Passive Aggressive 327 | * 'arow' for Adaptive Regularization Of Weight Vector 328 | 329 | type : string, optional 330 | The name of the graphical model. 331 | """ 332 | algorithm = algorithm.lower() 333 | algorithm = self._ALGORITHM_ALIASES.get(algorithm, algorithm) 334 | if not self.c_trainer.select(algorithm.encode('ascii'), type.encode('ascii')): 335 | raise ValueError( 336 | "Bad arguments: algorithm=%r, type=%r" % (algorithm, type) 337 | ) 338 | 339 | def train(self, model, int holdout=-1): 340 | """ 341 | Run the training algorithm. 
342 | This function starts the training algorithm with the data set given 343 | by :meth:`Trainer.append` method. 344 | 345 | Parameters 346 | ---------- 347 | model : string 348 | The filename to which the trained model is stored. 349 | If this value is empty, this function does not 350 | write out a model file. 351 | 352 | holdout : int, optional 353 | The group number of holdout evaluation. The 354 | instances with this group number will not be used 355 | for training, but for holdout evaluation. 356 | Default value is -1, meaning "use all instances for training". 357 | """ 358 | self._before_train() 359 | status_code = self.c_trainer.train(model, holdout) 360 | if status_code != crfsuite_api.CRFSUITE_SUCCESS: 361 | raise CRFSuiteError(status_code) 362 | 363 | def params(self): 364 | """ 365 | Obtain the list of parameters. 366 | 367 | This function returns the list of parameter names available for the 368 | graphical model and training algorithm specified in Trainer constructor 369 | or by :meth:`Trainer.select` method. 370 | 371 | Returns 372 | ------- 373 | list of strings 374 | The list of parameters available for the current 375 | graphical model and training algorithm. 376 | 377 | """ 378 | return self.c_trainer.params() 379 | 380 | def set_params(self, params): 381 | """ 382 | Set training parameters. 383 | 384 | Parameters 385 | ---------- 386 | params : dict 387 | A dict with parameters ``{name: value}`` 388 | """ 389 | for key, value in params.items(): 390 | self.set(key, value) 391 | 392 | def get_params(self): 393 | """ 394 | Get training parameters. 395 | 396 | Returns 397 | ------- 398 | dict 399 | A dictionary with ``{parameter_name: parameter_value}`` 400 | with all trainer parameters. 401 | """ 402 | # params = self.params() 403 | return dict((name, self.get(name)) for name in self.params()) 404 | 405 | def set(self, name, value): 406 | """ 407 | Set a training parameter. 
408 | This function sets a parameter value for the graphical model and 409 | training algorithm specified by :meth:`Trainer.select` method. 410 | 411 | Parameters 412 | ---------- 413 | name : string 414 | The parameter name. 415 | value : string 416 | The value of the parameter. 417 | 418 | """ 419 | if isinstance(value, bool): 420 | value = int(value) 421 | self.c_trainer.set(name, str(value)) 422 | 423 | def get(self, name): 424 | """ 425 | Get the value of a training parameter. 426 | This function gets a parameter value for the graphical model and 427 | training algorithm specified by :meth:`Trainer.select` method. 428 | 429 | Parameters 430 | ---------- 431 | name : string 432 | The parameter name. 433 | """ 434 | return self._cast_parameter(name, self.c_trainer.get(name)) 435 | 436 | def help(self, name): 437 | """ 438 | Get the description of a training parameter. 439 | This function obtains the help message for the parameter specified 440 | by the name. The graphical model and training algorithm must be 441 | selected by :meth:`Trainer.select` method before calling this method. 442 | 443 | Parameters 444 | ---------- 445 | name : string 446 | The parameter name. 447 | 448 | Returns 449 | ------- 450 | string 451 | The description (help message) of the parameter. 452 | 453 | """ 454 | if name not in self.params(): 455 | # c_trainer.help(name) segfaults without this workaround; 456 | # see https://github.com/chokkan/crfsuite/pull/21 457 | raise ValueError("Parameter not found: %s" % name) 458 | return self.c_trainer.help(name) 459 | 460 | def clear(self): 461 | """ Remove all instances in the data set. """ 462 | self.c_trainer.clear() 463 | 464 | def _cast_parameter(self, name, value): 465 | if name in self._PARAMETER_TYPES: 466 | return self._PARAMETER_TYPES[name](value) 467 | return value 468 | 469 | def _before_train(self): 470 | pass 471 | 472 | 473 | class Trainer(BaseTrainer): 474 | """ 475 | The trainer class. 
476 | 477 | This class maintains a data set for training, and provides an interface 478 | to various training algorithms. 479 | 480 | Parameters 481 | ---------- 482 | algorithm : {'lbfgs', 'l2sgd', 'ap', 'pa', 'arow'} 483 | The name of the training algorithm. See :meth:`Trainer.select`. 484 | 485 | params : dict, optional 486 | Training parameters. See :meth:`Trainer.set_params` 487 | and :meth:`Trainer.set`. 488 | 489 | verbose : boolean 490 | Whether to print debug messages during training. Default is True. 491 | 492 | """ 493 | logparser = None 494 | 495 | def _before_train(self): 496 | self.logparser = _logparser.TrainLogParser() 497 | 498 | def message(self, message): 499 | event = self.logparser.feed(message) 500 | 501 | if not self.verbose or event is None: 502 | return 503 | 504 | log = self.logparser.last_log 505 | if event == 'start': 506 | self.on_start(log) 507 | elif event == 'featgen_progress': 508 | self.on_featgen_progress(log, self.logparser.featgen_percent) 509 | elif event == 'featgen_end': 510 | self.on_featgen_end(log) 511 | elif event == 'prepared': 512 | self.on_prepared(log) 513 | elif event == 'prepare_error': 514 | self.on_prepare_error(log) 515 | elif event == 'iteration': 516 | self.on_iteration(log, self.logparser.last_iteration) 517 | elif event == 'optimization_end': 518 | self.on_optimization_end(log) 519 | elif event == 'end': 520 | self.on_end(log) 521 | else: 522 | raise Exception("Unknown event %r" % event) 523 | 524 | def on_start(self, log): 525 | print(log, end='') 526 | 527 | def on_featgen_progress(self, log, percent): 528 | print(log, end='') 529 | 530 | def on_featgen_end(self, log): 531 | print(log, end='') 532 | 533 | def on_prepared(self, log): 534 | print(log, end='') 535 | 536 | def on_prepare_error(self, log): 537 | print(log, end='') 538 | 539 | def on_iteration(self, log, info): 540 | print(log, end='') 541 | 542 | def on_optimization_end(self, log): 543 | print(log, end='') 544 | 545 | def on_end(self, log): 546 
cdef class Tagger(object):
    """
    The tagger class.

    This class provides the functionality for predicting label sequences for
    input sequences using a model.
    """
    # Wrapped C++ tagger instance; owns the loaded model.
    cdef crfsuite_api.Tagger c_tagger

    def open(self, name):
        """
        Open a model file.

        Parameters
        ----------
        name : string
            The file name of the model file.

        Returns
        -------
        context manager
            ``contextlib.closing(self)``, so the tagger can be used in
            a ``with`` statement.

        Raises
        ------
        ValueError
            If the file is missing, truncated, or not a CRFsuite model.
        """
        # We need to do some basic checks ourselves because crfsuite
        # may segfault if the file is invalid.
        # See https://github.com/chokkan/crfsuite/pull/24
        self._check_model(name)
        if not self.c_tagger.open(name):
            raise ValueError("Error opening model file %r" % name)
        return contextlib.closing(self)

    def open_inmemory(self, bytes value):
        """
        Open a model from memory.

        Parameters
        ----------
        value : bytes
            Binary model data (content of a file saved by Trainer.train).

        Returns
        -------
        context manager
            ``contextlib.closing(self)``, so the tagger can be used in
            a ``with`` statement.

        Raises
        ------
        ValueError
            If the data is not a valid CRFsuite model.
        """
        self._check_inmemory_model(value)
        cdef const char *v = value
        if not self.c_tagger.open(v, len(value)):
            raise ValueError("Error opening model")
        return contextlib.closing(self)

    def close(self):
        """
        Close the model.
        """
        self.c_tagger.close()

    def labels(self):
        """
        Obtain the list of labels.

        Returns
        -------
        list of strings
            The list of labels in the model.
        """
        return self.c_tagger.labels()

    def tag(self, xseq=None):
        """
        Predict the label sequence for the item sequence.

        Parameters
        ----------
        xseq : item sequence, optional
            The item sequence. If omitted, the current sequence is used
            (a sequence set using :meth:`Tagger.set` method or
            a sequence used in a previous :meth:`Tagger.tag` call).

            ``xseq`` should be a list of item features or
            an :class:`~ItemSequence` instance. Allowed item features formats
            are the same as described in :class:`~ItemSequence` docs.

        Returns
        -------
        list of strings
            The label sequence predicted.
        """
        if xseq is not None:
            self.set(xseq)

        return self.c_tagger.viterbi()

    def probability(self, yseq):
        """
        Compute the probability of the label sequence for the current input
        sequence (a sequence set using :meth:`Tagger.set` method or
        a sequence used in a previous :meth:`Tagger.tag` call).

        Parameters
        ----------
        yseq : list of strings
            The label sequence.

        Returns
        -------
        float
            The probability ``P(yseq|xseq)``.
        """
        return self.c_tagger.probability(yseq)

    def marginal(self, y, pos):
        """
        Compute the marginal probability of the label ``y`` at position ``pos``
        for the current input sequence (i.e. a sequence set using
        :meth:`Tagger.set` method or a sequence used in a previous
        :meth:`Tagger.tag` call).

        Parameters
        ----------
        y : string
            The label.
        pos : int
            The position of the label.

        Returns
        -------
        float
            The marginal probability of the label ``y`` at position ``pos``.
        """
        return self.c_tagger.marginal(y, pos)

    cpdef extern set(self, xseq) except +:
        """
        Set an instance (item sequence) for future calls of
        :meth:`Tagger.tag`, :meth:`Tagger.probability`
        and :meth:`Tagger.marginal` methods.

        Parameters
        ----------
        xseq : item sequence
            The item sequence of the instance. ``xseq`` should be a list of
            item features or an :class:`~ItemSequence` instance.
            Allowed item features formats are the same as described
            in :class:`~ItemSequence` docs.

        """
        self.c_tagger.set(to_seq(xseq))

    def dump(self, filename=None):
        """
        Dump a CRF model in plain-text format.

        Parameters
        ----------
        filename : string, optional
            File name to dump the model to.
            If None, the model is dumped to stdout.
        """
        if filename is None:
            # Duplicate the fd: Tagger::dump closes the descriptor it is
            # given, and we must not close the real stdout.
            self.c_tagger.dump(os.dup(sys.stdout.fileno()))
        else:
            # O_TRUNC is required: without it, dumping over an existing
            # longer file left stale trailing bytes after the new dump.
            # Mode 0o666 (masked by umask) avoids creating the dump file
            # with the executable bit set.
            fd = os.open(filename, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, 0o666)
            try:
                self.c_tagger.dump(fd)
            finally:
                try:
                    os.close(fd)
                except OSError:
                    pass  # already closed by Tagger::dump

    def info(self):
        """
        Return a :class:`~.ParsedDump` structure with model internal information.
        """
        parser = _dumpparser.CRFsuiteDumpParser()
        # Dump to a temp file, then parse it line by line; the model dump
        # can be large, so it is not held in memory as a single string.
        fd, name = tempfile.mkstemp()
        try:
            self.c_tagger.dump(fd)
            with open(name, 'rb') as f:
                for line in f:
                    parser.feed(line.decode('utf8'))
        finally:
            try:
                os.unlink(name)
            except OSError:
                pass
        return parser.result

    def _check_model(self, name):
        # See https://github.com/chokkan/crfsuite/pull/24
        # 1. Check that the file can be opened.
        with open(name, 'rb') as f:

            # 2. Check that file magic is correct.
            magic = f.read(4)
            if magic != b'lCRF':
                raise ValueError("Invalid model file %r" % name)

            # 3. Make sure crfsuite won't read past allocated memory
            # in case of incomplete header.
            f.seek(0, os.SEEK_END)
            size = f.tell()
            if size <= 48:  # header size
                raise ValueError("Model file %r doesn't have a complete header" % name)

    def _check_inmemory_model(self, bytes value):
        # Same sanity checks as _check_model, for in-memory model data.
        magic = value[:4]
        if magic != b'lCRF':
            raise ValueError("Invalid model")

        if len(value) < 48:
            raise ValueError("Invalid model: incomplete header")
36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "[u'esp.testa',\n", 49 | " u'esp.testb',\n", 50 | " u'esp.train',\n", 51 | " u'ned.testa',\n", 52 | " u'ned.testb',\n", 53 | " u'ned.train']" 54 | ] 55 | }, 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "nltk.corpus.conll2002.fileids()" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": { 69 | "collapsed": false 70 | }, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "CPU times: user 2.42 s, sys: 70.4 ms, total: 2.49 s\n", 77 | "Wall time: 2.55 s\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "%%time\n", 83 | "train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))\n", 84 | "test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "Data format:" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "[(u'Melbourne', u'NP', u'B-LOC'),\n", 105 | " (u'(', u'Fpa', u'O'),\n", 106 | " (u'Australia', u'NP', u'B-LOC'),\n", 107 | " (u')', u'Fpt', u'O'),\n", 108 | " (u',', u'Fc', u'O'),\n", 109 | " (u'25', u'Z', u'O'),\n", 110 | " (u'may', u'NC', u'O'),\n", 111 | " (u'(', u'Fpa', u'O'),\n", 112 | " (u'EFE', u'NC', u'B-ORG'),\n", 113 | " (u')', u'Fpt', u'O'),\n", 114 | " (u'.', u'Fp', u'O')]" 115 | ] 116 | }, 117 | "execution_count": 4, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "train_sents[0]" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Features\n", 131 | 
"\n", 132 | "Next, define some features. In this example we use word identity, word suffix, word shape and word POS tag; also, some information from nearby words is used. \n", 133 | "\n", 134 | "This makes a simple baseline, but you certainly can add and remove some features to get (much?) better results - experiment with it." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 5, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "def word2features(sent, i):\n", 146 | " word = sent[i][0]\n", 147 | " postag = sent[i][1]\n", 148 | " features = [\n", 149 | " 'bias',\n", 150 | " 'word.lower=' + word.lower(),\n", 151 | " 'word[-3:]=' + word[-3:],\n", 152 | " 'word[-2:]=' + word[-2:],\n", 153 | " 'word.isupper=%s' % word.isupper(),\n", 154 | " 'word.istitle=%s' % word.istitle(),\n", 155 | " 'word.isdigit=%s' % word.isdigit(),\n", 156 | " 'postag=' + postag,\n", 157 | " 'postag[:2]=' + postag[:2],\n", 158 | " ]\n", 159 | " if i > 0:\n", 160 | " word1 = sent[i-1][0]\n", 161 | " postag1 = sent[i-1][1]\n", 162 | " features.extend([\n", 163 | " '-1:word.lower=' + word1.lower(),\n", 164 | " '-1:word.istitle=%s' % word1.istitle(),\n", 165 | " '-1:word.isupper=%s' % word1.isupper(),\n", 166 | " '-1:postag=' + postag1,\n", 167 | " '-1:postag[:2]=' + postag1[:2],\n", 168 | " ])\n", 169 | " else:\n", 170 | " features.append('BOS')\n", 171 | " \n", 172 | " if i < len(sent)-1:\n", 173 | " word1 = sent[i+1][0]\n", 174 | " postag1 = sent[i+1][1]\n", 175 | " features.extend([\n", 176 | " '+1:word.lower=' + word1.lower(),\n", 177 | " '+1:word.istitle=%s' % word1.istitle(),\n", 178 | " '+1:word.isupper=%s' % word1.isupper(),\n", 179 | " '+1:postag=' + postag1,\n", 180 | " '+1:postag[:2]=' + postag1[:2],\n", 181 | " ])\n", 182 | " else:\n", 183 | " features.append('EOS')\n", 184 | " \n", 185 | " return features\n", 186 | "\n", 187 | "\n", 188 | "def sent2features(sent):\n", 189 | " return [word2features(sent, i) for 
i in range(len(sent))]\n", 190 | "\n", 191 | "def sent2labels(sent):\n", 192 | " return [label for token, postag, label in sent]\n", 193 | "\n", 194 | "def sent2tokens(sent):\n", 195 | " return [token for token, postag, label in sent] " 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "This is what word2features extracts:" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 6, 208 | "metadata": { 209 | "collapsed": false 210 | }, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/plain": [ 215 | "['bias',\n", 216 | " u'word.lower=melbourne',\n", 217 | " u'word[-3:]=rne',\n", 218 | " u'word[-2:]=ne',\n", 219 | " 'word.isupper=False',\n", 220 | " 'word.istitle=True',\n", 221 | " 'word.isdigit=False',\n", 222 | " u'postag=NP',\n", 223 | " u'postag[:2]=NP',\n", 224 | " 'BOS',\n", 225 | " u'+1:word.lower=(',\n", 226 | " '+1:word.istitle=False',\n", 227 | " '+1:word.isupper=False',\n", 228 | " u'+1:postag=Fpa',\n", 229 | " u'+1:postag[:2]=Fp']" 230 | ] 231 | }, 232 | "execution_count": 6, 233 | "metadata": {}, 234 | "output_type": "execute_result" 235 | } 236 | ], 237 | "source": [ 238 | "sent2features(train_sents[0])[0]" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "Extract the features from the data:" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 7, 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [ 255 | { 256 | "name": "stdout", 257 | "output_type": "stream", 258 | "text": [ 259 | "CPU times: user 2.24 s, sys: 287 ms, total: 2.53 s\n", 260 | "Wall time: 2.53 s\n" 261 | ] 262 | } 263 | ], 264 | "source": [ 265 | "%%time\n", 266 | "X_train = [sent2features(s) for s in train_sents]\n", 267 | "y_train = [sent2labels(s) for s in train_sents]\n", 268 | "\n", 269 | "X_test = [sent2features(s) for s in test_sents]\n", 270 | "y_test = [sent2labels(s) for s in test_sents]" 271 | ] 272 | 
}, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "## Train the model\n", 278 | "\n", 279 | "To train the model, we create pycrfsuite.Trainer, load the training data and call 'train' method. \n", 280 | "First, create pycrfsuite.Trainer and load the training data to CRFsuite:" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 8, 286 | "metadata": { 287 | "collapsed": false 288 | }, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "CPU times: user 3.48 s, sys: 90.2 ms, total: 3.57 s\n", 295 | "Wall time: 3.56 s\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "%%time\n", 301 | "trainer = pycrfsuite.Trainer(verbose=False)\n", 302 | "\n", 303 | "for xseq, yseq in zip(X_train, y_train):\n", 304 | " trainer.append(xseq, yseq)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "Set training parameters. We will use L-BFGS training algorithm (it is default) with Elastic Net (L1 + L2) regularization." 
312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 9, 317 | "metadata": { 318 | "collapsed": false 319 | }, 320 | "outputs": [], 321 | "source": [ 322 | "trainer.set_params({\n", 323 | " 'c1': 1.0, # coefficient for L1 penalty\n", 324 | " 'c2': 1e-3, # coefficient for L2 penalty\n", 325 | " 'max_iterations': 50, # stop earlier\n", 326 | "\n", 327 | " # include transitions that are possible, but not observed\n", 328 | " 'feature.possible_transitions': True\n", 329 | "})" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "Possible parameters for the default training algorithm:" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 10, 342 | "metadata": { 343 | "collapsed": false 344 | }, 345 | "outputs": [ 346 | { 347 | "data": { 348 | "text/plain": [ 349 | "['feature.minfreq',\n", 350 | " 'feature.possible_states',\n", 351 | " 'feature.possible_transitions',\n", 352 | " 'c1',\n", 353 | " 'c2',\n", 354 | " 'max_iterations',\n", 355 | " 'num_memories',\n", 356 | " 'epsilon',\n", 357 | " 'period',\n", 358 | " 'delta',\n", 359 | " 'linesearch',\n", 360 | " 'max_linesearch']" 361 | ] 362 | }, 363 | "execution_count": 10, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "trainer.params()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "Train the model:" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 11, 382 | "metadata": { 383 | "collapsed": false 384 | }, 385 | "outputs": [ 386 | { 387 | "name": "stdout", 388 | "output_type": "stream", 389 | "text": [ 390 | "CPU times: user 18.8 s, sys: 102 ms, total: 18.9 s\n", 391 | "Wall time: 19.2 s\n" 392 | ] 393 | } 394 | ], 395 | "source": [ 396 | "%%time\n", 397 | "trainer.train('conll2002-esp.crfsuite')" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | 
"source": [ 404 | "trainer.train saves model to a file:" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 12, 410 | "metadata": { 411 | "collapsed": false 412 | }, 413 | "outputs": [ 414 | { 415 | "name": "stdout", 416 | "output_type": "stream", 417 | "text": [ 418 | "-rw-r--r-- 1 gsh25 staff 600K Jun 22 14:56 ./conll2002-esp.crfsuite\r\n" 419 | ] 420 | } 421 | ], 422 | "source": [ 423 | "!ls -lh ./conll2002-esp.crfsuite" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "We can also get information about the final state of the model by looking at the trainer's logparser. If we had tagged our input data using the optional group argument in add, and had used the optional holdout argument during train, there would be information about the trainer's performance on the holdout set as well. " 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 13, 436 | "metadata": { 437 | "collapsed": false 438 | }, 439 | "outputs": [ 440 | { 441 | "data": { 442 | "text/plain": [ 443 | "{'active_features': 11346,\n", 444 | " 'error_norm': 1262.912078,\n", 445 | " 'feature_norm': 79.110017,\n", 446 | " 'linesearch_step': 1.0,\n", 447 | " 'linesearch_trials': 1,\n", 448 | " 'loss': 14807.577946,\n", 449 | " 'num': 50,\n", 450 | " 'scores': {},\n", 451 | " 'time': 0.342}" 452 | ] 453 | }, 454 | "execution_count": 13, 455 | "metadata": {}, 456 | "output_type": "execute_result" 457 | } 458 | ], 459 | "source": [ 460 | "trainer.logparser.last_iteration" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "We can also get this information for every step using trainer.logparser.iterations" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 15, 473 | "metadata": { 474 | "collapsed": false 475 | }, 476 | "outputs": [ 477 | { 478 | "name": "stdout", 479 | "output_type": "stream", 480 | "text": [ 481 | "50 {'loss': 
14807.577946, 'error_norm': 1262.912078, 'linesearch_trials': 1, 'active_features': 11346, 'num': 50, 'time': 0.342, 'scores': {}, 'linesearch_step': 1.0, 'feature_norm': 79.110017}\n" 482 | ] 483 | } 484 | ], 485 | "source": [ 486 | "print len(trainer.logparser.iterations), trainer.logparser.iterations[-1]" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "## Make predictions\n", 494 | "\n", 495 | "To use the trained model, create pycrfsuite.Tagger, open the model and use \"tag\" method:" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 13, 501 | "metadata": { 502 | "collapsed": false 503 | }, 504 | "outputs": [ 505 | { 506 | "data": { 507 | "text/plain": [ 508 | "" 509 | ] 510 | }, 511 | "execution_count": 13, 512 | "metadata": {}, 513 | "output_type": "execute_result" 514 | } 515 | ], 516 | "source": [ 517 | "tagger = pycrfsuite.Tagger()\n", 518 | "tagger.open('conll2002-esp.crfsuite')" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "Let's tag a sentence to see how it works:" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 14, 531 | "metadata": { 532 | "collapsed": false 533 | }, 534 | "outputs": [ 535 | { 536 | "name": "stdout", 537 | "output_type": "stream", 538 | "text": [ 539 | "La Coruña , 23 may ( EFECOM ) .\n", 540 | "\n", 541 | "Predicted: B-LOC I-LOC O O O O B-ORG O O\n", 542 | "Correct: B-LOC I-LOC O O O O B-ORG O O\n" 543 | ] 544 | } 545 | ], 546 | "source": [ 547 | "example_sent = test_sents[0]\n", 548 | "print(' '.join(sent2tokens(example_sent)), end='\\n\\n')\n", 549 | "\n", 550 | "print(\"Predicted:\", ' '.join(tagger.tag(sent2features(example_sent))))\n", 551 | "print(\"Correct: \", ' '.join(sent2labels(example_sent)))" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "## Evaluate the model" 559 | ] 560 | }, 561 | { 562 | 
"cell_type": "code", 563 | "execution_count": 15, 564 | "metadata": { 565 | "collapsed": false 566 | }, 567 | "outputs": [], 568 | "source": [ 569 | "def bio_classification_report(y_true, y_pred):\n", 570 | " \"\"\"\n", 571 | " Classification report for a list of BIO-encoded sequences.\n", 572 | " It computes token-level metrics and discards \"O\" labels.\n", 573 | " \n", 574 | " Note that it requires scikit-learn 0.15+ (or a version from github master)\n", 575 | " to calculate averages properly!\n", 576 | " \"\"\"\n", 577 | " lb = LabelBinarizer()\n", 578 | " y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))\n", 579 | " y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))\n", 580 | " \n", 581 | " tagset = set(lb.classes_) - {'O'}\n", 582 | " tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])\n", 583 | " class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}\n", 584 | " \n", 585 | " return classification_report(\n", 586 | " y_true_combined,\n", 587 | " y_pred_combined,\n", 588 | " labels = [class_indices[cls] for cls in tagset],\n", 589 | " target_names = tagset,\n", 590 | " )" 591 | ] 592 | }, 593 | { 594 | "cell_type": "markdown", 595 | "metadata": {}, 596 | "source": [ 597 | "Predict entity labels for all sentences in our testing set ('testb' Spanish data):" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 16, 603 | "metadata": { 604 | "collapsed": false 605 | }, 606 | "outputs": [ 607 | { 608 | "name": "stdout", 609 | "output_type": "stream", 610 | "text": [ 611 | "CPU times: user 598 ms, sys: 17.4 ms, total: 616 ms\n", 612 | "Wall time: 615 ms\n" 613 | ] 614 | } 615 | ], 616 | "source": [ 617 | "%%time\n", 618 | "y_pred = [tagger.tag(xseq) for xseq in X_test]" 619 | ] 620 | }, 621 | { 622 | "cell_type": "markdown", 623 | "metadata": {}, 624 | "source": [ 625 | "..and check the result. 
Note this report is not comparable to results in CONLL2002 papers because here we check per-token results (not per-entity). Per-entity numbers will be worse. " 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 17, 631 | "metadata": { 632 | "collapsed": false 633 | }, 634 | "outputs": [ 635 | { 636 | "name": "stdout", 637 | "output_type": "stream", 638 | "text": [ 639 | " precision recall f1-score support\n", 640 | "\n", 641 | " B-LOC 0.78 0.75 0.76 1084\n", 642 | " I-LOC 0.87 0.93 0.90 634\n", 643 | " B-MISC 0.69 0.47 0.56 339\n", 644 | " I-MISC 0.87 0.93 0.90 634\n", 645 | " B-ORG 0.82 0.87 0.84 735\n", 646 | " I-ORG 0.87 0.93 0.90 634\n", 647 | " B-PER 0.61 0.49 0.54 557\n", 648 | " I-PER 0.87 0.93 0.90 634\n", 649 | "\n", 650 | "avg / total 0.81 0.81 0.80 5251\n", 651 | "\n" 652 | ] 653 | } 654 | ], 655 | "source": [ 656 | "print(bio_classification_report(y_test, y_pred))" 657 | ] 658 | }, 659 | { 660 | "cell_type": "markdown", 661 | "metadata": {}, 662 | "source": [ 663 | "## Let's check what classifier learned" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": 18, 669 | "metadata": { 670 | "collapsed": false 671 | }, 672 | "outputs": [ 673 | { 674 | "name": "stdout", 675 | "output_type": "stream", 676 | "text": [ 677 | "Top likely transitions:\n", 678 | "B-ORG -> I-ORG 8.631963\n", 679 | "I-ORG -> I-ORG 7.833706\n", 680 | "B-PER -> I-PER 6.998706\n", 681 | "B-LOC -> I-LOC 6.913675\n", 682 | "I-MISC -> I-MISC 6.129735\n", 683 | "B-MISC -> I-MISC 5.538291\n", 684 | "I-LOC -> I-LOC 4.983567\n", 685 | "I-PER -> I-PER 3.748358\n", 686 | "B-ORG -> B-LOC 1.727090\n", 687 | "B-PER -> B-LOC 1.388267\n", 688 | "B-LOC -> B-LOC 1.240278\n", 689 | "O -> O 1.197929\n", 690 | "O -> B-ORG 1.097062\n", 691 | "I-PER -> B-LOC 1.083332\n", 692 | "O -> B-MISC 1.046113\n", 693 | "\n", 694 | "Top unlikely transitions:\n", 695 | "I-PER -> B-ORG -2.056130\n", 696 | "I-LOC -> I-ORG -2.143940\n", 697 | "B-ORG -> I-MISC -2.167501\n", 
698 | "I-PER -> I-ORG -2.369380\n", 699 | "B-ORG -> I-PER -2.378110\n", 700 | "I-MISC -> I-PER -2.458788\n", 701 | "B-LOC -> I-PER -2.516414\n", 702 | "I-ORG -> I-MISC -2.571973\n", 703 | "I-LOC -> B-PER -2.697791\n", 704 | "I-LOC -> I-PER -3.065950\n", 705 | "I-ORG -> I-PER -3.364434\n", 706 | "O -> I-PER -7.322841\n", 707 | "O -> I-MISC -7.648246\n", 708 | "O -> I-ORG -8.024126\n", 709 | "O -> I-LOC -8.333815\n" 710 | ] 711 | } 712 | ], 713 | "source": [ 714 | "from collections import Counter\n", 715 | "info = tagger.info()\n", 716 | "\n", 717 | "def print_transitions(trans_features):\n", 718 | " for (label_from, label_to), weight in trans_features:\n", 719 | " print(\"%-6s -> %-7s %0.6f\" % (label_from, label_to, weight))\n", 720 | "\n", 721 | "print(\"Top likely transitions:\")\n", 722 | "print_transitions(Counter(info.transitions).most_common(15))\n", 723 | "\n", 724 | "print(\"\\nTop unlikely transitions:\")\n", 725 | "print_transitions(Counter(info.transitions).most_common()[-15:])" 726 | ] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "metadata": {}, 731 | "source": [ 732 | "We can see that, for example, it is very likely that the beginning of an organization name (B-ORG) will be followed by a token inside organization name (I-ORG), but transitions to I-ORG from tokens with other labels are penalized. 
Also note I-PER -> B-LOC transition: a positive weight means that model thinks that a person name is often followed by a location.\n", 733 | "\n", 734 | "Check the state features:" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": 19, 740 | "metadata": { 741 | "collapsed": false 742 | }, 743 | "outputs": [ 744 | { 745 | "name": "stdout", 746 | "output_type": "stream", 747 | "text": [ 748 | "Top positive:\n", 749 | "8.886516 B-ORG word.lower=efe-cantabria\n", 750 | "8.743642 B-ORG word.lower=psoe-progresistas\n", 751 | "5.769032 B-LOC -1:word.lower=cantabria\n", 752 | "5.195429 I-LOC -1:word.lower=calle\n", 753 | "5.116821 O word.lower=mayo\n", 754 | "4.990871 O -1:word.lower=día\n", 755 | "4.910915 I-ORG -1:word.lower=l\n", 756 | "4.721572 B-MISC word.lower=diversia\n", 757 | "4.676259 B-ORG word.lower=telefónica\n", 758 | "4.334354 B-ORG word[-2:]=-e\n", 759 | "4.149862 B-ORG word.lower=amena\n", 760 | "4.141370 B-ORG word.lower=terra\n", 761 | "3.942852 O word.istitle=False\n", 762 | "3.926397 B-ORG word.lower=continente\n", 763 | "3.924672 B-ORG word.lower=acesa\n", 764 | "3.888706 O word.lower=euro\n", 765 | "3.856445 B-PER -1:word.lower=según\n", 766 | "3.812373 B-MISC word.lower=exteriores\n", 767 | "3.807582 I-MISC -1:word.lower=1.9\n", 768 | "3.807098 B-MISC word.lower=sanidad\n", 769 | "\n", 770 | "Top negative:\n", 771 | "-1.965379 O word.lower=fundación\n", 772 | "-1.981541 O -1:word.lower=británica\n", 773 | "-2.118347 O word.lower=061\n", 774 | "-2.190653 B-PER word[-3:]=nes\n", 775 | "-2.226373 B-ORG postag=SP\n", 776 | "-2.226373 B-ORG postag[:2]=SP\n", 777 | "-2.260972 O word[-3:]=uia\n", 778 | "-2.384920 O -1:word.lower=sección\n", 779 | "-2.483009 O word[-2:]=s.\n", 780 | "-2.535050 I-LOC BOS\n", 781 | "-2.583123 O -1:word.lower=sánchez\n", 782 | "-2.585756 O postag[:2]=NP\n", 783 | "-2.585756 O postag=NP\n", 784 | "-2.588899 O word[-2:]=om\n", 785 | "-2.738583 O -1:word.lower=carretera\n", 786 | "-2.913103 O 
word.istitle=True\n", 787 | "-2.926560 O word[-2:]=nd\n", 788 | "-2.946862 I-PER -1:word.lower=san\n", 789 | "-2.954094 B-PER -1:word.lower=del\n", 790 | "-3.529449 O word.isupper=True\n" 791 | ] 792 | } 793 | ], 794 | "source": [ 795 | "def print_state_features(state_features):\n", 796 | " for (attr, label), weight in state_features:\n", 797 | " print(\"%0.6f %-6s %s\" % (weight, label, attr)) \n", 798 | "\n", 799 | "print(\"Top positive:\")\n", 800 | "print_state_features(Counter(info.state_features).most_common(20))\n", 801 | "\n", 802 | "print(\"\\nTop negative:\")\n", 803 | "print_state_features(Counter(info.state_features).most_common()[-20:])" 804 | ] 805 | }, 806 | { 807 | "cell_type": "markdown", 808 | "metadata": {}, 809 | "source": [ 810 | "Some observations:\n", 811 | "\n", 812 | "* **8.743642 B-ORG word.lower=psoe-progresistas** - the model remembered names of some entities - maybe it is overfit, or maybe our features are not adequate, or maybe remembering is indeed helpful;\n", 813 | "* **5.195429 I-LOC -1:word.lower=calle**: \"calle\" is a street in Spanish; model learns that if a previous word was \"calle\" then the token is likely a part of location;\n", 814 | "* **-3.529449 O word.isupper=True**, ** -2.913103 O word.istitle=True **: UPPERCASED or TitleCased words are likely entities of some kind;\n", 815 | "* **-2.585756 O postag=NP** - proper nouns (NP is a proper noun in the Spanish tagset) are often entities." 816 | ] 817 | }, 818 | { 819 | "cell_type": "markdown", 820 | "metadata": {}, 821 | "source": [ 822 | "## What to do next\n", 823 | "\n", 824 | "1. Load 'testa' Spanish data.\n", 825 | "2. Use it to develop better features and to find best model parameters.\n", 826 | "3. Apply the model to 'testb' data again.\n", 827 | "\n", 828 | "The model in this notebook is just a starting point; you certainly can do better!" 
829 | ] 830 | } 831 | ], 832 | "metadata": { 833 | "kernelspec": { 834 | "display_name": "Python 2", 835 | "language": "python", 836 | "name": "python2" 837 | }, 838 | "language_info": { 839 | "codemirror_mode": { 840 | "name": "ipython", 841 | "version": 2 842 | }, 843 | "file_extension": ".py", 844 | "mimetype": "text/x-python", 845 | "name": "python", 846 | "nbconvert_exporter": "python", 847 | "pygments_lexer": "ipython2", 848 | "version": "2.7.11" 849 | } 850 | }, 851 | "nbformat": 4, 852 | "nbformat_minor": 0 853 | } 854 | -------------------------------------------------------------------------------- /tests/test_logparser.py: -------------------------------------------------------------------------------- 1 | from pycrfsuite._logparser import TrainLogParser # noqa: F401 2 | 3 | 4 | def _apply_parser(parser, log): 5 | for line in log: 6 | event = parser.feed(line) 7 | if event and event != "featgen_progress": 8 | print(parser.last_log, end="") 9 | print("============== " + event) 10 | 11 | 12 | log1 = [ 13 | "Holdout group: 2\n", 14 | "\n", 15 | "Feature generation\n", 16 | "type: CRF1d\n", 17 | "feature.minfreq: 0.000000\n", 18 | "feature.possible_states: 0\n", 19 | "feature.possible_transitions: 1\n", 20 | "0", 21 | ".", 22 | ".", 23 | ".", 24 | ".", 25 | "1", 26 | ".", 27 | ".", 28 | ".", 29 | ".", 30 | "2", 31 | ".", 32 | ".", 33 | ".", 34 | ".", 35 | "3", 36 | ".", 37 | ".", 38 | ".", 39 | ".", 40 | "4", 41 | ".", 42 | ".", 43 | ".", 44 | ".", 45 | "5", 46 | ".", 47 | ".", 48 | ".", 49 | ".", 50 | "6", 51 | ".", 52 | ".", 53 | ".", 54 | ".", 55 | "7", 56 | ".", 57 | ".", 58 | ".", 59 | ".", 60 | "8", 61 | ".", 62 | ".", 63 | ".", 64 | ".", 65 | "9", 66 | ".", 67 | ".", 68 | ".", 69 | ".", 70 | "10", 71 | "\n", 72 | "Number of features: 3948\n", 73 | "Seconds required: 0.022\n", 74 | "\n", 75 | "L-BFGS optimization\n", 76 | "c1: 1.000000\n", 77 | "c2: 0.001000\n", 78 | "num_memories: 6\n", 79 | "max_iterations: 5\n", 80 | "epsilon: 0.000010\n", 81 | 
"stop: 10\n", 82 | "delta: 0.000010\n", 83 | "linesearch: MoreThuente\n", 84 | "linesearch.max_iterations: 20\n", 85 | "\n", 86 | "***** Iteration #1 *****\n", 87 | "Loss: 1450.519004\n", 88 | "Feature norm: 1.000000\n", 89 | "Error norm: 713.784994\n", 90 | "Active features: 1794\n", 91 | "Line search trials: 1\n", 92 | "Line search step: 0.000228\n", 93 | "Seconds required for this iteration: 0.008\n", 94 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 95 | " B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000)\n", 96 | " O: (306, 339, 306) (0.9027, 1.0000, 0.9488)\n", 97 | " B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000)\n", 98 | " B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000)\n", 99 | " I-PER: (0, 0, 4) (0.0000, 0.0000, 0.0000)\n", 100 | " B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 101 | " I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 102 | " I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000)\n", 103 | " I-MISC: (0, 0, 0) (******, ******, ******)\n", 104 | "Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426)\n", 105 | "Item accuracy: 306 / 339 (0.9027)\n", 106 | "Instance accuracy: 3 / 10 (0.3000)\n", 107 | "\n", 108 | "***** Iteration #2 *****\n", 109 | "Loss: 1363.687719\n", 110 | "Feature norm: 1.178396\n", 111 | "Error norm: 370.827506\n", 112 | "Active features: 1540\n", 113 | "Line search trials: 1\n", 114 | "Line search step: 1.000000\n", 115 | "Seconds required for this iteration: 0.004\n", 116 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 117 | " B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000)\n", 118 | " O: (306, 339, 306) (0.9027, 1.0000, 0.9488)\n", 119 | " B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000)\n", 120 | " B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000)\n", 121 | " I-PER: (0, 0, 4) (0.0000, 0.0000, 0.0000)\n", 122 | " B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 123 | " I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 124 | " I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000)\n", 125 | " I-MISC: (0, 0, 0) (******, 
******, ******)\n", 126 | "Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426)\n", 127 | "Item accuracy: 306 / 339 (0.9027)\n", 128 | "Instance accuracy: 3 / 10 (0.3000)\n", 129 | "\n", 130 | "***** Iteration #3 *****\n", 131 | "Loss: 1309.171814\n", 132 | "Feature norm: 1.266322\n", 133 | "Error norm: 368.739493\n", 134 | "Active features: 1308\n", 135 | "Line search trials: 1\n", 136 | "Line search step: 1.000000\n", 137 | "Seconds required for this iteration: 0.003\n", 138 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 139 | " B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000)\n", 140 | " O: (306, 339, 306) (0.9027, 1.0000, 0.9488)\n", 141 | " B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000)\n", 142 | " B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000)\n", 143 | " I-PER: (0, 0, 4) (0.0000, 0.0000, 0.0000)\n", 144 | " B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 145 | " I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 146 | " I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000)\n", 147 | " I-MISC: (0, 0, 0) (******, ******, ******)\n", 148 | "Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426)\n", 149 | "Item accuracy: 306 / 339 (0.9027)\n", 150 | "Instance accuracy: 3 / 10 (0.3000)\n", 151 | "\n", 152 | "***** Iteration #4 *****\n", 153 | "Loss: 1019.561634\n", 154 | "Feature norm: 1.929814\n", 155 | "Error norm: 202.976154\n", 156 | "Active features: 1127\n", 157 | "Line search trials: 1\n", 158 | "Line search step: 1.000000\n", 159 | "Seconds required for this iteration: 0.003\n", 160 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 161 | " B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000)\n", 162 | " O: (306, 339, 306) (0.9027, 1.0000, 0.9488)\n", 163 | " B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000)\n", 164 | " B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000)\n", 165 | " I-PER: (0, 0, 4) (0.0000, 0.0000, 0.0000)\n", 166 | " B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 167 | " I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 
168 | " I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000)\n", 169 | " I-MISC: (0, 0, 0) (******, ******, ******)\n", 170 | "Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426)\n", 171 | "Item accuracy: 306 / 339 (0.9027)\n", 172 | "Instance accuracy: 3 / 10 (0.3000)\n", 173 | "\n", 174 | "***** Iteration #5 *****\n", 175 | "Loss: 782.637378\n", 176 | "Feature norm: 3.539391\n", 177 | "Error norm: 121.725020\n", 178 | "Active features: 1035\n", 179 | "Line search trials: 1\n", 180 | "Line search step: 1.000000\n", 181 | "Seconds required for this iteration: 0.003\n", 182 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 183 | " B-LOC: (2, 5, 6) (0.4000, 0.3333, 0.3636)\n", 184 | " O: (305, 318, 306) (0.9591, 0.9967, 0.9776)\n", 185 | " B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000)\n", 186 | " B-PER: (2, 4, 3) (0.5000, 0.6667, 0.5714)\n", 187 | " I-PER: (4, 12, 4) (0.3333, 1.0000, 0.5000)\n", 188 | " B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 189 | " I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000)\n", 190 | " I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000)\n", 191 | " I-MISC: (0, 0, 0) (******, ******, ******)\n", 192 | "Macro-average precision, recall, F1: (0.243606, 0.332970, 0.268070)\n", 193 | "Item accuracy: 313 / 339 (0.9233)\n", 194 | "Instance accuracy: 3 / 10 (0.3000)\n", 195 | "\n", 196 | "L-BFGS terminated with the maximum number of iterations\n", 197 | "Total seconds required for training: 0.022\n", 198 | "\n", 199 | "Storing the model\n", 200 | "Number of active features: 1035 (3948)\n", 201 | "Number of active attributes: 507 (3350)\n", 202 | "Number of active labels: 9 (9)\n", 203 | "Writing labels\n", 204 | "Writing attributes\n", 205 | "Writing feature references for transitions\n", 206 | "Writing feature references for attributes\n", 207 | "Seconds required: 0.003\n", 208 | "\n", 209 | ] 210 | 211 | log2 = [ 212 | "Feature generation\n", # featgen_start 213 | "type: CRF1d\n", 214 | "feature.minfreq: 0.000000\n", 215 | 
"feature.possible_states: 0\n", 216 | "feature.possible_transitions: 1\n", 217 | "0", 218 | ".", 219 | ".", 220 | ".", 221 | ".", # featgen_progress 222 | "1", 223 | ".", 224 | ".", 225 | ".", 226 | ".", 227 | "2", 228 | ".", 229 | ".", 230 | ".", 231 | ".", 232 | "3", 233 | ".", 234 | ".", 235 | ".", 236 | ".", 237 | "4", 238 | ".", 239 | ".", 240 | ".", 241 | ".", 242 | "5", 243 | ".", 244 | ".", 245 | ".", 246 | ".", 247 | "6", 248 | ".", 249 | ".", 250 | ".", 251 | ".", 252 | "7", 253 | ".", 254 | ".", 255 | ".", 256 | ".", 257 | "8", 258 | ".", 259 | ".", 260 | ".", 261 | ".", 262 | "9", 263 | ".", 264 | ".", 265 | ".", 266 | ".", 267 | "10", 268 | "\n", 269 | "Number of features: 4379\n", 270 | "Seconds required: 0.021\n", # featgen_end 271 | "\n", 272 | "Averaged perceptron\n", 273 | "max_iterations: 5\n", 274 | "epsilon: 0.000000\n", 275 | "\n", 276 | "***** Iteration #1 *****\n", # iteration 277 | "Loss: 16.359638\n", 278 | "Feature norm: 112.848688\n", 279 | "Seconds required for this iteration: 0.005\n", # iteration end 280 | "\n", 281 | "***** Iteration #2 *****\n", 282 | "Loss: 12.449970\n", 283 | "Feature norm: 126.174821\n", 284 | "Seconds required for this iteration: 0.004\n", 285 | "\n", 286 | "***** Iteration #3 *****\n", 287 | "Loss: 9.451751\n", 288 | "Feature norm: 145.482678\n", 289 | "Seconds required for this iteration: 0.003\n", 290 | "\n", 291 | "***** Iteration #4 *****\n", 292 | "Loss: 8.652287\n", 293 | "Feature norm: 155.495167\n", 294 | "Seconds required for this iteration: 0.003\n", 295 | "\n", 296 | "***** Iteration #5 *****\n", 297 | "Loss: 7.442703\n", 298 | "Feature norm: 166.818487\n", 299 | "Seconds required for this iteration: 0.002\n", 300 | "\n", 301 | "Total seconds required for training: 0.017\n", # optimization_end 302 | "\n", 303 | "Storing the model\n", # storing_start 304 | "Number of active features: 2265 (4379)\n", 305 | "Number of active attributes: 1299 (3350)\n", 306 | "Number of active labels: 9 (9)\n", 307 | 
"Writing labels\n", 308 | "Writing attributes\n", 309 | "Writing feature references for transitions\n", 310 | "Writing feature references for attributes\n", 311 | "Seconds required: 0.007\n", # storing_end 312 | "\n", # end 313 | ] 314 | 315 | log3 = [ 316 | "Holdout group: 2\n", 317 | "\n", 318 | "Feature generation\n", 319 | "type: CRF1d\n", 320 | "feature.minfreq: 0.000000\n", 321 | "feature.possible_states: 0\n", 322 | "feature.possible_transitions: 1\n", 323 | "0", 324 | ".", 325 | ".", 326 | ".", 327 | ".", 328 | "1", 329 | ".", 330 | ".", 331 | ".", 332 | ".", 333 | "2", 334 | ".", 335 | ".", 336 | ".", 337 | ".", 338 | "3", 339 | ".", 340 | ".", 341 | ".", 342 | ".", 343 | "4", 344 | ".", 345 | ".", 346 | ".", 347 | ".", 348 | "5", 349 | ".", 350 | ".", 351 | ".", 352 | ".", 353 | "6", 354 | ".", 355 | ".", 356 | ".", 357 | ".", 358 | "7", 359 | ".", 360 | ".", 361 | ".", 362 | ".", 363 | "8", 364 | ".", 365 | ".", 366 | ".", 367 | ".", 368 | "9", 369 | ".", 370 | ".", 371 | ".", 372 | ".", 373 | "10", 374 | "\n", 375 | "Number of features: 96180\n", 376 | "Seconds required: 1.263\n", 377 | "\n", 378 | "Stochastic Gradient Descent (SGD)\n", 379 | "c2: 1.000000\n", 380 | "max_iterations: 5\n", 381 | "period: 10\n", 382 | "delta: 0.000001\n", 383 | "\n", 384 | "Calibrating the learning rate (eta)\n", 385 | "calibration.eta: 0.100000\n", 386 | "calibration.rate: 2.000000\n", 387 | "calibration.samples: 1000\n", 388 | "calibration.candidates: 10\n", 389 | "calibration.max_trials: 20\n", 390 | "Initial loss: 69781.655352\n", 391 | "Trial #1 (eta = 0.100000): ", 392 | "12808.890280\n", 393 | "Trial #2 (eta = 0.200000): ", 394 | "26716.801091\n", 395 | "Trial #3 (eta = 0.400000): ", 396 | "51219.321368\n", 397 | "Trial #4 (eta = 0.800000): ", 398 | "104398.795416 (worse)\n", 399 | "Trial #5 (eta = 0.050000): ", 400 | "7804.492475\n", 401 | "Trial #6 (eta = 0.025000): ", 402 | "6419.964967\n", 403 | "Trial #7 (eta = 0.012500): ", 404 | "6989.552193\n", 405 | "Trial 
#8 (eta = 0.006250): ", 406 | "8303.107921\n", 407 | "Trial #9 (eta = 0.003125): ", 408 | "9934.052819\n", 409 | "Trial #10 (eta = 0.001563): ", 410 | "11782.234687\n", 411 | "Trial #11 (eta = 0.000781): ", 412 | "13777.708878\n", 413 | "Trial #12 (eta = 0.000391): ", 414 | "15891.422697\n", 415 | "Trial #13 (eta = 0.000195): ", 416 | "18174.499245\n", 417 | "Trial #14 (eta = 0.000098): ", 418 | "20955.855446\n", 419 | "Best learning rate (eta): 0.025000\n", 420 | "Seconds required: 0.858\n", 421 | "\n", 422 | "***** Epoch #1 *****\n", 423 | "Loss: 36862.915596\n", 424 | "Feature L2-norm: 24.717729\n", 425 | "Learning rate (eta): 0.023810\n", 426 | "Total number of feature updates: 8323\n", 427 | "Seconds required for this iteration: 0.462\n", 428 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 429 | " B-LOC: (778, 1193, 1084) (0.6521, 0.7177, 0.6834)\n", 430 | " O: (45103, 45519, 45355) (0.9909, 0.9944, 0.9926)\n", 431 | " B-ORG: (1003, 1326, 1400) (0.7564, 0.7164, 0.7359)\n", 432 | " B-PER: (583, 764, 735) (0.7631, 0.7932, 0.7779)\n", 433 | " I-PER: (565, 681, 634) (0.8297, 0.8912, 0.8593)\n", 434 | " B-MISC: (76, 181, 339) (0.4199, 0.2242, 0.2923)\n", 435 | " I-ORG: (735, 933, 1104) (0.7878, 0.6658, 0.7216)\n", 436 | " I-LOC: (191, 455, 325) (0.4198, 0.5877, 0.4897)\n", 437 | " I-MISC: (204, 481, 557) (0.4241, 0.3662, 0.3931)\n", 438 | "Macro-average precision, recall, F1: (0.671525, 0.661871, 0.660646)\n", 439 | "Item accuracy: 49238 / 51533 (0.9555)\n", 440 | "Instance accuracy: 852 / 1517 (0.5616)\n", 441 | "\n", 442 | "***** Epoch #2 *****\n", 443 | "Loss: 31176.026308\n", 444 | "Feature L2-norm: 32.274598\n", 445 | "Learning rate (eta): 0.022727\n", 446 | "Total number of feature updates: 16646\n", 447 | "Seconds required for this iteration: 0.466\n", 448 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 449 | " B-LOC: (708, 1018, 1084) (0.6955, 0.6531, 0.6736)\n", 450 | " O: (45101, 45611, 45355) 
(0.9888, 0.9944, 0.9916)\n", 451 | " B-ORG: (1053, 1711, 1400) (0.6154, 0.7521, 0.6770)\n", 452 | " B-PER: (594, 777, 735) (0.7645, 0.8082, 0.7857)\n", 453 | " I-PER: (589, 778, 634) (0.7571, 0.9290, 0.8343)\n", 454 | " B-MISC: (94, 264, 339) (0.3561, 0.2773, 0.3118)\n", 455 | " I-ORG: (384, 468, 1104) (0.8205, 0.3478, 0.4885)\n", 456 | " I-LOC: (166, 285, 325) (0.5825, 0.5108, 0.5443)\n", 457 | " I-MISC: (210, 621, 557) (0.3382, 0.3770, 0.3565)\n", 458 | "Macro-average precision, recall, F1: (0.657608, 0.627752, 0.629257)\n", 459 | "Item accuracy: 48899 / 51533 (0.9489)\n", 460 | "Instance accuracy: 813 / 1517 (0.5359)\n", 461 | "\n", 462 | "***** Epoch #3 *****\n", 463 | "Loss: 23705.719839\n", 464 | "Feature L2-norm: 35.255014\n", 465 | "Learning rate (eta): 0.021739\n", 466 | "Total number of feature updates: 24969\n", 467 | "Seconds required for this iteration: 0.472\n", 468 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 469 | " B-LOC: (808, 1210, 1084) (0.6678, 0.7454, 0.7044)\n", 470 | " O: (45244, 45771, 45355) (0.9885, 0.9976, 0.9930)\n", 471 | " B-ORG: (1061, 1403, 1400) (0.7562, 0.7579, 0.7570)\n", 472 | " B-PER: (588, 728, 735) (0.8077, 0.8000, 0.8038)\n", 473 | " I-PER: (565, 640, 634) (0.8828, 0.8912, 0.8870)\n", 474 | " B-MISC: (86, 130, 339) (0.6615, 0.2537, 0.3667)\n", 475 | " I-ORG: (857, 1148, 1104) (0.7465, 0.7763, 0.7611)\n", 476 | " I-LOC: (152, 282, 325) (0.5390, 0.4677, 0.5008)\n", 477 | " I-MISC: (170, 221, 557) (0.7692, 0.3052, 0.4370)\n", 478 | "Macro-average precision, recall, F1: (0.757699, 0.666091, 0.690108)\n", 479 | "Item accuracy: 49531 / 51533 (0.9612)\n", 480 | "Instance accuracy: 889 / 1517 (0.5860)\n", 481 | "\n", 482 | "***** Epoch #4 *****\n", 483 | "Loss: 21273.137466\n", 484 | "Feature L2-norm: 37.985723\n", 485 | "Learning rate (eta): 0.020833\n", 486 | "Total number of feature updates: 33292\n", 487 | "Seconds required for this iteration: 0.468\n", 488 | "Performance by label (#match, #model, 
#ref) (precision, recall, F1):\n", 489 | " B-LOC: (848, 1276, 1084) (0.6646, 0.7823, 0.7186)\n", 490 | " O: (44212, 44389, 45355) (0.9960, 0.9748, 0.9853)\n", 491 | " B-ORG: (784, 896, 1400) (0.8750, 0.5600, 0.6829)\n", 492 | " B-PER: (582, 686, 735) (0.8484, 0.7918, 0.8191)\n", 493 | " I-PER: (570, 647, 634) (0.8810, 0.8991, 0.8899)\n", 494 | " B-MISC: (166, 619, 339) (0.2682, 0.4897, 0.3466)\n", 495 | " I-ORG: (152, 155, 1104) (0.9806, 0.1377, 0.2415)\n", 496 | " I-LOC: (138, 219, 325) (0.6301, 0.4246, 0.5074)\n", 497 | " I-MISC: (467, 2646, 557) (0.1765, 0.8384, 0.2916)\n", 498 | "Macro-average precision, recall, F1: (0.702269, 0.655374, 0.609212)\n", 499 | "Item accuracy: 47919 / 51533 (0.9299)\n", 500 | "Instance accuracy: 793 / 1517 (0.5227)\n", 501 | "\n", 502 | "***** Epoch #5 *****\n", 503 | "Loss: 20806.661564\n", 504 | "Feature L2-norm: 40.673070\n", 505 | "Learning rate (eta): 0.020000\n", 506 | "Total number of feature updates: 41615\n", 507 | "Seconds required for this iteration: 0.460\n", 508 | "Performance by label (#match, #model, #ref) (precision, recall, F1):\n", 509 | " B-LOC: (689, 892, 1084) (0.7724, 0.6356, 0.6974)\n", 510 | " O: (45171, 45556, 45355) (0.9915, 0.9959, 0.9937)\n", 511 | " B-ORG: (1214, 1931, 1400) (0.6287, 0.8671, 0.7289)\n", 512 | " B-PER: (529, 574, 735) (0.9216, 0.7197, 0.8083)\n", 513 | " I-PER: (520, 553, 634) (0.9403, 0.8202, 0.8762)\n", 514 | " B-MISC: (77, 96, 339) (0.8021, 0.2271, 0.3540)\n", 515 | " I-ORG: (1009, 1678, 1104) (0.6013, 0.9139, 0.7254)\n", 516 | " I-LOC: (126, 182, 325) (0.6923, 0.3877, 0.4970)\n", 517 | " I-MISC: (57, 71, 557) (0.8028, 0.1023, 0.1815)\n", 518 | "Macro-average precision, recall, F1: (0.794790, 0.629970, 0.651378)\n", 519 | "Item accuracy: 49392 / 51533 (0.9585)\n", 520 | "Instance accuracy: 885 / 1517 (0.5834)\n", 521 | "\n", 522 | "SGD terminated with the maximum number of iterations\n", 523 | "Loss: 20806.661564\n", 524 | "Total seconds required for training: 3.350\n", 525 | "\n", 526 
| "Storing the model\n", 527 | "Number of active features: 96180 (96180)\n", 528 | "Number of active attributes: 76691 (83593)\n", 529 | "Number of active labels: 9 (9)\n", 530 | "Writing labels\n", 531 | "Writing attributes\n", 532 | "Writing feature references for transitions\n", 533 | "Writing feature references for attributes\n", 534 | "Seconds required: 0.329\n", 535 | "\n", 536 | ] 537 | 538 | log4 = [ 539 | "Feature generation\n", 540 | "type: CRF1d\n", 541 | "feature.minfreq: 0.000000\n", 542 | "feature.possible_states: 0\n", 543 | "feature.possible_transitions: 0\n", 544 | "0", 545 | ".", 546 | ".", 547 | ".", 548 | ".", 549 | "1", 550 | ".", 551 | ".", 552 | ".", 553 | ".", 554 | "2", 555 | ".", 556 | ".", 557 | ".", 558 | ".", 559 | "3", 560 | ".", 561 | ".", 562 | ".", 563 | ".", 564 | "4", 565 | ".", 566 | ".", 567 | ".", 568 | ".", 569 | "5", 570 | ".", 571 | ".", 572 | ".", 573 | ".", 574 | "6", 575 | ".", 576 | ".", 577 | ".", 578 | ".", 579 | "7", 580 | ".", 581 | ".", 582 | ".", 583 | ".", 584 | "8", 585 | ".", 586 | ".", 587 | ".", 588 | ".", 589 | "9", 590 | ".", 591 | ".", 592 | ".", 593 | ".", 594 | "10", 595 | "\n", 596 | "Number of features: 0\n", 597 | "Seconds required: 0.001\n", 598 | "\n", 599 | "L-BFGS optimization\n", 600 | "c1: 0.000000\n", 601 | "c2: 1.000000\n", 602 | "num_memories: 6\n", 603 | "max_iterations: 2147483647\n", 604 | "epsilon: 0.000010\n", 605 | "stop: 10\n", 606 | "delta: 0.000010\n", 607 | "linesearch: MoreThuente\n", 608 | "linesearch.max_iterations: 20\n", 609 | "\n", 610 | "L-BFGS terminated with error code (-1020)\n", 611 | "Total seconds required for training: 0.000\n", 612 | "\n", 613 | "Storing the model\n", 614 | "Number of active features: 0 (0)\n", 615 | "Number of active attributes: 0 (0)\n", 616 | "Number of active labels: 0 (0)\n", 617 | "Writing labels\n", 618 | "Writing attributes\n", 619 | "Writing feature references for transitions\n", 620 | "Writing feature references for attributes\n", 621 | 
"Seconds required: 0.000\n", 622 | "\n", 623 | ] 624 | 625 | 626 | def test_parser_log1(): 627 | """ 628 | >>> parser = TrainLogParser() 629 | >>> _apply_parser(parser, log1) 630 | Holdout group: 2 631 | ============== start 632 | 633 | Number of features: 3948 634 | Seconds required: 0.022 635 | ============== featgen_end 636 | 637 | L-BFGS optimization 638 | c1: 1.000000 639 | c2: 0.001000 640 | num_memories: 6 641 | max_iterations: 5 642 | epsilon: 0.000010 643 | stop: 10 644 | delta: 0.000010 645 | linesearch: MoreThuente 646 | linesearch.max_iterations: 20 647 | 648 | ============== prepared 649 | ***** Iteration #1 ***** 650 | Loss: 1450.519004 651 | Feature norm: 1.000000 652 | Error norm: 713.784994 653 | Active features: 1794 654 | Line search trials: 1 655 | Line search step: 0.000228 656 | Seconds required for this iteration: 0.008 657 | Performance by label (#match, #model, #ref) (precision, recall, F1): 658 | B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000) 659 | O: (306, 339, 306) (0.9027, 1.0000, 0.9488) 660 | B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000) 661 | B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000) 662 | I-PER: (0, 0, 4) (0.0000, 0.0000, 0.0000) 663 | B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000) 664 | I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000) 665 | I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000) 666 | I-MISC: (0, 0, 0) (******, ******, ******) 667 | Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426) 668 | Item accuracy: 306 / 339 (0.9027) 669 | Instance accuracy: 3 / 10 (0.3000) 670 | 671 | ============== iteration 672 | ***** Iteration #2 ***** 673 | Loss: 1363.687719 674 | Feature norm: 1.178396 675 | Error norm: 370.827506 676 | Active features: 1540 677 | Line search trials: 1 678 | Line search step: 1.000000 679 | Seconds required for this iteration: 0.004 680 | Performance by label (#match, #model, #ref) (precision, recall, F1): 681 | B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000) 682 | O: (306, 339, 306) (0.9027, 1.0000, 0.9488) 683 | B-ORG: (0, 
0, 9) (0.0000, 0.0000, 0.0000) 684 | B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000) 685 | I-PER: (0, 0, 4) (0.0000, 0.0000, 0.0000) 686 | B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000) 687 | I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000) 688 | I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000) 689 | I-MISC: (0, 0, 0) (******, ******, ******) 690 | Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426) 691 | Item accuracy: 306 / 339 (0.9027) 692 | Instance accuracy: 3 / 10 (0.3000) 693 | 694 | ============== iteration 695 | ***** Iteration #3 ***** 696 | Loss: 1309.171814 697 | Feature norm: 1.266322 698 | Error norm: 368.739493 699 | Active features: 1308 700 | Line search trials: 1 701 | Line search step: 1.000000 702 | Seconds required for this iteration: 0.003 703 | Performance by label (#match, #model, #ref) (precision, recall, F1): 704 | B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000) 705 | O: (306, 339, 306) (0.9027, 1.0000, 0.9488) 706 | B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000) 707 | B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000) 708 | I-PER: (0, 0, 4) (0.0000, 0.0000, 0.0000) 709 | B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000) 710 | I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000) 711 | I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000) 712 | I-MISC: (0, 0, 0) (******, ******, ******) 713 | Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426) 714 | Item accuracy: 306 / 339 (0.9027) 715 | Instance accuracy: 3 / 10 (0.3000) 716 | 717 | ============== iteration 718 | ***** Iteration #4 ***** 719 | Loss: 1019.561634 720 | Feature norm: 1.929814 721 | Error norm: 202.976154 722 | Active features: 1127 723 | Line search trials: 1 724 | Line search step: 1.000000 725 | Seconds required for this iteration: 0.003 726 | Performance by label (#match, #model, #ref) (precision, recall, F1): 727 | B-LOC: (0, 0, 6) (0.0000, 0.0000, 0.0000) 728 | O: (306, 339, 306) (0.9027, 1.0000, 0.9488) 729 | B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000) 730 | B-PER: (0, 0, 3) (0.0000, 0.0000, 0.0000) 731 | I-PER: 
(0, 0, 4) (0.0000, 0.0000, 0.0000) 732 | B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000) 733 | I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000) 734 | I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000) 735 | I-MISC: (0, 0, 0) (******, ******, ******) 736 | Macro-average precision, recall, F1: (0.100295, 0.111111, 0.105426) 737 | Item accuracy: 306 / 339 (0.9027) 738 | Instance accuracy: 3 / 10 (0.3000) 739 | 740 | ============== iteration 741 | ***** Iteration #5 ***** 742 | Loss: 782.637378 743 | Feature norm: 3.539391 744 | Error norm: 121.725020 745 | Active features: 1035 746 | Line search trials: 1 747 | Line search step: 1.000000 748 | Seconds required for this iteration: 0.003 749 | Performance by label (#match, #model, #ref) (precision, recall, F1): 750 | B-LOC: (2, 5, 6) (0.4000, 0.3333, 0.3636) 751 | O: (305, 318, 306) (0.9591, 0.9967, 0.9776) 752 | B-ORG: (0, 0, 9) (0.0000, 0.0000, 0.0000) 753 | B-PER: (2, 4, 3) (0.5000, 0.6667, 0.5714) 754 | I-PER: (4, 12, 4) (0.3333, 1.0000, 0.5000) 755 | B-MISC: (0, 0, 5) (0.0000, 0.0000, 0.0000) 756 | I-ORG: (0, 0, 5) (0.0000, 0.0000, 0.0000) 757 | I-LOC: (0, 0, 1) (0.0000, 0.0000, 0.0000) 758 | I-MISC: (0, 0, 0) (******, ******, ******) 759 | Macro-average precision, recall, F1: (0.243606, 0.332970, 0.268070) 760 | Item accuracy: 313 / 339 (0.9233) 761 | Instance accuracy: 3 / 10 (0.3000) 762 | 763 | ============== iteration 764 | L-BFGS terminated with the maximum number of iterations 765 | Total seconds required for training: 0.022 766 | 767 | ============== optimization_end 768 | Storing the model 769 | Number of active features: 1035 (3948) 770 | Number of active attributes: 507 (3350) 771 | Number of active labels: 9 (9) 772 | Writing labels 773 | Writing attributes 774 | Writing feature references for transitions 775 | Writing feature references for attributes 776 | Seconds required: 0.003 777 | 778 | ============== end 779 | 780 | >>> len(parser.iterations) 781 | 5 782 | >>> parser.iterations[3]['active_features'] 783 | 1127 784 | 
""" 785 | pass 786 | 787 | 788 | def test_parser_log2(): 789 | """ 790 | >>> parser = TrainLogParser() 791 | >>> _apply_parser(parser, log2) 792 | Feature generation 793 | ============== start 794 | 795 | Number of features: 4379 796 | Seconds required: 0.021 797 | ============== featgen_end 798 | 799 | Averaged perceptron 800 | max_iterations: 5 801 | epsilon: 0.000000 802 | 803 | ============== prepared 804 | ***** Iteration #1 ***** 805 | Loss: 16.359638 806 | Feature norm: 112.848688 807 | Seconds required for this iteration: 0.005 808 | 809 | ============== iteration 810 | ***** Iteration #2 ***** 811 | Loss: 12.449970 812 | Feature norm: 126.174821 813 | Seconds required for this iteration: 0.004 814 | 815 | ============== iteration 816 | ***** Iteration #3 ***** 817 | Loss: 9.451751 818 | Feature norm: 145.482678 819 | Seconds required for this iteration: 0.003 820 | 821 | ============== iteration 822 | ***** Iteration #4 ***** 823 | Loss: 8.652287 824 | Feature norm: 155.495167 825 | Seconds required for this iteration: 0.003 826 | 827 | ============== iteration 828 | ***** Iteration #5 ***** 829 | Loss: 7.442703 830 | Feature norm: 166.818487 831 | Seconds required for this iteration: 0.002 832 | 833 | ============== iteration 834 | Total seconds required for training: 0.017 835 | 836 | ============== optimization_end 837 | Storing the model 838 | Number of active features: 2265 (4379) 839 | Number of active attributes: 1299 (3350) 840 | Number of active labels: 9 (9) 841 | Writing labels 842 | Writing attributes 843 | Writing feature references for transitions 844 | Writing feature references for attributes 845 | Seconds required: 0.007 846 | 847 | ============== end 848 | """ 849 | pass 850 | 851 | 852 | def test_parser_log3(): 853 | """ 854 | >>> parser = TrainLogParser() 855 | >>> _apply_parser(parser, log3) 856 | Holdout group: 2 857 | ============== start 858 | 859 | Number of features: 96180 860 | Seconds required: 1.263 861 | ============== 
featgen_end 862 | 863 | Stochastic Gradient Descent (SGD) 864 | c2: 1.000000 865 | max_iterations: 5 866 | period: 10 867 | delta: 0.000001 868 | 869 | Calibrating the learning rate (eta) 870 | calibration.eta: 0.100000 871 | calibration.rate: 2.000000 872 | calibration.samples: 1000 873 | calibration.candidates: 10 874 | calibration.max_trials: 20 875 | Initial loss: 69781.655352 876 | Trial #1 (eta = 0.100000): 12808.890280 877 | Trial #2 (eta = 0.200000): 26716.801091 878 | Trial #3 (eta = 0.400000): 51219.321368 879 | Trial #4 (eta = 0.800000): 104398.795416 (worse) 880 | Trial #5 (eta = 0.050000): 7804.492475 881 | Trial #6 (eta = 0.025000): 6419.964967 882 | Trial #7 (eta = 0.012500): 6989.552193 883 | Trial #8 (eta = 0.006250): 8303.107921 884 | Trial #9 (eta = 0.003125): 9934.052819 885 | Trial #10 (eta = 0.001563): 11782.234687 886 | Trial #11 (eta = 0.000781): 13777.708878 887 | Trial #12 (eta = 0.000391): 15891.422697 888 | Trial #13 (eta = 0.000195): 18174.499245 889 | Trial #14 (eta = 0.000098): 20955.855446 890 | Best learning rate (eta): 0.025000 891 | Seconds required: 0.858 892 | 893 | ============== prepared 894 | ***** Epoch #1 ***** 895 | Loss: 36862.915596 896 | Feature L2-norm: 24.717729 897 | Learning rate (eta): 0.023810 898 | Total number of feature updates: 8323 899 | Seconds required for this iteration: 0.462 900 | Performance by label (#match, #model, #ref) (precision, recall, F1): 901 | B-LOC: (778, 1193, 1084) (0.6521, 0.7177, 0.6834) 902 | O: (45103, 45519, 45355) (0.9909, 0.9944, 0.9926) 903 | B-ORG: (1003, 1326, 1400) (0.7564, 0.7164, 0.7359) 904 | B-PER: (583, 764, 735) (0.7631, 0.7932, 0.7779) 905 | I-PER: (565, 681, 634) (0.8297, 0.8912, 0.8593) 906 | B-MISC: (76, 181, 339) (0.4199, 0.2242, 0.2923) 907 | I-ORG: (735, 933, 1104) (0.7878, 0.6658, 0.7216) 908 | I-LOC: (191, 455, 325) (0.4198, 0.5877, 0.4897) 909 | I-MISC: (204, 481, 557) (0.4241, 0.3662, 0.3931) 910 | Macro-average precision, recall, F1: (0.671525, 0.661871, 
0.660646) 911 | Item accuracy: 49238 / 51533 (0.9555) 912 | Instance accuracy: 852 / 1517 (0.5616) 913 | 914 | ============== iteration 915 | ***** Epoch #2 ***** 916 | Loss: 31176.026308 917 | Feature L2-norm: 32.274598 918 | Learning rate (eta): 0.022727 919 | Total number of feature updates: 16646 920 | Seconds required for this iteration: 0.466 921 | Performance by label (#match, #model, #ref) (precision, recall, F1): 922 | B-LOC: (708, 1018, 1084) (0.6955, 0.6531, 0.6736) 923 | O: (45101, 45611, 45355) (0.9888, 0.9944, 0.9916) 924 | B-ORG: (1053, 1711, 1400) (0.6154, 0.7521, 0.6770) 925 | B-PER: (594, 777, 735) (0.7645, 0.8082, 0.7857) 926 | I-PER: (589, 778, 634) (0.7571, 0.9290, 0.8343) 927 | B-MISC: (94, 264, 339) (0.3561, 0.2773, 0.3118) 928 | I-ORG: (384, 468, 1104) (0.8205, 0.3478, 0.4885) 929 | I-LOC: (166, 285, 325) (0.5825, 0.5108, 0.5443) 930 | I-MISC: (210, 621, 557) (0.3382, 0.3770, 0.3565) 931 | Macro-average precision, recall, F1: (0.657608, 0.627752, 0.629257) 932 | Item accuracy: 48899 / 51533 (0.9489) 933 | Instance accuracy: 813 / 1517 (0.5359) 934 | 935 | ============== iteration 936 | ***** Epoch #3 ***** 937 | Loss: 23705.719839 938 | Feature L2-norm: 35.255014 939 | Learning rate (eta): 0.021739 940 | Total number of feature updates: 24969 941 | Seconds required for this iteration: 0.472 942 | Performance by label (#match, #model, #ref) (precision, recall, F1): 943 | B-LOC: (808, 1210, 1084) (0.6678, 0.7454, 0.7044) 944 | O: (45244, 45771, 45355) (0.9885, 0.9976, 0.9930) 945 | B-ORG: (1061, 1403, 1400) (0.7562, 0.7579, 0.7570) 946 | B-PER: (588, 728, 735) (0.8077, 0.8000, 0.8038) 947 | I-PER: (565, 640, 634) (0.8828, 0.8912, 0.8870) 948 | B-MISC: (86, 130, 339) (0.6615, 0.2537, 0.3667) 949 | I-ORG: (857, 1148, 1104) (0.7465, 0.7763, 0.7611) 950 | I-LOC: (152, 282, 325) (0.5390, 0.4677, 0.5008) 951 | I-MISC: (170, 221, 557) (0.7692, 0.3052, 0.4370) 952 | Macro-average precision, recall, F1: (0.757699, 0.666091, 0.690108) 953 | Item 
accuracy: 49531 / 51533 (0.9612) 954 | Instance accuracy: 889 / 1517 (0.5860) 955 | 956 | ============== iteration 957 | ***** Epoch #4 ***** 958 | Loss: 21273.137466 959 | Feature L2-norm: 37.985723 960 | Learning rate (eta): 0.020833 961 | Total number of feature updates: 33292 962 | Seconds required for this iteration: 0.468 963 | Performance by label (#match, #model, #ref) (precision, recall, F1): 964 | B-LOC: (848, 1276, 1084) (0.6646, 0.7823, 0.7186) 965 | O: (44212, 44389, 45355) (0.9960, 0.9748, 0.9853) 966 | B-ORG: (784, 896, 1400) (0.8750, 0.5600, 0.6829) 967 | B-PER: (582, 686, 735) (0.8484, 0.7918, 0.8191) 968 | I-PER: (570, 647, 634) (0.8810, 0.8991, 0.8899) 969 | B-MISC: (166, 619, 339) (0.2682, 0.4897, 0.3466) 970 | I-ORG: (152, 155, 1104) (0.9806, 0.1377, 0.2415) 971 | I-LOC: (138, 219, 325) (0.6301, 0.4246, 0.5074) 972 | I-MISC: (467, 2646, 557) (0.1765, 0.8384, 0.2916) 973 | Macro-average precision, recall, F1: (0.702269, 0.655374, 0.609212) 974 | Item accuracy: 47919 / 51533 (0.9299) 975 | Instance accuracy: 793 / 1517 (0.5227) 976 | 977 | ============== iteration 978 | ***** Epoch #5 ***** 979 | Loss: 20806.661564 980 | Feature L2-norm: 40.673070 981 | Learning rate (eta): 0.020000 982 | Total number of feature updates: 41615 983 | Seconds required for this iteration: 0.460 984 | Performance by label (#match, #model, #ref) (precision, recall, F1): 985 | B-LOC: (689, 892, 1084) (0.7724, 0.6356, 0.6974) 986 | O: (45171, 45556, 45355) (0.9915, 0.9959, 0.9937) 987 | B-ORG: (1214, 1931, 1400) (0.6287, 0.8671, 0.7289) 988 | B-PER: (529, 574, 735) (0.9216, 0.7197, 0.8083) 989 | I-PER: (520, 553, 634) (0.9403, 0.8202, 0.8762) 990 | B-MISC: (77, 96, 339) (0.8021, 0.2271, 0.3540) 991 | I-ORG: (1009, 1678, 1104) (0.6013, 0.9139, 0.7254) 992 | I-LOC: (126, 182, 325) (0.6923, 0.3877, 0.4970) 993 | I-MISC: (57, 71, 557) (0.8028, 0.1023, 0.1815) 994 | Macro-average precision, recall, F1: (0.794790, 0.629970, 0.651378) 995 | Item accuracy: 49392 / 51533 
(0.9585) 996 | Instance accuracy: 885 / 1517 (0.5834) 997 | 998 | ============== iteration 999 | SGD terminated with the maximum number of iterations 1000 | Loss: 20806.661564 1001 | Total seconds required for training: 3.350 1002 | 1003 | ============== optimization_end 1004 | Storing the model 1005 | Number of active features: 96180 (96180) 1006 | Number of active attributes: 76691 (83593) 1007 | Number of active labels: 9 (9) 1008 | Writing labels 1009 | Writing attributes 1010 | Writing feature references for transitions 1011 | Writing feature references for attributes 1012 | Seconds required: 0.329 1013 | 1014 | ============== end 1015 | 1016 | """ 1017 | pass 1018 | 1019 | 1020 | def test_parser_log4(): 1021 | """ 1022 | >>> parser = TrainLogParser() 1023 | >>> _apply_parser(parser, log4) 1024 | Feature generation 1025 | ============== start 1026 | 1027 | Number of features: 0 1028 | Seconds required: 0.001 1029 | ============== featgen_end 1030 | 1031 | L-BFGS optimization 1032 | c1: 0.000000 1033 | c2: 1.000000 1034 | num_memories: 6 1035 | max_iterations: 2147483647 1036 | epsilon: 0.000010 1037 | stop: 10 1038 | delta: 0.000010 1039 | linesearch: MoreThuente 1040 | linesearch.max_iterations: 20 1041 | 1042 | L-BFGS terminated with error code (-1020) 1043 | ============== prepare_error 1044 | Total seconds required for training: 0.000 1045 | 1046 | ============== optimization_end 1047 | Storing the model 1048 | Number of active features: 0 (0) 1049 | Number of active attributes: 0 (0) 1050 | Number of active labels: 0 (0) 1051 | Writing labels 1052 | Writing attributes 1053 | Writing feature references for transitions 1054 | Writing feature references for attributes 1055 | Seconds required: 0.000 1056 | 1057 | ============== end 1058 | 1059 | """ 1060 | pass 1061 | --------------------------------------------------------------------------------