├── .coveragerc
├── .github
    ├── dependabot.yml
    └── workflows
    │   └── python-test.yml
├── .gitignore
├── AUTHORS.rst
├── CHANGES.rst
├── MANIFEST.in
├── README.rst
├── bench.ini
├── benchmarks
    ├── __init__.py
    ├── bench.py
    ├── shrink-unigrams.ipynb
    ├── speed.py
    └── utils.py
├── dev_data
    ├── toy_dict.xml
    └── unigrams.txt
├── docs
    ├── Makefile
    ├── _static
    │   └── rtfd_overrides.css
    ├── conf.py
    ├── glossary.rst
    ├── index.rst
    ├── internals
    │   ├── char-substitutes.rst
    │   ├── dict.rst
    │   ├── index.rst
    │   ├── prediction.rst
    │   └── umlauts.rst
    ├── make.bat
    ├── misc
    │   ├── 2trie.rst
    │   ├── _authors.rst
    │   ├── _changes.rst
    │   ├── api_reference.rst
    │   ├── citing.rst
    │   └── index.rst
    └── user
    │   ├── contributing.rst
    │   ├── grammemes.rst
    │   ├── guide.rst
    │   └── index.rst
├── pymorphy2
    ├── __init__.py
    ├── analyzer.py
    ├── cli.py
    ├── dawg.py
    ├── lang
    │   ├── __init__.py
    │   ├── ru
    │   │   ├── __init__.py
    │   │   └── config.py
    │   └── uk
    │   │   ├── __init__.py
    │   │   ├── _prefixes.py
    │   │   └── config.py
    ├── opencorpora_dict
    │   ├── __init__.py
    │   ├── compile.py
    │   ├── parse.py
    │   ├── preprocess.py
    │   ├── probability.py
    │   ├── storage.py
    │   └── wrapper.py
    ├── shapes.py
    ├── tagset.py
    ├── tokenizers.py
    ├── units
    │   ├── __init__.py
    │   ├── abbreviations.py
    │   ├── base.py
    │   ├── by_analogy.py
    │   ├── by_hyphen.py
    │   ├── by_lookup.py
    │   ├── by_shape.py
    │   ├── unkn.py
    │   └── utils.py
    ├── utils.py
    └── version.py
├── setup.cfg
├── setup.py
├── tests
    ├── conftest.py
    ├── test_analyzer.py
    ├── test_cli.py
    ├── test_dict_loading.py
    ├── test_inflection.py
    ├── test_lexemes.py
    ├── test_numeral_agreement.py
    ├── test_opencorpora_dict.py
    ├── test_parsing.py
    ├── test_prefix_matching.py
    ├── test_result_wrapper.py
    ├── test_tagset.py
    ├── test_threading.py
    ├── test_tokenizers.py
    ├── test_utils.py
    └── utils.py
└── tox.ini


/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | branch = True
3 | omit =
4 |     benchmarks/*
5 |     setup.py
6 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 |   - package-ecosystem: "pip"
 4 |     directory: "/"
 5 |     schedule:
 6 |       interval: "monthly"
 7 |     groups:
 8 |       dev-dependencies:
 9 |         patterns:
10 |           - "pytest*"
11 |           - "flake8*"
12 |           - "black"
13 |           - "isort"
14 |           - "ruff"
15 | 
16 |   - package-ecosystem: "github-actions"
17 |     directory: "/"
18 |     schedule:
19 |       interval: "monthly"
20 |     groups:
21 |       github-actions:
22 |         patterns:
23 |           - "*"
24 | 


--------------------------------------------------------------------------------
/.github/workflows/python-test.yml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ "master" ]
 6 |     paths:
 7 |       - 'tox.ini'
 8 |       - '**.py'
 9 |       - '.github/workflows/python-test.yml'
10 | 
11 |   pull_request:
12 |     branches: [ "master" ]
13 |     paths:
14 |       - 'tox.ini'
15 |       - '**.py'
16 |       - '.github/workflows/python-test.yml'
17 | 
18 | permissions:
19 |   contents: read
20 | 
21 | jobs:
22 |   build:
23 |     runs-on: ubuntu-latest
24 |     strategy:
25 |       fail-fast: false
26 |       matrix:
27 |         python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "pypy3.10" ]
28 |         speed: ["slow", "fast"]
29 |         
30 |     steps:
31 |     - uses: actions/checkout@v4
32 |     - name: Set up Python ${{ matrix.python-version }}
33 |       uses: actions/setup-python@v5
34 |       with:
35 |         python-version: ${{ matrix.python-version }}
36 |         
37 |     - name: Install dependencies
38 |       run: |
39 |         python -m pip install --upgrade pip
40 |         pip install .
41 |         pip install tox
42 |         
43 |     - name: Test with tox
44 |       env:
45 |         TOXENV: ${{ matrix.speed }}
46 |       run: tox
47 | 
48 |     - name: Upload coverage data to coveralls.io
49 |       env:
50 |         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
51 |       run: |
52 |         pip install coveralls
53 |         coveralls --service=github
54 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | 
 2 | *.pyc
 3 | *.*~
 4 | *.swp
 5 | Thumbs.db
 6 | 
 7 | build/
 8 | docs/_build
 9 | dist/
10 | pymorphy2.egg-info
11 | stuff/
12 | htmlcov/
13 | .coverage
14 | .cache/
15 | 
16 | .idea/
17 | .rope*
18 | PYSMELLTAGS
19 | reports
20 | .ipynb_checkpoints
21 | *.ipynb
22 | *.prof
23 | *.html
24 | *.so
25 | 
26 | .tox
27 | MANIFEST
28 | 
29 | stuff
30 | 


--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
 1 | Authors and Contributors
 2 | ========================
 3 | 
 4 | * Mikhail Korobov
 5 | * @radixvinni
 6 | * @ivirabyan
 7 | * @anti-social
 8 | * @insolor
 9 | * @kuk
10 | * @underoll
11 | * @valentino-sm
12 | 
13 | If you contributed to pymorphy2, please add yourself to this list
14 | (or update your contact information).
15 | 
16 | Many people contributed to pymorphy2 predecessor, pymorphy; they are
17 | listed here: https://github.com/kmike/pymorphy/blob/master/AUTHORS.rst
18 | 


--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
  1 | 
  2 | История изменений
  3 | =================
  4 | 
  5 | 
  6 | 0.9.1 (2020-09-27)
  7 | ------------------
  8 | 
  9 | Исправлено обнаружение словарей в случае, когда pymorphy2 установлен
 10 | после запуска процесса. Это типичная ситуация в Jupyter Notebook или
 11 | Google Colab - в начале блокнота установить зависимости
 12 | (``!pip install pymorphy2``); pymorphy2==0.9 не работал в этом случае без
 13 | перезапуска блокнота.
 14 | 
 15 | 0.9 (2020-09-20)
 16 | ----------------
 17 | 
 18 | Новые возможности:
 19 | 
 20 | - Добавлена экспериментальная поддержка украинского языка. См. документацию.
 21 | - Улучшена утилита командной строки. См. ``pymorphy --help``.
 22 | - Добавлена поддержка Python 3.7 и 3.8.
 23 | 
 24 | Обратно-несовместимые изменения:
 25 | 
 26 | - Внутренняя организация кода сильно поменялась
 27 | - Python 2.6 и 3.2 - 3.4 больше не поддерживаются. Python 2.7 пока
 28 |   поддерживается, но поддержка Python 2.x будет убрана в pymorphy2 v1.0.
 29 | 
 30 | Исправления ошибок, небольшие улучшения:
 31 | 
 32 | - Исправлена некорректная работа MorphAnalyzer в многопоточных программах
 33 | - улучшено поведение метода .inflect
 34 | - исправлена ошибка, вызванная некорректным кешированием нормальных форм
 35 | - Команды для скачивания и сборки словарей перенесены в пакет pymorphy2-dicts
 36 | - Ускорение сборки словарей
 37 | - Небольшое ускорение токенизатора
 38 | - улучшения в тестах и документации
 39 | 
 40 | 
 41 | 0.8 (2014-06-06)
 42 | ----------------
 43 | 
 44 | - pymorphy2 теперь использует setuptools;
 45 | - на pypi доступен пакет в формате wheel;
 46 | - зависимости устанавливаются автоматически;
 47 | - можно установить "быструю" версию через ``pip install pymorphy2[fast]``;
 48 | - копия docopt больше не распространяется вместе с pymorphy2;
 49 |   пакет ``pymorphy2.vendor`` больше не доступен.
 50 | 
 51 | В этом релизе изменен способ установки pymorphy2; никаких изменений
 52 | в разборе по сравнению с 0.7 нет.
 53 | 
 54 | 
 55 | 0.7 (2014-05-26)
 56 | ----------------
 57 | 
 58 | - Методы :meth:`~.MorphAnalyzer.parse` и :meth:`~.MorphAnalyzer.tag`
 59 |   теперь всегда возвращают хотя бы один вариант разбора:
 60 |   если разбор не удался, то вместо пустого списка теперь возвращается
 61 |   список с одним элементом UNKN;
 62 | - функция :func:`pymorphy2.shapes.restore_word_case` переименована
 63 |   в :func:`pymorphy2.shapes.restore_capitalization`;
 64 | - проверена совместимость с Python 3.4;
 65 | - в список для замен падежей OpencorporaTag.RARE_CASES добавлены граммемы
 66 |   gen1, acc1 и loc1 - они не используются в pymorphy2, но могут встречаться
 67 |   в выгрузке корпуса OpenCorpora;
 68 | - убран DeprecationWarning при использовании psutil < 2.x;
 69 | - небольшие улучшения в документации.
 70 | 
 71 | 0.6.1 (2014-04-23)
 72 | ------------------
 73 | 
 74 | - Для инициалов добавлена граммема Init.
 75 | 
 76 | 0.6 (2014-04-22)
 77 | ----------------
 78 | 
 79 | - Заглавные буквы предсказываются как инициалы;
 80 | - улучшен внутренний API для предсказателей - флаг terminal больше не нужен;
 81 | - улучшения в тестах.
 82 | 
 83 | Если вы использовали параметр ``units`` в конструкторе ``MorphAnalyzer``,
 84 | то вам нужно будет обновить код, т.к. вместо флага terminal теперь
 85 | предсказатели нужно группировать в list-ы в параметре ``units``.
 86 | 
 87 | 0.5 (2013-11-05)
 88 | ----------------
 89 | 
 90 | - Методы ``MorphAnalyzer.cyr2lat``, ``MorphAnalyzer.lat2cyr`` и атрибут
 91 |   ``OpencorporaTag.cyr_repr`` для преобразования между тегами/граммемами,
 92 |   записанными латиницей и кириллицей;
 93 | - тег для целых чисел теперь ``NUMB,intg``; для вещественных - ``NUMB,real``
 94 |   (раньше для всех был просто ``NUMB``);
 95 | - ``KnownSuffixAnalyzer`` теперь не вызывается для слов короче 4 символов.
 96 | 
 97 | 0.4 (2013-10-19)
 98 | ----------------
 99 | 
100 | - Parse.estimate переименован в score и содержит теперь
101 |   оценку P(tag|word) на основе данных из OpenCorpora;
102 | - по умолчанию результаты разбора сортируются по score.
103 | 
104 | То, что результатам сопоставляется оценка P(tag|word), может в некоторых
105 | случаях снизить скорость разбора раза в 1.5 - 2. Если эти оценки не нужны,
106 | создайте экземпляр MorphAnalyzer с параметром ``probability_estimator_cls=None``.
107 | 
108 | Для обновления требуется обновить pymorphy2-dicts до версии >= 2.4,
109 | а также библиотеки DAWG или DAWG-Python до версий >= 0.7.
110 | 
111 | 
112 | 0.3.5 (2013-06-30)
113 | ------------------
114 | 
115 | - Препроцессинг словаря: loc1/gen1/acc1 заменяются на loct/gent/accs;
116 |   варианты написания тегов унифицируются (чтоб их было меньше);
117 | - исправлено согласование слов с числительными;
118 | - при склонении слов в loc2/gen2/acc2/voct слово ставится в loct/gent/accs/nomn,
119 |   если вариантов с loc2/gen2/acc2/voct не найдено.
120 | 
121 | Для полноценного обновления лучше обновить pymorphy2-dicts до версии >= 2.2.
122 | 
123 | 0.3.4 (2013-04-29)
124 | ------------------
125 | 
126 | - Добавлен метод ``Parse.make_agree_with_number`` для согласования слов
127 |   с числительными;
128 | - небольшие улучшения в документации.
129 | 
130 | 0.3.3 (2013-04-12)
131 | ------------------
132 | 
133 | - Исправлен тег, который выдает ``RomanNumberAnalyzer`` (теперь это ROMN,
134 |   как в OpenCorpora);
135 | - добавлена функция ``pymorphy2.tokenizers.simple_word_tokenize``,
136 |   которая разбивает текст по пробелам и пунктуации (но не дефису);
137 | - исправлена ошибка с разбором слов вроде "ретро-fm" (pymorphy2
138 |   раньше падал с исключением).
139 | 
140 | 0.3.2 (2013-04-03)
141 | ------------------
142 | 
143 | - добавлен ``RomanNumberAnalyzer`` для разбора римских чисел;
144 | - ``MorphAnalyzer`` и ``OpencorporaTag`` теперь можно сериализовывать
145 |   с помощью pickle;
146 | - улучшены тесты;
147 | - при компиляции словаря версия xml печатается раньше.
148 | 
149 | 0.3.1 (2013-03-12)
150 | ------------------
151 | 
152 | - Поправлен метод ``MorphAnalyzer.word_is_known``, который раньше
153 |   учитывал регистр слова (что неправильно);
154 | - исправлена ошибка в разборе слов с дефисом (тех, у которых лишний
155 |   дефис справа или слева).
156 | 
157 | 0.3 (2013-03-11)
158 | ----------------
159 | 
160 | - Рефакторинг: теперь при необходимости можно дописывать свои
161 |   "шаги" морфологического анализа ("предсказатели")
162 |   и комбинировать их с существующими (документация пока не готова,
163 |   и API может поменяться);
164 | - на вход больше не обязательно подавать слова в нижнем регистре
165 |   (но на выходе при этом регистр сохраняться не обязан - используйте
166 |   функцию ``pymorphy2.shapes.restore_word_case``, если требуется
167 |   восстановить регистр полученных слов);
168 | - улучшено предсказание неизвестных слов по словообразовательным префиксам
169 |   (учитывается больше таких префиксов);
170 | - реализован разбор (и склонение) слов с дефисами;
171 | - результаты разбора теперь включают в себя полную информацию о том,
172 |   как слово разбиралось; наличие ``para_id`` и ``idx`` при этом
173 |   больше не обязательно;
174 | - анализатор теперь отмечает пунктуацию тегом PNCT, числа - тегом NUMB,
175 |   слова, записанные латиницей - тегом LATN;
176 | - улучшено предсказание по неизвестному префиксу (добавлено ограничение по
177 |   граммеме Apro);
178 | - улучшения в тестах и бенчмарках;
179 | - удален атрибут ``morph.dict_meta`` (используйте ``morph.dictionary.meta``);
180 | - удален (возможно, временно) метод ``MorphAnalyzer.inflect``
181 |   (используйте метод ``inflect`` у результата разбора);
182 | - удален метод ``MorphAnalyzer.decline`` (используйте ``parse.lexeme``);
183 | - удалено свойство ``Parse.paradigm``.
184 | 
185 | В результате этих изменений улучшилось качество разбора, качество склонения
186 | и возможности по расширению библиотеки (в т.ч. для настройки под конкретную
187 | задачу), но скорость работы "из коробки" по сравнению с 0.2 снизилась
188 | примерно на треть.
189 | 
190 | 0.2 (2013-02-18)
191 | ----------------
192 | 
193 | - Улучшения в предсказателе: учет словоизменительных префиксов;
194 | - улучшения в предсказателе: равноценные варианты разбора не отбрасываются;
195 | - изменена схема проверки совместимости словарей;
196 | - изменен формат словарей (нужно обновить pymorphy2-dicts до 2.0);
197 | - добавлено свойство ``Parse.paradigm``.
198 | 
199 | 
200 | 0.1 (2013-02-14)
201 | ----------------
202 | 
203 | Первый альфа-релиз. Реализована основа: эффективный разбор и склонение,
204 | обновление словарей, полная поддержка буквы ё.
205 | 
206 | Многие вещи, которые были доступны в pymorphy, пока не работают
207 | (разбор слов с дефисом, разбор фамилий, поддержка шаблонов django,
208 | утилиты из contrib).
209 | 
210 | Кроме того, API пока не зафиксирован и может меняться в последующих релизах.
211 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include AUTHORS.rst
 2 | include README.rst
 3 | include CHANGES.rst
 4 | include docs/Makefile
 5 | include docs/make.bat
 6 | include docs/conf.py
 7 | 
 8 | recursive-include docs *.rst
 9 | recursive-include benchmarks *.py
10 | 
11 | recursive-exclude benchmarks *.py
12 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | pymorphy2
 2 | =========
 3 | 
 4 | Morphological analyzer (POS tagger + inflection engine)
 5 | for Russian and Ukrainian languages. License is MIT.
 6 | 
 7 | .. image:: https://img.shields.io/pypi/v/pymorphy2.svg
 8 |    :target: https://pypi.python.org/pypi/pymorphy2
 9 |    :alt: PyPI Version
10 | 
11 | .. image:: https://github.com/pymorphy2-fork/pymorphy2/actions/workflows/python-test.yml/badge.svg
12 |    :target: https://github.com/pymorphy2-fork/pymorphy2/actions/workflows/python-test.yml
13 |    :alt: Test Status
14 | 
15 | .. image:: https://coveralls.io/repos/github/pymorphy2-fork/pymorphy2/badge.svg?branch=master
16 |    :target: https://coveralls.io/github/pymorphy2-fork/pymorphy2?branch=master
17 |    :alt: Code Coverage
18 | 
19 | .. image:: https://readthedocs.org/projects/pymorphy2/badge/?version=latest
20 |    :target: https://pymorphy2.readthedocs.io/
21 |    :alt: Documentation
22 | 
23 | * docs: https://pymorphy2.readthedocs.io
24 | * changelog: https://github.com/pymorphy2-fork/pymorphy2/blob/master/CHANGES.rst
25 | * source code: github_
26 | * bug tracker: https://github.com/pymorphy2-fork/pymorphy2/issues
27 | * discussions: https://github.com/orgs/pymorphy2-fork/discussions
28 | 
29 | .. _github: https://github.com/pymorphy2-fork/pymorphy2
30 | 
31 | Citing
32 | ------
33 | 
34 | ::
35 | 
36 |     Korobov M.: Morphological Analyzer and Generator for Russian and
37 |     Ukrainian Languages // Analysis of Images, Social Networks and Texts,
38 |     pp 320-332 (2015).
39 | 
40 | Links:
41 | 
42 | * `Springer <https://link.springer.com/chapter/10.1007%2F978-3-319-26123-2_31>`_
43 | * `PDF <https://arxiv.org/pdf/1503.07283v1.pdf>`_
44 | 
45 | BibTeX::
46 | 
47 |    @incollection{
48 |       year={2015},
49 |       isbn={978-3-319-26122-5},
50 |       booktitle={Analysis of Images, Social Networks and Texts},
51 |       volume={542},
52 |       series={Communications in Computer and Information Science},
53 |       editor={Khachay, Mikhail Yu. and Konstantinova, Natalia and Panchenko, Alexander and Ignatov, Dmitry I. and Labunets, Valeri G.},
54 |       doi={10.1007/978-3-319-26123-2_31},
55 |       title={Morphological Analyzer and Generator for Russian and Ukrainian Languages},
56 |       url={http://dx.doi.org/10.1007/978-3-319-26123-2_31},
57 |       publisher={Springer International Publishing},
58 |       keywords={Morphological analyzer; Russian; Ukrainian; Morphological generator; Open source; OpenCorpora; LanguageTool; pymorphy2; pymorphy},
59 |       author={Korobov, Mikhail},
60 |       pages={320-332},
61 |       language={English}
62 |    }
63 | 


--------------------------------------------------------------------------------
/bench.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py37,py38,py39,py310,py311,pypy3
 3 | 
 4 | [base]
 5 | deps =
 6 |     pytest
 7 |     psutil
 8 | 
 9 | [testenv]
10 | deps=
11 |     dawg2 >= 0.7.7
12 |     tqdm
13 |     {[base]deps}
14 | 
15 | ;setenv =
16 | ;    PYMORPHY2_DICT_PATH = ../pymorphy2-dicts/pymorphy2_dicts/data
17 | 
18 | commands=
19 |     python setup.py install
20 |     pymorphy dict mem_usage
21 |     python benchmarks/bench.py run []
22 | 
23 | commands=
24 |     python setup.py install
25 |     pymorphy dict mem_usage
26 |     python benchmarks/bench.py run {posargs:--repeats=37}
27 | 
28 | 
29 | [testenv:pypy3]
30 | deps=
31 |     {[base]deps}
32 | 
33 | commands=
34 |     python setup.py install
35 | 
36 |     ; psutil doesn't work with pypy3 + OS X
37 |     ; pymorphy dict mem_usage
38 | 
39 |     python benchmarks/bench.py run {posargs:--repeats=37}
40 | 
41 | [testenv:py37_no_compiler]
42 | basepython = python3.7
43 | deps=
44 |     {[base]deps}
45 | 


--------------------------------------------------------------------------------
/benchmarks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pymorphy2-fork/pymorphy2/8fdb9e0d066c1ed7cc6bb0ac1f747b7960eeeb18/benchmarks/__init__.py


--------------------------------------------------------------------------------
/benchmarks/bench.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Pymorphy2 benchmark utility.
 3 | 
 4 | Usage:
 5 |     bench.py run [--dict=<DICT_PATH>] [--repeats=<NUM>] [--verbose]
 6 |     bench.py -h | --help
 7 |     bench.py --version
 8 | 
 9 | Options:
10 |     -d --dict <DICT_PATH>   Use dictionary from <DICT_PATH>
11 |     -r --repeats <NUM>      Number of times to run each benchmarks [default: 5]
12 |     -v --verbose            Be more verbose
13 | 
14 | """
15 | import logging
16 | import os
17 | import sys
18 | 
19 | from docopt import docopt
20 | 
21 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
22 | 
23 | import pymorphy2
24 | from benchmarks import speed
25 | 
26 | logger = logging.getLogger('pymorphy2.bench')
27 | logger.addHandler(logging.StreamHandler())
28 | 
29 | 
def main():
    """
    Parse the command line with docopt and run the requested benchmark.

    The logger level is DEBUG when ``--verbose`` is given and INFO otherwise.
    Always returns 0, which the ``__main__`` guard passes to ``sys.exit``.
    """
    options = docopt(__doc__, version=pymorphy2.__version__)

    level = logging.DEBUG if options['--verbose'] else logging.INFO
    logger.setLevel(level)

    if not options['run']:
        return 0

    speed.bench_all(
        repeats=int(options['--repeats']),
        dict_path=options['--dict'],
    )
    return 0
46 | 
47 | 
48 | if __name__ == '__main__':
49 |     sys.exit(main())
50 | 


--------------------------------------------------------------------------------
/benchmarks/shrink-unigrams.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "metadata": {
 3 |   "name": "shrink-unigrams"
 4 |  },
 5 |  "nbformat": 3,
 6 |  "nbformat_minor": 0,
 7 |  "worksheets": [
 8 |   {
 9 |    "cells": [
10 |     {
11 |      "cell_type": "markdown",
12 |      "metadata": {},
13 |      "source": [
14 |       "This notebook prepares a reduced file with unigrams (to make benchmark runs faster)."
15 |      ]
16 |     },
17 |     {
18 |      "cell_type": "code",
19 |      "collapsed": false,
20 |      "input": [
21 |       "import random\n",
22 |       "import math\n",
23 |       "random.seed(0)\n",
24 |       "\n",
25 |       "TOTAL_UNIGRAMS = 2000\n",
26 |       "HEAD_SIZE = 400\n",
27 |       "COUNTS_SCALE = 0.05\n",
28 |       "OUT_PATH = '../dev_data/unigrams.txt'\n",
29 |       "\n",
30 |       "# unigrams file downloaded from http://opencorpora.org/?page=downloads\n",
31 |       "SOURCE_UNIGRAMS_PATH = '../dev_data/unigrams.cyr.lc' "
32 |      ],
33 |      "language": "python",
34 |      "metadata": {},
35 |      "outputs": [],
36 |      "prompt_number": 109
37 |     },
38 |     {
39 |      "cell_type": "code",
40 |      "collapsed": false,
41 |      "input": [
42 |       "def _scaled(txt):\n",
43 |       "    return math.ceil(int(txt)*COUNTS_SCALE)\n",
44 |       "\n",
45 |       "with open(SOURCE_UNIGRAMS_PATH, 'rb') as f:\n",
46 |       "    lines = [line.split() for line in f.read().decode('utf8').split('\\n') if line]\n",
47 |       "    unigrams = [(word, _scaled(count), _scaled(ipm)) for (word, count, ipm) in lines]"
48 |      ],
49 |      "language": "python",
50 |      "metadata": {},
51 |      "outputs": [],
52 |      "prompt_number": 110
53 |     },
54 |     {
55 |      "cell_type": "code",
56 |      "collapsed": false,
57 |      "input": [
58 |       "head, tail = unigrams[:HEAD_SIZE], unigrams[HEAD_SIZE:]\n",
59 |       "take_ids = sorted(random.sample(range(len(tail)), TOTAL_UNIGRAMS-HEAD_SIZE))\n",
60 |       "result = head + [tail[i] for i in take_ids]\n",
61 |       "assert len(result) == TOTAL_UNIGRAMS"
62 |      ],
63 |      "language": "python",
64 |      "metadata": {},
65 |      "outputs": [],
66 |      "prompt_number": 111
67 |     },
68 |     {
69 |      "cell_type": "code",
70 |      "collapsed": false,
71 |      "input": [
72 |       "with open(OUT_PATH, 'wb') as f:\n",
73 |       "    f.write(\"\\n\".join([\"%s\\t%s\\t%s\" % r for r in result]).encode('utf8'))"
74 |      ],
75 |      "language": "python",
76 |      "metadata": {},
77 |      "outputs": [],
78 |      "prompt_number": 112
79 |     }
80 |    ],
81 |    "metadata": {}
82 |   }
83 |  ]
84 | }


--------------------------------------------------------------------------------
/benchmarks/speed.py:
--------------------------------------------------------------------------------
  1 | import codecs
  2 | import datetime
  3 | import functools
  4 | import logging
  5 | import os
  6 | 
  7 | from benchmarks import utils
  8 | from pymorphy2 import MorphAnalyzer
  9 | 
 10 | logger = logging.getLogger('pymorphy2.bench')
 11 | 
 12 | DATA_PATH = os.path.join(
 13 |     os.path.dirname(__file__),
 14 |     '..',
 15 |     'dev_data',
 16 |     'unigrams.txt'
 17 | )
 18 | 
 19 | def load_words(path=DATA_PATH):
 20 |     words = []
 21 |     with codecs.open(path, 'r', 'utf8') as f:
 22 |         for line in f:
 23 |             word, count, ipm = line.split()
 24 |             count = int(count)
 25 |             words.append((word.lower(), count))
 26 |     return words
 27 | 
 28 | def get_total_usages(words):
 29 |     return sum(w[1] for w in words)
 30 | 
 31 | def bench_tag(morph, words, total_usages, repeats):
 32 |     word_no_umlauts = [(w[0].replace('ё', 'е'), w[1]) for w in words]
 33 | 
 34 |     def _run():
 35 |         for word, cnt in words:
 36 |             for x in range(cnt):
 37 |                 morph.tag(word)
 38 | 
 39 |     def _run_nofreq():
 40 |         for word, cnt in words:
 41 |             morph.tag(word)
 42 | 
 43 |     def _run_no_umlauts():
 44 |         for word, cnt in word_no_umlauts:
 45 |             morph.tag(word)
 46 | 
 47 |     def _run_str():
 48 |         for word, cnt in words:
 49 |             str(morph.tag(word))
 50 | 
 51 |     measure = functools.partial(utils.measure, repeats=repeats)
 52 | 
 53 |     logger.info("    morph.tag(w): %0.0f words/sec (considering word frequencies)", measure(_run, total_usages))
 54 |     logger.info("    morph.tag(w): %0.0f words/sec", measure(_run_nofreq, len(words)))
 55 |     logger.info("    morph.tag(w): %0.0f words/sec (umlauts removed from input)", measure(_run_no_umlauts, len(words)))
 56 |     logger.info("    morph.tag(w): %0.0f words/sec (str(tag) called)", measure(_run_str, len(words)))
 57 | 
 58 | 
 59 | def bench_parse(morph, words, total_usages, repeats):
 60 |     def _run():
 61 |         for word, cnt in words:
 62 |             for x in range(cnt):
 63 |                 morph.parse(word)
 64 | 
 65 |     def _run_nofreq():
 66 |         for word, cnt in words:
 67 |             morph.parse(word)
 68 | 
 69 |     def _run_normal_form():
 70 |         for word, cnt in words:
 71 |             [p.normal_form for p in morph.parse(word)]
 72 | 
 73 |     def _run_normalized():
 74 |         for word, cnt in words:
 75 |             [p.normalized for p in morph.parse(word)]
 76 | 
 77 |     def _run_is_noun():
 78 |         for word, cnt in words:
 79 |             [{'NOUN'} in p.tag for p in morph.parse(word)]
 80 | 
 81 |     def _run_is_noun2():
 82 |         for word, cnt in words:
 83 |             [p.tag.POS == 'NOUN' for p in morph.parse(word)]
 84 | 
 85 |     def _run_word_is_known():
 86 |         for x in range(10):
 87 |             for word, cnt in words:
 88 |                 morph.word_is_known(word)
 89 | 
 90 |     def _run_cyr_repr():
 91 |         for word, cnt in words:
 92 |             [p.tag.cyr_repr for p in morph.parse(word)]
 93 | 
 94 |     def _run_grammemes_cyr():
 95 |         for word, cnt in words:
 96 |             [p.tag.grammemes_cyr for p in morph.parse(word)]
 97 | 
 98 |     def _run_POS_cyr():
 99 |         for word, cnt in words:
100 |             [morph.lat2cyr(p.tag) for p in morph.parse(word)]
101 | 
102 |     def _run_lexeme():
103 |         for word, cnt in words[::5]:
104 |             [p.lexeme for p in morph.parse(word)]
105 | 
106 |     measure = functools.partial(utils.measure, repeats=repeats)
107 | 
108 |     def show_info(bench_name, func, note='', count=len(words)):
109 |         wps = measure(func, count)
110 |         logger.info("    %-50s %0.0f words/sec %s", bench_name, wps, note)
111 | 
112 | 
113 |     # === run benchmarks:
114 | 
115 |     show_info('morph.parse(w)', _run_nofreq)
116 |     show_info('morph.parse(w)', _run, '(considering word frequencies)', total_usages)
117 | 
118 |     if morph._result_type is not None:
119 |         show_info('morph.word_is_known(w)', _run_word_is_known, count=len(words)*10)
120 |         show_info("[p.normal_form for p in morph.parse(w)]", _run_normal_form)
121 |         show_info("[p.normalized for p in morph.parse(w)]", _run_normalized)
122 |         show_info("[p.lexeme for p in morph.parse(w)]", _run_lexeme, count=len(words)/5)
123 |         show_info("[{'NOUN'} in p.tag for p in morph.parse(w)]", _run_is_noun)
124 |         show_info("[p.tag.POS == 'NOUN' for p in morph.parse(w)]", _run_is_noun2)
125 |         show_info("[p.tag.cyr_repr for p in morph.parse(word)]", _run_cyr_repr)
126 |         show_info("[p.tag.grammemes_cyr for p in morph.parse(word)]", _run_grammemes_cyr)
127 |         show_info("[morph.lat2cyr(p.tag) for p in morph.parse(word)]", _run_POS_cyr)
128 | 
129 |     logger.info("")
130 | 
131 | 
def bench_all(repeats, dict_path=None):
    """
    Run every benchmark and log the total elapsed time.

    Two analyzers are built from ``dict_path``: a default one and one created
    with ``result_type=None``. The parse and tag benchmarks run on the first,
    and only the parse benchmarks on the second. ``repeats`` is forwarded to
    the individual benchmarks.
    """
    logger.debug("loading MorphAnalyzer...")
    analyzer = MorphAnalyzer(dict_path)
    plain_analyzer = MorphAnalyzer(dict_path, result_type=None)

    logger.debug("loading benchmark data...")
    words = load_words()
    usages = get_total_usages(words)
    logger.debug("Words: %d, usages: %d", len(words), usages)

    # Loading analyzers and data is deliberately outside the timed span.
    started = datetime.datetime.now()

    logger.info("\nbenchmarking MorphAnalyzer():")
    bench_parse(analyzer, words, usages, repeats)
    bench_tag(analyzer, words, usages, repeats)

    logger.info("\nbenchmarking MorphAnalyzer(result_type=None):")
    bench_parse(plain_analyzer, words, usages, repeats)

    elapsed = datetime.datetime.now() - started
    logger.info("----\nDone in %s.\n", elapsed)
155 | 


--------------------------------------------------------------------------------
/benchmarks/utils.py:
--------------------------------------------------------------------------------
 1 | import gc
 2 | import time
 3 | import timeit
 4 | 
 5 | 
 6 | def measure(func, inner_iterations=1, repeats=5):
 7 |     """
 8 |     Runs func ``repeats`` times and returns the fastest speed
 9 |     (inner loop iterations per second). Use ``inner_iterations`` to specify
10 |     the number of inner loop iterations.
11 | 
12 |     Use this function for long-running functions.
13 |     """
14 |     gc.disable()
15 |     times = []
16 |     for x in range(repeats):
17 |         start = time.time()
18 |         func()
19 |         times.append(time.time() - start)
20 | 
21 |     gc.enable()
22 |     return inner_iterations/min(times)
23 | 
24 | 
def bench(stmt, setup, op_count=1, repeats=3, runs=5):
    """
    Times ``stmt`` with ``timeit``: ``runs`` measurements are taken, each
    executing ``stmt`` ``repeats`` times after ``setup``.

    The fastest measurement is converted to a speed and returned as
    operations per second, i.e. ``op_count*repeats`` divided by the
    best time.
    """
    timer = timeit.Timer(stmt, setup)
    fastest = min(timer.timeit(repeats) for _ in range(runs))
    return op_count*repeats / fastest
39 | 
40 | 
def format_bench(name, result, description='K words/sec'):
    """
    Format one benchmark line: ``name`` right-aligned to 25 characters,
    a colon and a tab, ``result`` with three decimals, then ``description``.
    """
    template = "%25s:\t%0.3f%s"
    fields = (name, result, description)
    return template % fields
43 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # Internal variables.
 11 | PAPEROPT_a4     = -D latex_paper_size=a4
 12 | PAPEROPT_letter = -D latex_paper_size=letter
 13 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 14 | # the i18n builder cannot share the environment and doctrees with the others
 15 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 16 | 
 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
 18 | 
 19 | help:
 20 | 	@echo "Please use \`make <target>' where <target> is one of"
 21 | 	@echo "  html       to make standalone HTML files"
 22 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 23 | 	@echo "  singlehtml to make a single large HTML file"
 24 | 	@echo "  pickle     to make pickle files"
 25 | 	@echo "  json       to make JSON files"
 26 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 27 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 28 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 29 | 	@echo "  epub       to make an epub"
 30 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 31 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 32 | 	@echo "  text       to make text files"
 33 | 	@echo "  man        to make manual pages"
 34 | 	@echo "  texinfo    to make Texinfo files"
 35 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 36 | 	@echo "  gettext    to make PO message catalogs"
 37 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 38 | 	@echo "  linkcheck  to check all external links for integrity"
 39 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 40 | 
 41 | clean:
 42 | 	-rm -rf $(BUILDDIR)/*
 43 | 
 44 | html:
 45 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 46 | 	@echo
 47 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 48 | 
 49 | dirhtml:
 50 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 51 | 	@echo
 52 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 53 | 
 54 | singlehtml:
 55 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 56 | 	@echo
 57 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 58 | 
 59 | pickle:
 60 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 61 | 	@echo
 62 | 	@echo "Build finished; now you can process the pickle files."
 63 | 
 64 | json:
 65 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 66 | 	@echo
 67 | 	@echo "Build finished; now you can process the JSON files."
 68 | 
 69 | htmlhelp:
 70 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 71 | 	@echo
 72 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 73 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 74 | 
 75 | qthelp:
 76 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 77 | 	@echo
 78 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 79 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 80 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pymorphy2.qhcp"
 81 | 	@echo "To view the help file:"
 82 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pymorphy2.qhc"
 83 | 
 84 | devhelp:
 85 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
 86 | 	@echo
 87 | 	@echo "Build finished."
 88 | 	@echo "To view the help file:"
 89 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/pymorphy2"
 90 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pymorphy2"
 91 | 	@echo "# devhelp"
 92 | 
 93 | epub:
 94 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
 95 | 	@echo
 96 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
 97 | 
 98 | latex:
 99 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
100 | 	@echo
101 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
102 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
103 | 	      "(use \`make latexpdf' here to do that automatically)."
104 | 
105 | latexpdf:
106 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
107 | 	@echo "Running LaTeX files through pdflatex..."
108 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
109 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
110 | 
111 | text:
112 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
113 | 	@echo
114 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
115 | 
116 | man:
117 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
118 | 	@echo
119 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
120 | 
121 | texinfo:
122 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
123 | 	@echo
124 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
125 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
126 | 	      "(use \`make info' here to do that automatically)."
127 | 
# Build Texinfo sources, then run makeinfo on them via a sub-make.
# $(MAKE) (not a literal "make") passes flags and the jobserver down and
# uses the same make binary that is running this file.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
133 | 
134 | gettext:
135 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
136 | 	@echo
137 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
138 | 
139 | changes:
140 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
141 | 	@echo
142 | 	@echo "The overview file is in $(BUILDDIR)/changes."
143 | 
144 | linkcheck:
145 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
146 | 	@echo
147 | 	@echo "Link check complete; look for any errors in the above output " \
148 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
149 | 
150 | doctest:
151 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
152 | 	@echo "Testing of doctests in the sources finished, look at the " \
153 | 	      "results in $(BUILDDIR)/doctest/output.txt."
154 | 


--------------------------------------------------------------------------------
/docs/_static/rtfd_overrides.css:
--------------------------------------------------------------------------------
 1 | /* override table width restrictions */
 2 | /* see https://github.com/snide/sphinx_rtd_theme/issues/117 */
 3 | .wy-table-responsive table td, .wy-table-responsive table th {
 4 |     white-space: normal !important;
 5 | }
 6 | 
 7 | .wy-table-responsive {
 8 |     overflow: visible !important;
 9 | }
10 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | # pymorphy2 documentation build configuration file, created by
  2 | # sphinx-quickstart on Sun Jul 29 04:34:30 2012.
  3 | #
  4 | # This file is execfile()d with the current directory set to its containing dir.
  5 | #
  6 | # Note that not all possible configuration values are present in this
  7 | # autogenerated file.
  8 | #
  9 | # All configuration values have a default; values that are commented out
 10 | # serve to show the default.
 11 | 
 12 | import os
 13 | import sys
 14 | 
 15 | # If extensions (or modules to document with autodoc) are in another directory,
 16 | # add these directories to sys.path here. If the directory is relative to the
 17 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 18 | sys.path.insert(0, os.path.abspath(
 19 |     os.path.join(os.path.dirname(__file__), '..')
 20 | ))
 21 | 
 22 | def setup(app):
 23 |     # see https://github.com/snide/sphinx_rtd_theme/issues/117
 24 |     app.add_stylesheet("rtfd_overrides.css")
 25 | 
 26 | # -- General configuration -----------------------------------------------------
 27 | 
 28 | # If your documentation needs a minimal Sphinx version, state it here.
 29 | #needs_sphinx = '1.0'
 30 | 
 31 | # Add any Sphinx extension module names here, as strings. They can be extensions
 32 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 33 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo', 'sphinx.ext.viewcode', 'sphinx.ext.graphviz']
 34 | 
 35 | graphviz_output_format = 'svg'
 36 | 
 37 | # Add any paths that contain templates here, relative to this directory.
 38 | templates_path = ['_templates']
 39 | 
 40 | # The suffix of source filenames.
 41 | source_suffix = '.rst'
 42 | 
 43 | # The encoding of source files.
 44 | #source_encoding = 'utf-8-sig'
 45 | 
 46 | # The master toctree document.
 47 | master_doc = 'index'
 48 | 
 49 | # General information about the project.
 50 | project = 'Морфологический анализатор pymorphy2'
 51 | copyright = '2013-2020, Mikhail Korobov'
 52 | 
 53 | # The version info for the project you're documenting, acts as replacement for
 54 | # |version| and |release|, also used in various other places throughout the
 55 | # built documents.
 56 | #
 57 | # The short X.Y version.
 58 | version = 'v0.9'
 59 | # The full version, including alpha/beta/rc tags.
 60 | release = 'v0.9'
 61 | 
 62 | # The language for content autogenerated by Sphinx. Refer to documentation
 63 | # for a list of supported languages.
 64 | language = 'ru'
 65 | 
 66 | # There are two options for replacing |today|: either, you set today to some
 67 | # non-false value, then it is used:
 68 | #today = ''
 69 | # Else, today_fmt is used as the format for a strftime call.
 70 | #today_fmt = '%B %d, %Y'
 71 | 
 72 | # List of patterns, relative to source directory, that match files and
 73 | # directories to ignore when looking for source files.
 74 | exclude_patterns = ['_build']
 75 | 
 76 | # The reST default role (used for this markup: `text`) to use for all documents.
 77 | #default_role = None
 78 | 
 79 | # If true, '()' will be appended to :func: etc. cross-reference text.
 80 | #add_function_parentheses = True
 81 | 
 82 | # If true, the current module name will be prepended to all description
 83 | # unit titles (such as .. function::).
 84 | #add_module_names = True
 85 | 
 86 | # If true, sectionauthor and moduleauthor directives will be shown in the
 87 | # output. They are ignored by default.
 88 | #show_authors = False
 89 | 
 90 | # The name of the Pygments (syntax highlighting) style to use.
 91 | pygments_style = 'sphinx'
 92 | 
 93 | # A list of ignored prefixes for module index sorting.
 94 | #modindex_common_prefix = []
 95 | 
 96 | 
 97 | # -- Options for HTML output ---------------------------------------------------
 98 | 
 99 | # The theme to use for HTML and HTML Help pages.  See the documentation for
100 | # a list of builtin themes.
101 | html_theme = 'sphinx_rtd_theme'
102 | 
103 | # Theme options are theme-specific and customize the look and feel of a theme
104 | # further.  For a list of options available for each theme, see the
105 | # documentation.
106 | #html_theme_options = {}
107 | 
108 | # Add any paths that contain custom themes here, relative to this directory.
109 | #html_theme_path = []
110 | 
111 | # The name for this set of Sphinx documents.  If None, it defaults to
112 | # "<project> v<release> documentation".
113 | html_title = "Морфологический анализатор pymorphy2"
114 | 
115 | # A shorter title for the navigation bar.  Default is the same as html_title.
116 | #html_short_title = u'pymorphy2'
117 | 
118 | # The name of an image file (relative to this directory) to place at the top
119 | # of the sidebar.
120 | #html_logo = None
121 | 
122 | # The name of an image file (within the static path) to use as favicon of the
123 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
124 | # pixels large.
125 | #html_favicon = None
126 | 
127 | # Add any paths that contain custom static files (such as style sheets) here,
128 | # relative to this directory. They are copied after the builtin static files,
129 | # so a file named "default.css" will overwrite the builtin "default.css".
130 | html_static_path = ['_static']
131 | 
132 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
133 | # using the given strftime format.
134 | #html_last_updated_fmt = '%b %d, %Y'
135 | 
136 | # If true, SmartyPants will be used to convert quotes and dashes to
137 | # typographically correct entities.
138 | #html_use_smartypants = True
139 | 
140 | # Custom sidebar templates, maps document names to template names.
141 | #html_sidebars = {}
142 | 
143 | # Additional templates that should be rendered to pages, maps page names to
144 | # template names.
145 | #html_additional_pages = {}
146 | 
147 | # If false, no module index is generated.
148 | #html_domain_indices = True
149 | 
150 | # If false, no index is generated.
151 | #html_use_index = True
152 | 
153 | # If true, the index is split into individual pages for each letter.
154 | #html_split_index = False
155 | 
156 | # If true, links to the reST sources are added to the pages.
157 | #html_show_sourcelink = True
158 | 
159 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
160 | #html_show_sphinx = True
161 | 
162 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
163 | #html_show_copyright = True
164 | 
165 | # If true, an OpenSearch description file will be output, and all pages will
166 | # contain a <link> tag referring to it.  The value of this option must be the
167 | # base URL from which the finished HTML is served.
168 | #html_use_opensearch = ''
169 | 
170 | # This is the file name suffix for HTML files (e.g. ".xhtml").
171 | #html_file_suffix = None
172 | 
173 | # Output file base name for HTML help builder.
174 | htmlhelp_basename = 'pymorphy2doc'
175 | 
176 | 
177 | # -- Options for LaTeX output --------------------------------------------------
178 | 
179 | latex_elements = {
180 | # The paper size ('letterpaper' or 'a4paper').
181 | #'papersize': 'letterpaper',
182 | 
183 | # The font size ('10pt', '11pt' or '12pt').
184 | #'pointsize': '10pt',
185 | 
186 | # Additional stuff for the LaTeX preamble.
187 | #'preamble': '',
188 | }
189 | 
190 | # Grouping the document tree into LaTeX files. List of tuples
191 | # (source start file, target name, title, author, documentclass [howto/manual]).
192 | latex_documents = [
193 |   ('index', 'pymorphy2.tex', 'pymorphy2 Documentation',
194 |    'Mikhail Korobov', 'manual'),
195 | ]
196 | 
197 | # The name of an image file (relative to this directory) to place at the top of
198 | # the title page.
199 | #latex_logo = None
200 | 
201 | # For "manual" documents, if this is true, then toplevel headings are parts,
202 | # not chapters.
203 | #latex_use_parts = False
204 | 
205 | # If true, show page references after internal links.
206 | #latex_show_pagerefs = False
207 | 
208 | # If true, show URL addresses after external links.
209 | #latex_show_urls = False
210 | 
211 | # Documents to append as an appendix to all manuals.
212 | #latex_appendices = []
213 | 
214 | # If false, no module index is generated.
215 | #latex_domain_indices = True
216 | 
217 | 
218 | # -- Options for manual page output --------------------------------------------
219 | 
220 | # One entry per manual page. List of tuples
221 | # (source start file, name, description, authors, manual section).
222 | man_pages = [
223 |     ('index', 'pymorphy2', 'pymorphy2 Documentation',
224 |      ['Mikhail Korobov'], 1)
225 | ]
226 | 
227 | # If true, show URL addresses after external links.
228 | #man_show_urls = False
229 | 
230 | 
231 | # -- Options for Texinfo output ------------------------------------------------
232 | 
233 | # Grouping the document tree into Texinfo files. List of tuples
234 | # (source start file, target name, title, author,
235 | #  dir menu entry, description, category)
# The sixth field is the one-line description shown in the Texinfo dir menu.
texinfo_documents = [
  ('index', 'pymorphy2', 'pymorphy2 Documentation',
   'Mikhail Korobov', 'pymorphy2',
   'Morphological analyzer for Russian and Ukrainian languages.',
   'Miscellaneous'),
]
241 | 
242 | # Documents to append as an appendix to all manuals.
243 | #texinfo_appendices = []
244 | 
245 | # If false, no module index is generated.
246 | #texinfo_domain_indices = True
247 | 
248 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
249 | #texinfo_show_urls = 'footnote'
250 | 


--------------------------------------------------------------------------------
/docs/glossary.rst:
--------------------------------------------------------------------------------
 1 | ============
 2 | Терминология
 3 | ============
 4 | 
 5 | .. glossary::
 6 | 
 7 |     лексема
 8 |         Набор всех форм одного слова. Например, "ёж", "ежи" и "ежам"
 9 |         входят в одну лексему. [1]_
10 | 
11 |     лемма
12 |     нормальная форма слова
13 |         Каноническая форма слова (например, форма единственного числа,
14 |         именительного падежа для существительных). [2]_
15 | 
16 |     граммема
17 |         Значение какой-либо грамматической характеристики слова.
18 |         Например, "множественное число" или "деепричастие".
19 |         Множество всех граммем, характеризующих данное слово,
20 |         образует :term:`тег`.
21 | 
22 |         См. также: :ref:`grammeme-docs`.
23 | 
24 |     тег
25 |         Набор :term:`граммем <граммема>`, характеризующих данное слово.
26 |         Например, для слова "ежам" тегом может быть
27 |         ``'NOUN,anim,masc plur,datv'``.
28 | 
29 | 
30 |     парадигма
31 |     словоизменительная парадигма
32 |         Образец для склонения или спряжения; правила, согласно которым
33 |         можно получить все формы слов в :term:`лексеме <лексема>` для данного
34 |         :term:`стема <стем>`.
35 | 
36 |         В pymorphy2 для каждого слова в словаре указано, по каким парадигмам
37 |         это слово могло быть образовано; pymorphy2 также умеет предсказывать
38 |         парадигму для слов, отсутствующих в словаре.
39 | 
40 |     стем
41 |         Неизменяемая часть слова.
42 | 
43 | 
44 | .. [1]
45 | 
46 |     Часто не делается различия между леммой и лексемой, или термин
47 |     "лемма" употребляется в значении "набор форм слова". Но, похоже,
48 |     данное выше определение лексемы все же более стандартное (см., например,
49 |     `википедию`_ или `Foundations of Statistical Natural Language Processing`_),
50 |     поэтому в pymorphy2 набор всех форм слова называется именно лексемой.
51 | 
52 | .. [2]
53 | 
54 |     В pymorphy1 и в XML-словаре из OpenCorpora слово "лемма"
55 |     употребляется в значении "лексема". Чтобы не усугублять путаницу,
56 |     в pymorphy2 вместо термина "лемма" употребляется термин
57 |     "нормальная форма слова", а термин "лемма" не используется совсем.
58 | 
59 | 
60 | .. _википедию: https://ru.wikipedia.org/wiki/%D0%9B%D0%B5%D0%BA%D1%81%D0%B5%D0%BC%D0%B0_(%D0%BB%D0%B8%D0%BD%D0%B3%D0%B2%D0%B8%D1%81%D1%82%D0%B8%D0%BA%D0%B0)
61 | .. _Foundations of Statistical Natural Language Processing: https://nlp.stanford.edu/fsnlp/
62 | 
63 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | Морфологический анализатор pymorphy2
 2 | ====================================
 3 | 
 4 | pymorphy2_ написан на языке Python (работает под 2.7 и 3.5+). Он умеет:
 5 | 
 6 | 1. приводить слово к нормальной форме (например, "люди -> человек",
 7 |    или "гулял -> гулять").
 8 | 
 9 | 2. ставить слово в нужную форму. Например, ставить слово
10 |    во множественное число, менять падеж слова и т.д.
11 | 
12 | 3. возвращать грамматическую информацию о слове (число, род,
13 |    падеж, часть речи и т.д.)
14 | 
15 | При работе используется словарь OpenCorpora_; для незнакомых слов
16 | строятся гипотезы. Библиотека достаточно быстрая: в настоящий
17 | момент скорость работы - от нескольких тыс. слов/сек до > 100 тыс. слов/сек
18 | (в зависимости от выполняемой операции, интерпретатора
19 | и установленных пакетов); потребление памяти - 10...20Мб;
20 | полностью :ref:`поддерживается <char-substitutes>` буква ё.
21 | 
22 | Лицензия - MIT. Если вы используете pymorphy2 в научной работе,
23 | см. также раздел :ref:`citing`.
24 | 
25 | Содержание
26 | ----------
27 | 
28 | .. toctree::
29 |    :maxdepth: 2
30 | 
31 |    user/index
32 |    internals/index
33 |    misc/index
34 |    misc/citing
35 |    glossary
36 | 
37 | * :ref:`genindex`
38 | * :ref:`modindex`
39 | * :ref:`search`
40 | 
41 | 
42 | Исходный код - на github_. Если заметили ошибку, то пишите
43 | в `баг-трекер`_. Для обсуждения есть `гугл-группа`_; если есть какие-то
44 | вопросы - пишите туда.
45 | 
46 | 
47 | .. _github: https://github.com/kmike/pymorphy2
48 | .. _баг-трекер: https://github.com/kmike/pymorphy2/issues
49 | .. _гугл-группа: https://groups.google.com/forum/?fromgroups#!forum/pymorphy
50 | .. _pymorphy2: https://github.com/kmike/pymorphy2
51 | .. _pymorphy: https://bitbucket.org/kmike/pymorphy/
52 | .. _OpenCorpora: http://opencorpora.org
53 | 
54 | 


--------------------------------------------------------------------------------
/docs/internals/char-substitutes.rst:
--------------------------------------------------------------------------------
 1 | .. _char-substitutes:
 2 | 
 3 | Буква Ё
 4 | =======
 5 | 
 6 | Если не ударяться в крайности, то можно считать, что в русском языке
 7 | употребление буквы "ё" допустимо, но не обязательно. Это означает, что как
 8 | в исходном тексте, так и в словарях она иногда может быть, а иногда ее
 9 | может не быть.
10 | 
11 | В pymorphy2 считается, что:
12 | 
13 | * в словарях употребление буквы "ё" обязательно; "е" вместо "ё" (как и "ё"
14 |   вместо "е") - это ошибка в словаре. Иными словами, "е" и "ё" в
15 |   словарях - две совсем разные буквы.
16 | 
17 | * В текстах/словах, которые подаются на вход морфологического анализатора,
18 |   употребление буквы "ё" необязательно. Например, слово "озера" должно быть
19 |   разобрано и как "(нет) озера", и как "(глубокие) озёра".
20 | 
21 |   .. note::
22 | 
23 |     При этом входное слово "озёра" будет однозначно разобрано как
24 |     "(глубокие) озёра".
25 | 
26 | Детали реализации
27 | -----------------
28 | 
29 | "Наивный" подход - это генерация всех вариантов возможных замен "е" на "ё"
30 | во входном слове и проверка всех вариантов по словарю. В русском языке
31 | "е" - очень распространенная буква, и много слов, где "е" встречается
32 | несколько раз. Например, для слова с 3 буквами "е" нужно сгенерировать
33 | еще 7 вариантов слова - вместо 1 проверки по словарю нужно было
34 | бы выполнить 8 (+ время на генерацию вариантов слов).
35 | 
36 | При разборе pymorphy2 использует другой подход - все слова хранятся в графе,
37 | и при обходе графа кроме направлений "е" каждый раз еще пробуется
38 | направление "ё". При этом в исходном коде pymorphy2 этого обхода графа
39 | в явном виде нет, т.к. библиотеки DAWG_ и DAWG-Python_ сами умеют производить
40 | "поиск с возможными заменами".
41 | 
42 | .. _DAWG: https://github.com/kmike/DAWG
43 | .. _DAWG-Python: https://github.com/kmike/DAWG-Python
44 | 
45 | 
46 | .. note::
47 | 
48 |     По оценкам, полученным в начале разработки (которые, соответственно,
49 |     могут быть неверными для текущей версии), поддержка буквы "ё"
50 |     в pymorphy2 замедляла разбор на 10-40% (в зависимости от интерпретатора).
51 | 


--------------------------------------------------------------------------------
/docs/internals/index.rst:
--------------------------------------------------------------------------------
 1 | .. _internals:
 2 | 
 3 | =====================
 4 | Внутреннее устройство
 5 | =====================
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 2
 9 | 
10 |    dict
11 |    prediction
12 |    char-substitutes
13 | 


--------------------------------------------------------------------------------
/docs/internals/prediction.rst:
--------------------------------------------------------------------------------
  1 | 
  2 | .. _prediction:
  3 | 
  4 | Разбор несловарных слов
  5 | =======================
  6 | 
  7 | В тех случаях, когда слово не получается найти простым поиском
  8 | по словарю (с :ref:`учетом <char-substitutes>` буквы "ё"),
  9 | в дело вступают "предсказатели" - правила разбора несловарных слов.
 10 | 
 11 | .. note::
 12 | 
 13 |     Алгоритмы предсказания похожи на те, что описаны на
 14 |     `aot.ru <http://aot.ru>`_, и на те, что применяются в pymorphy1_,
 15 |     но отличаются в деталях и содержат дополнительные эвристики.
 16 | 
 17 | .. _pymorphy1: https://pymorphy.readthedocs.io/en/latest/algo.html#prediction-algo
 18 | 
 19 | 
 20 | Отсечение известных префиксов
 21 | -----------------------------
 22 | 
 23 | Во многих языках (в том числе в русском и украинском) существует
 24 | набор словообразовательных префиксов, которые можно приписать к слову,
 25 | и которые при этом не меняют то, как слово разбирается и склоняется.
 26 | 
 27 | В pymorphy2 для каждого поддерживаемого языка хранится небольшой
 28 | список таких префиксов (например, "не", "анти", "псевдо", "супер", "дву"
 29 | и т.д. для русского). Если слово начинается с одного из таких префиксов,
 30 | то pymorphy2 отсекает префикс, разбирает то, что осталось, а затем приписывает
 31 | префикс обратно.
 32 | 
 33 | Отсечение неизвестных префиксов
 34 | -------------------------------
 35 | 
 36 | Если 2 слова отличаются только тем, что к одному из них что-то приписано
 37 | спереди, то, скорее всего, и склоняться они будут одинаково. Поэтому
 38 | если разбираемое слово можно представить как ПРЕФИКС + какое-то другое слово
 39 | из словаря, то pymorphy2 считает, что слово разбирается так же,
 40 | как и это другое слово.
 41 | 
 42 | При этом должны выполняться несколько дополнительных условий:
 43 | 
 44 | * длина словарного слова должна быть не меньше 3;
 45 | * длина префикса не должна быть больше 5;
 46 | * словарное слово - это существительное, прилагательное, глагол, причастие или
 47 |   деепричастие.
 48 | 
 49 | Алгоритм такой: сначала pymorphy2 пробует считать первую букву префиксом,
 50 | потом первые две буквы, потом первые 3 буквы и т.д. до 5, и смотрит, нет ли
 51 | остатка в словаре.
 52 | 
 53 | Предсказание по концу слова
 54 | ---------------------------
 55 | 
 56 | В подходах с отсечением префиксов есть два принципиальных ограничения:
 57 | 
 58 | * разбор не должен зависеть от префикса (что неверно для
 59 |   словоизменительных префиксов "по" и "наи", которые образуют
 60 |   формы прилагательных);
 61 | * морфологический анализатор должен уметь разбирать правую часть слова
 62 |   (путем поиска по словарю или еще как-то) - правая часть слова должна
 63 |   иметь какой-то смысл сама по себе.
 64 | 
 65 | Разбор многих слов нельзя предсказать, отсекая префикс и разбирая остаток
 66 | как словарное слово. Например, хотелось бы, чтоб если в словаре было слово
 67 | "кошка", но не было "мошка" и "ошка", на основе словарного слова "кошка"
 68 | анализатор смог бы предположить, как склоняется "мошка"
 69 | (т.к. они заканчиваются одинаково).
 70 | 
 71 | Для того, чтоб предсказывать формы слов по тому, как слова заканчиваются,
 72 | при конвертации словарей pymorphy2 собирает статистику по окончаниям:
 73 | для каждого возможного окончания слова (от 1 до 5 букв) сохраняются все
 74 | возможные разборы. Другими словами, каждому возможному 1..5-буквенному
 75 | окончанию сопоставляется массив с информацией о возможных вариантах разбора
 76 | ``(частота, номер парадигмы, номер формы в парадигме)``.
 77 | 
 78 | Если для каждого "окончания" хранить все возможные варианты разбора,
 79 | то получится заведомо много лишних (очень маловероятных) правил.
 80 | Поэтому полученные разборы "очищаются":
 81 | 
 82 | * pymorphy2 сохраняет только самый частотный разбор для каждой части речи;
 83 | * разборы, принадлежащие "непродуктивным" парадигмам, удаляются
 84 |   (непродуктивными сейчас считаются парадигмы, которым соответствует
 85 |   менее ``min_paradigm_popularity=3`` лексем в словаре);
 86 | * редкие окончания удаляются (те, которые встретились только
 87 |   ``min_ending_freq=1`` раз);
 88 | * не все части речи продуктивные: например, нельзя приписать
 89 |   что-то к предлогу, чтоб получить другой предлог; все предлоги есть в словаре,
 90 |   и предсказывать незнакомые слова как предлоги неправильно - такие
 91 |   варианты предсказания отбрасываются предсказателем.
 92 | 
 93 | Результат кодируется в DAFSA. Схема хранения похожа на ту, что в основном
 94 | словаре (см. раздел :ref:`Упаковка слов <word-packing>`), только
 95 | 
 96 | * вместо самих слов хранятся все их возможные окончания;
 97 | * к номеру парадигмы и индексу формы в парадигме добавляется
 98 |   еще "продуктивность" данного правила - количество слов
 99 |   в словаре, которые имеют данное окончание и разбираются данным образом.
100 | 
101 | ::
102 | 
103 |     <конец слова> <разделитель> <продуктивность> <номер парадигмы> <номер формы в парадигме>
104 | 
105 | Разбор сводится к поиску наиболее длинной правой части разбираемого слова,
106 | которая есть в DAFSA с окончаниями.
107 | 
108 | Кроме того, для каждого словоизменительного префикса (ПО, НАИ) точно так же
109 | строится еще по одному DAFSA; если слово начинается с одного из этих префиксов,
110 | то анализатор добавляет к результату варианты предсказания, полученные поиском
111 | по соответствующему DAFSA.
112 | 
113 | .. note::
114 | 
115 |     Термин "окончание" тут употребляется в смысле "правая часть
116 |     слова определенной длины"; он не имеет отношения к "школьному"
117 |     определению; кроме того, тут он не имеет отношения к "окончаниям"
118 |     в парадигмах.
119 | 
120 | Наречия на по-
121 | --------------
122 | 
123 | Несловарные слова предсказываются как наречия, если верно следующее:
124 | 
125 | * слово начинается на "по-";
126 | * оно длиннее 5 символов;
127 | * часть слова без "по-" может быть разобрана как прилагательное единственного
128 |   числа дательного падежа.
129 | 
130 | Примеры: по-северному, по-хорошему.
131 | 
132 | 
133 | Частица, отделенная дефисом
134 | ---------------------------
135 | 
136 | Иногда удобно слова, к которым через дефис приписана частица, разбирать как
137 | единый токен. Поэтому pymorphy2 умеет разбирать токены вроде
138 | "смотри-ка" или "посмотрел-таки".
139 | 
140 | 
141 | Составные слова, записанные через дефис
142 | ---------------------------------------
143 | 
144 | pymorphy2 поддерживает составные слова из двух частей, разделенных дефисом.
145 | Для таких слов pymorphy2 сначала разбирает обе части по отдельности
146 | (они могут быть несловарными словами).
147 | 
148 | В настоящий момент поддерживается 2 способа образования таких слов:
149 | 
150 | 1. Левая часть - неизменяемая приставка/основа (например, “интернет-магазин”,
151 |    “воздушно-капельный”). В этом случае форма слова определяется второй частью.
152 |    Этот случай добавляется в возможные варианты разбора всегда.
153 | 2. Равноправные части, склоняемые вместе (например, “человек-паук”). Этот
154 |    случай добавляется в возможные варианты разбора только тогда, когда обе
155 |    части имеют совместимую форму (есть вариант разбора первой части,
156 |    который не противоречит какому-то варианту разбора второй).
157 | 
158 | .. note::
159 | 
160 |     Если слово содержит более одного дефиса (образовано более чем из двух
161 |     частей), это правило не применяется.
162 | 
163 | Инициалы
164 | --------
165 | 
166 | Однобуквенные токены в верхнем регистре pymorphy2 предсказывает как
167 | инициалы: для них возвращаются варианты разбора "имя" и "отчество",
168 | по всем родам, падежам и числам.
169 | 
170 | Сортировка результатов разбора
171 | ------------------------------
172 | 
173 | При предсказании по концу слова результаты сортируются по "продуктивности"
174 | вариантов разбора: наиболее продуктивные варианты будут первыми.
175 | 
176 | Другими словами, варианты разбора (= номера парадигм) упорядочены
177 | по частоте, с которой эти номера парадигм соответствуют данному
178 | окончанию для данной части речи - без учета частотности по корпусу.
179 | 
180 | Экспериментального подтверждения правильности этого подхода нет,
181 | но "интуиция" тут такая:
182 | 
183 | 1) нам не важно, какие слова в корпусе встречаются часто, т.к. предсказатель
184 |    работает для редких слов, и редкие слова он должен предсказывать
185 |    как редкие, а не как распространенные;
186 | 2) для "длинного хвоста" частотности в корпусе конкретные цифры имеют
187 |    не очень много значения, т.к. флуктуации очень большие,
188 |    "эффект хоббита" и т.д.;
189 | 3) с другой стороны, важно, какие парадигмы в русском
190 |    языке более продуктивные, какие порождают больше слов.
191 | 
192 | Поэтому используется частотность по парадигмам, полученная
193 | исключительно из словаря.
194 | 
195 | .. note::
196 | 
197 |     В настоящий момент результаты сортируются только при предсказании
198 |     по концу слова. Разборы для словарных слов и разборы, предсказанные
199 |     путем отсечения префикса, специальным образом сейчас не сортируются.
200 | 
201 | .. _OpenCorpora: http://opencorpora.org
202 | 


--------------------------------------------------------------------------------
/docs/internals/umlauts.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 | 
3 | См. :ref:`char-substitutes`.
4 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
  1 | @ECHO OFF
  2 | 
  3 | REM Command file for Sphinx documentation
  4 | 
  5 | if "%SPHINXBUILD%" == "" (
  6 | 	set SPHINXBUILD=sphinx-build
  7 | )
  8 | set BUILDDIR=_build
  9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
 10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
 11 | if NOT "%PAPER%" == "" (
 12 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
 13 | 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
 14 | )
 15 | 
 16 | if "%1" == "" goto help
 17 | 
 18 | if "%1" == "help" (
 19 | 	:help
 20 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
 21 | 	echo.  html       to make standalone HTML files
 22 | 	echo.  dirhtml    to make HTML files named index.html in directories
 23 | 	echo.  singlehtml to make a single large HTML file
 24 | 	echo.  pickle     to make pickle files
 25 | 	echo.  json       to make JSON files
 26 | 	echo.  htmlhelp   to make HTML files and a HTML help project
 27 | 	echo.  qthelp     to make HTML files and a qthelp project
 28 | 	echo.  devhelp    to make HTML files and a Devhelp project
 29 | 	echo.  epub       to make an epub
 30 | 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
 31 | 	echo.  text       to make text files
 32 | 	echo.  man        to make manual pages
 33 | 	echo.  texinfo    to make Texinfo files
 34 | 	echo.  gettext    to make PO message catalogs
 35 | 	echo.  changes    to make an overview over all changed/added/deprecated items
 36 | 	echo.  linkcheck  to check all external links for integrity
 37 | 	echo.  doctest    to run all doctests embedded in the documentation if enabled
 38 | 	goto end
 39 | )
 40 | 
 41 | if "%1" == "clean" (
 42 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
 43 | 	del /q /s %BUILDDIR%\*
 44 | 	goto end
 45 | )
 46 | 
 47 | if "%1" == "html" (
 48 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
 49 | 	if errorlevel 1 exit /b 1
 50 | 	echo.
 51 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
 52 | 	goto end
 53 | )
 54 | 
 55 | if "%1" == "dirhtml" (
 56 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
 57 | 	if errorlevel 1 exit /b 1
 58 | 	echo.
 59 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
 60 | 	goto end
 61 | )
 62 | 
 63 | if "%1" == "singlehtml" (
 64 | 	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
 65 | 	if errorlevel 1 exit /b 1
 66 | 	echo.
 67 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
 68 | 	goto end
 69 | )
 70 | 
 71 | if "%1" == "pickle" (
 72 | 	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
 73 | 	if errorlevel 1 exit /b 1
 74 | 	echo.
 75 | 	echo.Build finished; now you can process the pickle files.
 76 | 	goto end
 77 | )
 78 | 
 79 | if "%1" == "json" (
 80 | 	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
 81 | 	if errorlevel 1 exit /b 1
 82 | 	echo.
 83 | 	echo.Build finished; now you can process the JSON files.
 84 | 	goto end
 85 | )
 86 | 
 87 | if "%1" == "htmlhelp" (
 88 | 	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
 89 | 	if errorlevel 1 exit /b 1
 90 | 	echo.
 91 | 	echo.Build finished; now you can run HTML Help Workshop with the ^
 92 | .hhp project file in %BUILDDIR%/htmlhelp.
 93 | 	goto end
 94 | )
 95 | 
 96 | if "%1" == "qthelp" (
 97 | 	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
 98 | 	if errorlevel 1 exit /b 1
 99 | 	echo.
100 | 	echo.Build finished; now you can run "qcollectiongenerator" with the ^
101 | .qhcp project file in %BUILDDIR%/qthelp, like this:
102 | 	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pymorphy2.qhcp
103 | 	echo.To view the help file:
104 | 	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pymorphy2.qhc
105 | 	goto end
106 | )
107 | 
108 | if "%1" == "devhelp" (
109 | 	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
110 | 	if errorlevel 1 exit /b 1
111 | 	echo.
112 | 	echo.Build finished.
113 | 	goto end
114 | )
115 | 
116 | if "%1" == "epub" (
117 | 	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
118 | 	if errorlevel 1 exit /b 1
119 | 	echo.
120 | 	echo.Build finished. The epub file is in %BUILDDIR%/epub.
121 | 	goto end
122 | )
123 | 
124 | if "%1" == "latex" (
125 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
126 | 	if errorlevel 1 exit /b 1
127 | 	echo.
128 | 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
129 | 	goto end
130 | )
131 | 
132 | if "%1" == "text" (
133 | 	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
134 | 	if errorlevel 1 exit /b 1
135 | 	echo.
136 | 	echo.Build finished. The text files are in %BUILDDIR%/text.
137 | 	goto end
138 | )
139 | 
140 | if "%1" == "man" (
141 | 	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
142 | 	if errorlevel 1 exit /b 1
143 | 	echo.
144 | 	echo.Build finished. The manual pages are in %BUILDDIR%/man.
145 | 	goto end
146 | )
147 | 
148 | if "%1" == "texinfo" (
149 | 	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
150 | 	if errorlevel 1 exit /b 1
151 | 	echo.
152 | 	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
153 | 	goto end
154 | )
155 | 
156 | if "%1" == "gettext" (
157 | 	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
158 | 	if errorlevel 1 exit /b 1
159 | 	echo.
160 | 	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
161 | 	goto end
162 | )
163 | 
164 | if "%1" == "changes" (
165 | 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
166 | 	if errorlevel 1 exit /b 1
167 | 	echo.
168 | 	echo.The overview file is in %BUILDDIR%/changes.
169 | 	goto end
170 | )
171 | 
172 | if "%1" == "linkcheck" (
173 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
174 | 	if errorlevel 1 exit /b 1
175 | 	echo.
176 | 	echo.Link check complete; look for any errors in the above output ^
177 | or in %BUILDDIR%/linkcheck/output.txt.
178 | 	goto end
179 | )
180 | 
181 | if "%1" == "doctest" (
182 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
183 | 	if errorlevel 1 exit /b 1
184 | 	echo.
185 | 	echo.Testing of doctests in the sources finished, look at the ^
186 | results in %BUILDDIR%/doctest/output.txt.
187 | 	goto end
188 | )
189 | 
190 | :end
191 | 


--------------------------------------------------------------------------------
/docs/misc/2trie.rst:
--------------------------------------------------------------------------------
  1 | .. _2trie:
  2 | 
  3 | Первоначальный формат словарей (отброшенный)
  4 | ============================================
  5 | 
  6 | .. warning::
  7 | 
  8 |     Этот формат словарей в pymorphy2 не используется;
  9 |     описание - документация по менее удачной попытке
 10 |     организовать словари.
 11 | 
 12 |     Первоначальная реализация доступна в одном из первых коммитов.
 13 |     Рассматривайте описанное ниже как бесполезный на практике
 14 |     "исторический" документ.
 15 | 
 16 | 
 17 | В `публикации`_ по mystem_ был описан способ упаковки словарей с использованием
 18 | 2 trie для "стемов" и "окончаний". В первом прототипе pymorphy2 был
 19 | реализован схожий способ; впоследствии я заменил его на другой.
 20 | 
 21 | Этот первоначальный формат словарей в моей реализации обеспечивал
 22 | скорость разбора порядка 20-60 тыс слов/сек (без предсказателя) при потреблении
 23 | памяти 30М (с использованием datrie_), или порядка 2-5 тыс слов/сек
 24 | при потреблении памяти 5M (с использованием marisa-trie_).
 25 | 
 26 | Идея была в том, что слово просматривается с конца, при этом в первом trie
 27 | ищутся возможные варианты разбора для данных окончаний; затем для всех
 28 | найденных вариантов окончаний "начала" слов ищутся во втором trie;
 29 | в результате возвращаются те варианты, где для "начала" и "конца" есть
 30 | общие способы разбора.
 31 | 
 32 | Основной "затык" в производительности был в том, что для каждого
 33 | слова требовалось искать общие для начала и конца номера парадигм.
 34 | Это задача о пересечении 2 множеств, для которой мне не удалось найти
 35 | красивого решения. Питоний set использовать было нельзя, т.к. это требовало
 36 | очень много памяти.
 37 | 
 38 | Лучшее, что получалось - id парадигм хранились в 2 отсортированных
 39 | массивах, а их пересечение находилось итерацией по более короткому массиву
 40 | и "сужающимся" двоичным поиском по более длинному (параллельная итерация по
 41 | обоим массивам на конкретных данных оказывалась всегда медленнее).
 42 | 
 43 | В pymorphy2 я в итоге решил использовать другой
 44 | :ref:`формат словарей <dictionary>`, т.к.
 45 | 
 46 | * другой формат проще;
 47 | * алгоритмы работы получаются проще;
 48 | * скорость разбора получается больше (порядка 100-200 тыс слов/сек без
 49 |   предсказателя) при меньшем потреблении памяти (порядка 15M).
 50 | 
 51 | Но при этом первоначальный формат потенциально позволяет
 52 | тратить еще меньше памяти; некоторые способы ускорения работы
 53 | с ним еще не были опробованы.
 54 | 
 55 | Уменьшение размера массивов, как мне кажется, - наиболее перспективный тут
 56 | способ ускорения. Для уменьшения размеров сравниваемых массивов требуется
 57 | уменьшить количество парадигм (например, "вырожденных" с пустым стемом).
 58 | 
 59 | .. _публикации: https://download.yandex.ru/company/iseg-las-vegas.pdf
 60 | .. _marisa-trie: https://github.com/kmike/marisa-trie
 61 | 
 62 | 
 63 | Выделение парадигм
 64 | ------------------
 65 | 
 66 | Изначально в словаре из OpenCorpora нет понятия "парадигмы" слова.
 67 | Парадигма - это таблица форм какого-либо слова, образец для склонения
 68 | или спряжения.
 69 | 
 70 | В pymorphy2 выделенные явным образом парадигмы слов необходимы для того,
 71 | чтоб склонять неизвестные слова - т.к. при этом нужны образцы для склонения.
 72 | 
 73 | Пример исходной леммы::
 74 | 
 75 |     375080
 76 |     ЧЕЛОВЕКОЛЮБИВ   100
 77 |     ЧЕЛОВЕКОЛЮБИВА  102
 78 |     ЧЕЛОВЕКОЛЮБИВО  105
 79 |     ЧЕЛОВЕКОЛЮБИВЫ  110
 80 | 
 81 | Парадигма (пусть будет номер 12345)::
 82 | 
 83 |     ""      100
 84 |     "А"     102
 85 |     "О"     105
 86 |     "Ы"     110
 87 | 
 88 | Вся лемма при этом "сворачивается" в "стем" и номер парадигмы::
 89 | 
 90 |     "ЧЕЛОВЕКОЛЮБИ" 12345
 91 | 
 92 | .. note::
 93 | 
 94 |     Для одного "стема" может быть несколько допустимых парадигм.
 95 | 
 96 | Прилагательные на ПО-
 97 | ^^^^^^^^^^^^^^^^^^^^^
 98 | 
 99 | В словарях у большинства сравнительных прилагательных есть формы на ПО-::
100 | 
101 |     375081
102 |     ЧЕЛОВЕКОЛЮБИВЕЕ COMP,Qual V-ej
103 |     ПОЧЕЛОВЕКОЛЮБИВЕЕ       COMP,Qual Cmp2
104 |     ПОЧЕЛОВЕКОЛЮБИВЕЙ       COMP,Qual Cmp2,V-ej
105 | 
106 | Можно заметить, что в этом случае форма слова определяется не только тем,
107 | как слово заканчивается, но и тем, как слово начинается. Алгоритм с разбиением
108 | на "стем" и "окончание" приведет к тому, что все слово целиком будет считаться
109 | окончанием, а => каждое сравнительное прилагательное породит еще одну
110 | парадигму. Это увеличивает общее количество парадигм в несколько раз и делает
111 | невозможным склонение несловарных сравнительных прилагательных, поэтому
112 | в pymorphy2 парадигма определяется как "окончание", "номер грам. информации"
113 | и "префикс".
114 | 
115 | Пример парадигмы для "ЧЕЛОВЕКОЛЮБИВ"::
116 | 
117 |     ""      100     ""
118 |     "А"     102     ""
119 |     "О"     105     ""
120 |     "Ы"     110     ""
121 | 
122 | Пример парадигмы для "ЧЕЛОВЕКОЛЮБИВЕЕ"::
123 | 
124 |     ""      555     ""
125 |     ""      556     "ПО"
126 |     ""      557     "ПО"
127 | 
128 | .. note::
129 | 
130 |     Сейчас обрабатывается единственный префикс - "ПО". В словарях, похоже,
131 |     нет других префиксов, присущих только отдельным формам слова в пределах
132 |     одной леммы.
133 | 
134 | Упаковка "стемов"
135 | -----------------
136 | 
137 | "Стемы" - строки, основы лемм. Для их хранения используется структура данных
138 | trie_ (с использованием библиотеки datrie_), что позволяет снизить
139 | потребление оперативной памяти (т.к. некоторые общие части слов не дублируются)
140 | и повысить скорость работы (т.к. в trie некоторые операции - например,
141 | поиск всех префиксов данной строки - можно выполнять значительно быстрее,
142 | чем в хэш-таблице).
143 | 
144 | .. _trie: https://en.wikipedia.org/wiki/Trie
145 | .. _datrie: https://github.com/pytries/datrie
146 | 
147 | Ключами в trie являются стемы (перевернутые), значениями - список с номерами
148 | допустимых парадигм.
149 | 
150 | Упаковка tuple/list/set
151 | -----------------------
152 | 
153 | Для каждого стема требуется хранить множество id парадигм; обычно это
154 | множества из небольшого числа int-элементов. В питоне накладные расходы на
155 | set() довольно велики::
156 | 
157 |     >>> import sys
158 |     >>> sys.getsizeof({})
159 |     280
160 | 
161 | Если для каждого стема создать даже по одному пустому экземпляру set,
162 | это уже займет порядка 80М памяти. Поэтому set() не используется;
163 | сначала я заменил их на tuple с отсортированными элементами. В таких tuple
164 | можно искать пересечения за O(N+M) через однопроходный алгоритм,
165 | аналогичный сортировке слиянием, или за O(N*log(M)) через двоичный поиск.
166 | 
167 | Но накладные расходы на создание сотен тысяч tuple с числами тоже велики,
168 | поэтому в pymorphy2 они упаковываются в одномерный массив чисел
169 | (``array.array``).
170 | 
171 | Пусть у нас есть такая структура::
172 | 
173 |     (
174 |         (10, 20, 30),       # 0й элемент
175 |         (20, 40),           # 1й элемент
176 |     )
177 | 
178 | Она упакуется в такой массив::
179 | 
180 |     array.array([3, 10, 20, 30, 2, 20, 40])
181 | 
182 | Сначала указывается длина данных, затем идут сами данные, потом опять длина
183 | и опять данные, и т.д. Для доступа везде вместо старых индексов
184 | (0й элемент, 1й элемент) используются новые: 0й элемент, 4й элемент.
185 | Чтоб получить исходные цифры, нужно залезть в массив по новому индексу,
186 | получить длину N, и взять следующие N элементов.
187 | 
188 | Итоговый формат данных
189 | ----------------------
190 | 
191 | Таблица с грам. информацией
192 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^
193 | 
194 | ::
195 | 
196 |     ['tag1', 'tag2', ...]
197 | 
198 | ``tag<N>`` - набор грам. тегов, например ``NOUN,anim,masc sing,nomn``.
199 | 
200 | Этот массив занимает где-то 0.5M памяти.
201 | 
202 | Парадигмы
203 | ^^^^^^^^^
204 | 
205 | ::
206 | 
207 |     [
208 |         (
209 |             (suffix1, tag_index1, prefix1),
210 |             (suffix2, tag_index2, prefix2),
211 |             ...
212 |         ),
213 |         (
214 |             ...
215 |     ]
216 | 
217 | 
218 | ``suffix<N>`` и ``prefix<N>`` - это строки с окончанием и префиксом
219 | (например, ``"ЫЙ"`` и ``""``); ``tag_index<N>`` - индекс в таблице
220 | с грам. информацией.
221 | 
222 | Парадигмы занимают примерно 7-8M памяти.
223 | 
224 | .. note::
225 | 
226 |     tuple в парадигмах сейчас не упакованы в линейные структуры;
227 |     упаковка должна уменьшить потребление памяти примерно на 3M.
228 | 
229 | 
230 | Стемы
231 | ^^^^^
232 | 
233 | Стемы хранятся в 2 структурах:
234 | 
235 | * ``array.array`` с упакованными множествами номеров возможных парадигм
236 |   для данного стема::
237 | 
238 |        [length0, para_id0, para_id1, ..., length1, para_id0, para_id1, ...]
239 | 
240 | * и trie с ключами-строками и значениями-индексами в массиве значений::
241 | 
242 |        datrie.BaseTrie(
243 |            'stem1': index1,
244 |            'stem2': index2,
245 |            ...
246 |        )
247 | 
248 | "Окончания"
249 | ^^^^^^^^^^^
250 | 
251 | Для каждого "окончания" хранится, в каких парадигмах на каких позициях
252 | оно встречается. Эта информация требуется для быстрого поиска нужного слова
253 | "с конца". Для этого используются 3 структуры:
254 | 
255 | * ``array.array`` с упакованными множествами номеров возможных парадигм
256 |   для данного окончания::
257 | 
258 |        [length0, para_id0, para_id1, ..., length1, para_id0, para_id1, ...]
259 | 
260 |   В отличие от аналогичного множества для стемов, номера парадигм могут
261 |   повторяться в пределах окончания.
262 | 
263 | * ``array.array`` с упакованными множествами индексов в пределах парадигмы::
264 | 
265 |        [length0, index0, index1, ..., length1, index0, index1, ...]
266 | 
267 |   Этот массив работает "вместе" с предыдущим, каждому элементу отсюда
268 |   соответствует элемент оттуда - совместно они предоставляют информацию
269 |   о возможных номерах форм в парадигме для всех окончаний.
270 | 
271 | * trie с ключами-строками и значениями-индексами::
272 | 
273 |        datrie.BaseTrie(
274 |            'suff1': index1,
275 |            'suff2': index2,
276 |            ...
277 |        )
278 | 
279 |   По индексу ``index<N>`` можно из предыдущих двух массивов получить наборы
280 |   форм для данного окончания.
281 | 
282 | .. note::
283 | 
284 |     Длины хранятся 2 раза. Может, это можно как-то улучшить?
285 | 
286 | .. _mystem: https://company.yandex.ru/technologies/mystem/
287 | .. _pymorphy 0.5.6: https://pymorphy.readthedocs.io/en/v0.5.6/index.html
288 | 


--------------------------------------------------------------------------------
/docs/misc/_authors.rst:
--------------------------------------------------------------------------------
1 | 
2 | .. include:: ../../AUTHORS.rst
3 | 


--------------------------------------------------------------------------------
/docs/misc/_changes.rst:
--------------------------------------------------------------------------------
1 | 
2 | .. include:: ../../CHANGES.rst


--------------------------------------------------------------------------------
/docs/misc/api_reference.rst:
--------------------------------------------------------------------------------
 1 | API Reference (auto-generated)
 2 | ==============================
 3 | 
 4 | Morphological Analyzer
 5 | ----------------------
 6 | 
 7 | .. automodule:: pymorphy2.analyzer
 8 |     :members:
 9 |     :undoc-members:
10 | 
11 | Analyzer units
12 | ~~~~~~~~~~~~~~
13 | 
14 | .. automodule:: pymorphy2.units.by_lookup
15 |     :members:
16 | 
17 | .. automodule:: pymorphy2.units.by_analogy
18 |     :members:
19 | 
20 | .. automodule:: pymorphy2.units.by_hyphen
21 |     :members:
22 | 
23 | .. automodule:: pymorphy2.units.by_shape
24 |     :members:
25 | 
26 | Tagset
27 | ------
28 | 
29 | .. automodule:: pymorphy2.tagset
30 |     :members: OpencorporaTag
31 | 
32 | .. _cli:
33 | 
34 | Command-Line Interface
35 | ----------------------
36 | 
37 | .. automodule:: pymorphy2.cli
38 | 
39 | Utilities for OpenCorpora Dictionaries
40 | --------------------------------------
41 | 
42 | .. automodule:: pymorphy2.opencorpora_dict.wrapper
43 |     :members:
44 | 
45 | Various Utilities
46 | -----------------
47 | 
48 | .. automodule:: pymorphy2.tokenizers
49 |     :members:
50 | 
51 | .. automodule:: pymorphy2.shapes
52 |     :members:
53 | 
54 | .. automodule:: pymorphy2.utils
55 |     :members:
56 | 


--------------------------------------------------------------------------------
/docs/misc/citing.rst:
--------------------------------------------------------------------------------
 1 | .. _citing:
 2 | 
 3 | Цитирование
 4 | ===========
 5 | 
 6 | Если вы использовали pymorphy2 в научных целях,
 7 | то будет хорошо, если процитируете следующую
 8 | `статью <https://link.springer.com/chapter/10.1007%2F978-3-319-26123-2_31>`_:
 9 | 
10 | ::
11 | 
12 |     Korobov M.: Morphological Analyzer and Generator for Russian and
13 |     Ukrainian Languages // Analysis of Images, Social Networks and Texts,
14 |     pp 320-332 (2015).
15 | 
16 | Это не обязательно, но автору будет приятно.
17 | 
18 | BibTeX::
19 | 
20 |    @incollection{korobov2015morphological,
21 |       year={2015},
22 |       isbn={978-3-319-26122-5},
23 |       booktitle={Analysis of Images, Social Networks and Texts},
24 |       volume={542},
25 |       series={Communications in Computer and Information Science},
26 |       editor={Khachay, Mikhail Yu. and Konstantinova, Natalia and Panchenko, Alexander and Ignatov, Dmitry I. and Labunets, Valeri G.},
27 |       doi={10.1007/978-3-319-26123-2_31},
28 |       title={Morphological Analyzer and Generator for Russian and Ukrainian Languages},
29 |       url={http://dx.doi.org/10.1007/978-3-319-26123-2_31},
30 |       publisher={Springer International Publishing},
31 |       keywords={Morphological analyzer; Russian; Ukrainian; Morphological generator; Open source; OpenCorpora; LanguageTool; pymorphy2; pymorphy},
32 |       author={Korobov, Mikhail},
33 |       pages={320-332},
34 |       language={English}
35 |    }
36 | 
37 | Препринт статьи доступен для скачивания
38 | на arxiv (`pdf <https://arxiv.org/pdf/1503.07283v1.pdf>`_).
39 | 


--------------------------------------------------------------------------------
/docs/misc/index.rst:
--------------------------------------------------------------------------------
 1 | .. _misc:
 2 | 
 3 | ======
 4 | Разное
 5 | ======
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 2
 9 | 
10 |    _changes
11 |    _authors
12 |    api_reference
13 |    2trie
14 | 


--------------------------------------------------------------------------------
/docs/user/contributing.rst:
--------------------------------------------------------------------------------
 1 | ================================
 2 | Как принять участие в разработке
 3 | ================================
 4 | 
 5 | Общая информация
 6 | ================
 7 | 
 8 | Исходный код pymorphy2 распространяется по лицензии MIT и доступен на github:
 9 | https://github.com/kmike/pymorphy2
10 | 
11 | Баг-трекер: https://github.com/kmike/pymorphy2/issues.
12 | Для общения можно использовать `гугл-группу`_ (есть какие-то идеи,
13 | предложения, замечания - пишите).
14 | 
15 | Если вы хотите улучшить код pymorphy2 - может быть
16 | полезным ознакомиться с разделом :ref:`internals`.
17 | 
18 | pymorphy2 работает под Python 2.x и 3.x без использования
19 | утилиты 2to3; написание такого кода, по опыту, оказывается не сложнее
20 | написания кода просто под 2.x, но поначалу требует некоторой внимательности
21 | и осторожности. Пожалуйста, пишите и запускайте тесты,
22 | если что-то меняете.
23 | 
24 | Улучшать можно не только код - улучшения в документации, идеи и
25 | сообщения об ошибках тоже очень ценны.
26 | 
27 | Словари
28 | =======
29 | 
30 | Поддержка русского языка в pymorphy2 основывается на словарях из OpenCorpora_
31 | и использует наборы текстов оттуда для автоматического тестирования
32 | и замеров скорости; в будущем планируется также использовать размеченный
33 | корпус для снятия неоднозначности разбора, ну и в целом это классный проект.
34 | Любая помощь OpenCorpora_ - это вклад и в pymorphy2.
35 | 
36 | Экспериментальный украинский словарь корнями уходит в проект LanguageTool_;
37 | отдельно он доступен тут: https://github.com/arysin/dict_uk, скрипты
38 | для преобразования в формат OpenCorpora - тут:
39 | https://github.com/dchaplinsky/LT2OpenCorpora.
40 | 
41 | Все словари преобразуются в формат pymorphy2 скриптами отсюда:
42 | https://github.com/kmike/pymorphy2-dicts
43 | 
44 | .. _LanguageTool: https://languagetool.org/
45 | .. _OpenCorpora: http://opencorpora.org
46 | .. _баг-трекер: https://github.com/kmike/pymorphy2/issues
47 | .. _гугл-группу: https://groups.google.com/forum/?fromgroups#!forum/pymorphy
48 | 
49 | .. _testing:
50 | 
51 | Тестирование
52 | ============
53 | 
54 | Тесты лежат в папке :file:`tests`. При написании тестов используется pytest_.
55 | Для их запуска используется утилита tox_, которая позволяет выполнять
56 | тесты для нескольких интерпретаторов питона.
57 | 
58 | Для запуска тестов установите tox_ через pip::
59 | 
60 |     pip install tox
61 | 
62 | и выполните
63 | 
64 | ::
65 | 
66 |     tox
67 | 
68 | из папки с исходным кодом.
69 | 
70 | .. _tox: https://tox.readthedocs.io/en/latest/
71 | .. _pytest: https://pytest.org
72 | 
73 | .. _benchmarking:
74 | 
75 | Замеры скорости работы
76 | ======================
77 | 
78 | Код для бенчмарков лежит в папке :file:`benchmarks`. Для запуска тестов
79 | производительности выполните
80 | 
81 | ::
82 | 
83 |     tox -c bench.ini
84 | 
85 | из папки с исходным кодом pymorphy2.
86 | 


--------------------------------------------------------------------------------
/docs/user/grammemes.rst:
--------------------------------------------------------------------------------
  1 | 
  2 | .. _grammeme-docs:
  3 | 
  4 | Обозначения для граммем (русский язык)
  5 | ======================================
  6 | 
  7 | В pymorphy2 для русского языка используются словари OpenCorpora
  8 | и :term:`граммемы <граммема>`, принятые в OpenCorpora (с небольшими изменениями).
  9 | 
 10 | Полный список граммем OpenCorpora доступен тут: http://opencorpora.org/dict.php?act=gram
 11 | 
 12 | .. _russian-POS:
 13 | 
 14 | Часть речи
 15 | ----------
 16 | 
 17 | ==========   =============================     =================================
 18 | Граммема     Значение                          Примеры
 19 | ==========   =============================     =================================
 20 | NOUN         имя существительное               хомяк
 21 | ADJF         имя прилагательное (полное)       хороший
 22 | ADJS         имя прилагательное (краткое)      хорош
 23 | COMP         компаратив                        лучше, получше, выше
 24 | VERB         глагол (личная форма)             говорю, говорит, говорил
 25 | INFN         глагол (инфинитив)                говорить, сказать
 26 | PRTF         причастие (полное)                прочитавший, прочитанная
 27 | PRTS         причастие (краткое)               прочитана
 28 | GRND         деепричастие                      прочитав, рассказывая
 29 | NUMR         числительное                      три, пятьдесят
 30 | ADVB         наречие                           круто
 31 | NPRO         местоимение-существительное       он
 32 | PRED         предикатив                        некогда
 33 | PREP         предлог                           в
 34 | CONJ         союз                              и
 35 | PRCL         частица                           бы, же, лишь
 36 | INTJ         междометие                        ой
 37 | ==========   =============================     =================================
 38 | 
 39 | Часть речи можно получить через атрибут POS::
 40 | 
 41 |     >>> p = morph.parse('идти')[0]
 42 |     >>> p.tag.POS
 43 |     'INFN'
 44 | 
 45 | .. _russian-cases:
 46 | 
 47 | Падеж
 48 | -----
 49 | 
 50 | ========   ===================    ===========================    ================================
 51 | Граммема   Значение               Пояснение                      Примеры
 52 | ========   ===================    ===========================    ================================
 53 | nomn       именительный           Кто? Что?                      **хомяк** ест
 54 | gent       родительный            Кого? Чего?                    у нас нет **хомяка**
 55 | datv       дательный              Кому? Чему?                    сказать **хомяку** спасибо
 56 | accs       винительный            Кого? Что?                     хомяк читает **книгу**
 57 | ablt       творительный           Кем? Чем?                      зерно съедено **хомяком**
 58 | loct       предложный             О ком? О чём? и т.п.           хомяка несут в **корзинке**
 59 | voct       звательный             Его формы используются         **Саш**, пойдем в кино.
 60 |                                   при обращении к человеку.
 61 | gen2       второй родительный                                    ложка **сахару**
 62 |            (частичный)                                           *(gent - производство сахара)*;
 63 |                                                                  стакан **яду**
 64 |                                                                  *(gent - нет яда)*
 65 | acc2       второй винительный                                    записался в **солдаты**
 66 | loc2       второй предложный                                     я у него в **долгу**
 67 |            (местный)                                             *(loct - напоминать о долге)*;
 68 |                                                                  висит в **шкафу**
 69 |                                                                  *(loct - монолог о шкафе)*;
 70 |                                                                  весь в **снегу**
 71 |                                                                  *(loct - писать о снеге)*
 72 | ========   ===================    ===========================    ================================
 73 | 
 74 | Падеж выделяется у существительных, полных прилагательных, полных причастий,
 75 | числительных и местоимений. Получить его можно через атрибут case::
 76 | 
 77 |     >>> p = morph.parse('хомяку')[0]
 78 |     >>> p.tag.case
 79 |     'datv'
 80 | 
 81 | .. note::
 82 | 
 83 |     В OpenCorpora (на июль 2013) есть еще падежи gen1 и loc1. Они указываются
 84 |     вместо gent/loct, когда у слова есть форма gen2/loc2. В pymorphy2 gen1 и
 85 |     loc1 заменены на gent/loct, чтоб с ними было проще работать.
 86 | 
 87 | .. _russian-numbers:
 88 | 
 89 | Число
 90 | -----
 91 | 
 92 | ==========   =============================     =================================
 93 | Граммема     Значение                          Примеры
 94 | ==========   =============================     =================================
 95 | sing         единственное число                хомяк, говорит
 96 | plur         множественное число               хомяки, говорят
 97 | ==========   =============================     =================================
 98 | 
 99 | ::
100 | 
101 |     >>> p = morph.parse('люди')[0]
102 |     >>> p.tag.number
103 |     'plur'
104 | 
105 | 
106 | Некоторые имена существительные употребляются только во множественном числе;
107 | им проставлена пометка Pltm ("Pluralia tantum")::
108 | 
109 |     >>> morph.parse('дрова')[0].tag
110 |     OpencorporaTag('NOUN,inan,GNdr,Pltm plur,accs')
111 | 
112 | Существуют также существительные, употребляемые только в единственном числе;
113 | им проставлена пометка Sgtm ("Singularia tantum")::
114 | 
115 |     >>> morph.parse('молоко')[0].tag
116 |     OpencorporaTag('NOUN,inan,neut,Sgtm sing,nomn')
117 | 
118 | Ни Sgtm, ни Pltm не являются значениями p.tag.number.
119 | 
120 | .. _russian-genders:
121 | 
122 | Род
123 | ---
124 | 
125 | ==========   =============================     =================================
126 | Граммема     Значение                          Примеры
127 | ==========   =============================     =================================
128 | masc         мужской род                       хомяк, говорил
129 | femn         женский род                       хомячиха, говорила
130 | neut         средний род                       зерно, говорило
131 | ==========   =============================     =================================
132 | 
133 | ::
134 | 
135 |     >>> p = morph.parse('зерно')[0]
136 |     >>> p.tag.gender
137 |     'neut'
138 | 
139 | В русском языке существует понятие "общего рода"; некоторые слова
140 | могут употребляться применительно к людям мужского или женского пола:
141 | "он бедный сирота", "она бедная сирота". Таким словам проставлена пометка
142 | ``Ms-f``::
143 | 
144 |     >>> p = morph.parse('сирота')[0]
145 |     >>> 'Ms-f' in p.tag
146 |     True
147 | 
148 | Существуют также существительные, у которых род не выражен; им проставлена
149 | пометка ``GNdr``. Ни ``Ms-f``, ни ``GNdr`` не является значением p.tag.gender.
150 | 
151 | .. _non-standard-grammemes:
152 | 
153 | Нестандартные граммемы
154 | ----------------------
155 | 
156 | В pymorphy2 используются некоторые граммемы, отсутствующие
157 | в словаре OpenCorpora:
158 | 
159 | ========  ===================================================================
160 | Граммема  Значение
161 | ========  ===================================================================
162 | LATN      Токен состоит из латинских букв (например, "foo-bar" или "Maßstab")
163 | PNCT      Пунктуация (например, ``,`` или ``!?`` или ``…``)
164 | NUMB      Число (например, "204" или "3.14")
165 | intg      целое число (например, "204")
166 | real      вещественное число (например, "3.14")
167 | ROMN      Римское число (например, XI)
168 | UNKN      Токен не удалось разобрать
169 | ========  ===================================================================
170 | 
171 | Пример::
172 | 
173 |     >>> p = morph.parse('...')[0]
174 |     >>> p.tag
175 |     OpencorporaTag('PNCT')
176 | 


--------------------------------------------------------------------------------
/docs/user/index.rst:
--------------------------------------------------------------------------------
 1 | ============
 2 | Документация
 3 | ============
 4 | 
 5 | .. important:: в примерах используется синтаксис Python 3.
 6 | 
 7 | 
 8 | .. toctree::
 9 |    :maxdepth: 2
10 | 
11 |    guide
12 |    grammemes
13 |    contributing
14 | 


--------------------------------------------------------------------------------
/pymorphy2/__init__.py:
--------------------------------------------------------------------------------
1 | from .analyzer import MorphAnalyzer
2 | from .version import __version__
3 | 


--------------------------------------------------------------------------------
/pymorphy2/cli.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import operator
  3 | import sys
  4 | import time
  5 | from functools import lru_cache
  6 | 
  7 | import pymorphy2
  8 | from pymorphy2.tokenizers import simple_word_tokenize
  9 | from pymorphy2.utils import get_mem_usage
 10 | 
# Hacks are here to make docstring compatible with both
# docopt and sphinx.ext.autodoc.
#
# The module docstring below marks its sections with "::" (reStructuredText
# literal-block syntax). ``DOC`` - the text actually passed to docopt in
# ``main`` - is built by prepending ``head`` and replacing every "::\n" with
# ":"; note that this also removes the newline that followed the "::".
head = """

Pymorphy2 is a morphological analyzer / inflection engine for Russian language.
"""
__doc__ = """
Usage::

    pymorphy parse [options] [<input>]
    pymorphy dict meta [--lang <lang> | --dict <path>]
    pymorphy dict mem_usage [--lang <lang> | --dict <path>] [--verbose]
    pymorphy -h | --help
    pymorphy --version

Options::

    -l --lemmatize      Include normal forms (lemmas)
    -s --score          Include non-contextual P(tag|word) scores
    -t --tag            Include tags
    --thresh <NUM>      Drop all results with estimated P(tag|word) less
                        than a threshold [default: 0.0]
    --tokenized         Assume that input text is already tokenized:
                        one token per line.
    -c --cache <SIZE>   Cache size, in entries. Set it to 0 to disable
                        cache; use 'unlim' value for unlimited cache
                        size [default: 20000]
    --lang <lang>       Language to use. Allowed values: ru, uk [default: ru]
    --dict <path>       Dictionary folder path
    -v --verbose        Be more verbose
    -h --help           Show this help

"""
# Text handed to docopt: ``head`` followed by the docstring with "::\n" -> ":".
DOC = head + __doc__.replace('::\n', ':')
 45 | 
 46 | # TODO:
 47 | #   -i --inline         Don't start each output result with a new line
 48 | #   --format <format>   Result format. Allowed values: text, json [default: text]
 49 | #   --nonlex            Parse non-lexical tokens
 50 | 
 51 | 
 52 | logger = logging.getLogger('pymorphy2')
 53 | 
 54 | 
 55 | # ============================== Entry point ============================
 56 | def main(argv=None):
 57 |     """
 58 |     Pymorphy CLI interface dispatcher.
 59 |     """
 60 | 
 61 |     from docopt import docopt
 62 |     args = docopt(DOC, argv, version=pymorphy2.__version__)
 63 | 
 64 |     path = args['--dict']
 65 |     lang = args['--lang']
 66 | 
 67 |     if args['parse']:
 68 |         morph = pymorphy2.MorphAnalyzer(path=path, lang=lang)
 69 |         in_file = _open_for_read(args['<input>'])
 70 | 
 71 |         if any([args['--score'], args['--lemmatize'], args['--tag']]):
 72 |             score, lemmatize, tag = args['--score'], args['--lemmatize'], args['--tag']
 73 |         else:
 74 |             score, lemmatize, tag = True, True, True
 75 | 
 76 |         out_file = sys.stdout
 77 | 
 78 |         return parse(
 79 |             morph=morph,
 80 |             in_file=in_file,
 81 |             out_file=out_file,
 82 |             tokenize=not args['--tokenized'],
 83 |             score=score,
 84 |             normal_form=lemmatize,
 85 |             tag=tag,
 86 |             newlines=True,  # not args['--inline'],
 87 |             cache_size=args['--cache'],
 88 |             thresh=float(args['--thresh']),
 89 |         )
 90 | 
 91 |     if args['dict']:
 92 |         logger.addHandler(logging.StreamHandler())
 93 |         logger.setLevel(logging.DEBUG if args['--verbose'] else logging.INFO)
 94 |         logger.debug(args)
 95 | 
 96 |         if args['mem_usage']:
 97 |             return show_dict_mem_usage(lang, path)
 98 |         elif args['meta']:
 99 |             return show_dict_meta(lang, path)
100 | 
101 | 
102 | def _open_for_read(fn):
103 |     """ Open a file for reading """
104 |     if fn in ['-', '', None]:
105 |         return sys.stdin
106 | 
107 |     return open(fn, 'rt', encoding='utf8')
108 | 
109 | 
110 | # ============================ Commands ===========================
111 | 
def show_dict_mem_usage(lang, dict_path=None):
    """
    Log how much memory and time loading a dictionary takes.

    Memory usage (via ``get_mem_usage``) and wall-clock time are sampled
    before and after a ``MorphAnalyzer`` is created; the differences are
    reported through the module logger at INFO level.
    """
    mem_before = get_mem_usage()
    started_at = time.time()

    # The analyzer is bound to a name so that it is still referenced when
    # the second memory sample is taken.
    morph = pymorphy2.MorphAnalyzer(path=dict_path, lang=lang)

    finished_at = time.time()
    mem_after = get_mem_usage()

    megabyte = 1024*1024
    logger.info(
        'Memory usage: %0.1fM dictionary, %0.1fM total (load time %0.2fs)',
        (mem_after-mem_before)/megabyte,
        mem_after/megabyte,
        finished_at-started_at,
    )
128 | 
129 | 
def show_dict_meta(lang, dict_path=None):
    """
    Log every ``key: value`` pair of the dictionary metadata
    (``morph.dictionary.meta``) at INFO level.
    """
    analyzer = pymorphy2.MorphAnalyzer(path=dict_path, lang=lang)
    meta = analyzer.dictionary.meta

    for name, content in meta.items():
        logger.info("%s: %s", name, content)
135 | 
136 | 
def parse(morph, in_file, out_file, tokenize, score, normal_form, tag,
          newlines, cache_size, thresh):
    """
    Read tokens from ``in_file``, analyze them and write the formatted
    results to ``out_file``. Both files must support unicode.

    * `tokenize` - if False, the input is assumed to be tokenized already
      (one token per line); otherwise each line is tokenized.
    * `score` - include scores in the output.
    * `normal_form` - include normal forms in the output.
    * `tag` - include tags in the output.
    * `newlines` - write each result on a new line.
    * `cache_size` - maximum number of entries in the internal cache;
      ``'unlim'`` means no limit, a zero value disables caching.
    * `thresh` - minimum allowed parse score.

    """
    if tokenize:
        token_source = _iter_tokens_tokenize
    else:
        token_source = _iter_tokens_notokenize

    formatter = _TokenParserFormatter(
        morph=morph,
        score=score,
        normal_form=normal_form,
        tag=tag,
        newlines=newlines,
        thresh=thresh,
    )

    # Optionally memoize the per-token formatter.
    render = formatter.parse
    if cache_size == 'unlim':
        render = lru_cache(None)(render)
    else:
        max_entries = int(cache_size)
        if max_entries:
            render = lru_cache(max_entries)(render)

    emit = out_file.write
    for tok in token_source(in_file):
        emit(render(tok))
175 | 
176 | 
177 | class _TokenParserFormatter:
178 |     """
179 |     This class defines its `parse` method based on arguments passed.
180 |     Some ugly code is to make all ifs work only once, not for each token.
181 |     """
182 | 
183 |     tpl_newline = "%s{%s}\n"
184 |     tpl_no_newline = "%s{%s} "
185 |     or_sep = "|"
186 | 
187 |     def __init__(self, morph, score, normal_form, tag, newlines, thresh):
188 |         tpl = self.tpl_newline if newlines else self.tpl_no_newline
189 |         morph_tag = morph.tag
190 |         morph_parse = morph.parse
191 |         join = self.or_sep.join
192 | 
193 |         if not normal_form and not tag:
194 |             raise ValueError("Empty output is requested")
195 | 
196 |         if not normal_form and not score and not thresh:
197 |             # morph.tag method is enough
198 |             self.parse = lambda tok: tpl % (tok, join(str(t) for t in morph_tag(tok)))
199 |             return
200 | 
201 |         if normal_form:
202 |             if tag:
203 |                 if score:
204 |                     def _parse_token(tok):
205 |                         seq = [
206 |                             f"{p.normal_form}:{p.score:0.3f}={p.tag}"
207 |                             for p in morph_parse(tok) if p.score >= thresh
208 |                         ]
209 |                         return tpl % (tok, join(seq))
210 |                 else:
211 |                     def _parse_token(tok):
212 |                         seq = [
213 |                             f"{p.normal_form}:{p.tag}"
214 |                             for p in morph_parse(tok) if p.score >= thresh
215 |                         ]
216 |                         return tpl % (tok, join(seq))
217 |             else:
218 |                 val = operator.itemgetter(1)
219 |                 def _parse_token(tok):
220 |                     lemmas = {}
221 |                     for p in morph_parse(tok):
222 |                         lemmas[p.normal_form] = lemmas.get(p.normal_form, 0) + p.score
223 | 
224 |                     items = sorted(
225 |                         [(lemma, w) for (lemma, w) in lemmas.items() if w >= thresh],
226 |                         key=val, reverse=True
227 |                     )
228 |                     if score:
229 |                         seq = [f"{lemma}:{w:0.3f}" for (lemma, w) in items]
230 |                     else:
231 |                         seq = [lemma for (lemma, w) in items]
232 | 
233 |                     return tpl % (tok, join(seq))
234 |         else:
235 |             if score:
236 |                 def _parse_token(tok):
237 |                     seq = [
238 |                         f"{p.score:0.3f}={p.tag}"
239 |                         for p in morph_parse(tok) if p.score >= thresh
240 |                     ]
241 |                     return tpl % (tok, join(seq))
242 |             else:
243 |                 def _parse_token(tok):
244 |                     seq = [
245 |                         "%s" % p.tag
246 |                         for p in morph_parse(tok) if p.score >= thresh
247 |                     ]
248 |                     return tpl % (tok, join(seq))
249 | 
250 |         self.parse = _parse_token
251 | 
252 | 
def _iter_tokens_tokenize(fp):
    """ Lazily produce tokens: every line of ``fp`` goes through simple_word_tokenize """
    return (
        tok
        for text_line in fp
        for tok in simple_word_tokenize(text_line)
    )
256 | 
257 | 
258 | def _iter_tokens_notokenize(fp):
259 |     """ Return an iterator of input tokens; each line is a single token """
260 |     return (line for line in (line.strip() for line in fp) if line)
261 | 


--------------------------------------------------------------------------------
/pymorphy2/dawg.py:
--------------------------------------------------------------------------------
 1 | try:
 2 |     from dawg import DAWG, RecordDAWG, IntCompletionDAWG
 3 |     EXTENSION_AVAILABLE = True
 4 | 
 5 | except ImportError:
 6 |     from dawg_python import DAWG, RecordDAWG, IntCompletionDAWG
 7 |     EXTENSION_AVAILABLE = False
 8 | 
 9 | 
def assert_can_create():
    """
    Raise NotImplementedError unless the C extension (``dawg`` package)
    was imported successfully, i.e. ``EXTENSION_AVAILABLE`` is true.
    """
    if EXTENSION_AVAILABLE:
        return

    raise NotImplementedError(
        "Creating of DAWGs with DAWG-Python is "
        "not supported; install 'dawg' package."
    )
15 | 
16 | 
class WordsDawg(RecordDAWG):
    """
    Record DAWG which stores words.
    """

    # Each value is a pair of unsigned short ints:
    # the paradigm ID and the form index (inside paradigm).
    # Byte order is big-endian (this makes word forms properly sorted).
    DATA_FORMAT = ">HH"

    def __init__(self, data=None):
        """
        Without ``data`` an empty DAWG with ``DATA_FORMAT`` values is
        initialized; with ``data`` the DAWG is built from it, which
        requires the ability to create DAWGs (see ``assert_can_create``).
        """
        if data is not None:
            assert_can_create()
            super().__init__(self.DATA_FORMAT, data)
            return

        super().__init__(self.DATA_FORMAT)
33 | 
34 | 
class PredictionSuffixesDAWG(WordsDawg):
    """
    DAWG for storing prediction data.

    Only ``DATA_FORMAT`` is overridden; construction is inherited from
    :class:`WordsDawg`.
    """

    # We are storing 3 unsigned short ints as values:
    # count, the paradigm ID and the form index (inside paradigm).
    # Byte order is big-endian (this makes word forms properly sorted).
    DATA_FORMAT = ">HHH"
44 | 
45 | 
class ConditionalProbDistDAWG(IntCompletionDAWG):
    """
    DAWG which maps "word:tag" keys to probabilities stored as integers
    (probability multiplied by ``MULTIPLIER`` and truncated by ``int``).
    """

    MULTIPLIER = 1000000

    def __init__(self, data=None):
        """
        ``data``, if given, is an iterable of ``((word, tag), prob)`` items;
        building from data requires the ability to create DAWGs
        (see ``assert_can_create``).
        """
        if data is None:
            super().__init__()
            return

        assert_can_create()
        encoded = (
            (f"{word}:{tag}", int(prob * self.MULTIPLIER))
            for (word, tag), prob in data
        )
        super().__init__(encoded)

    def prob(self, word, tag):
        """
        Return the stored value for (word, tag) scaled back by
        ``MULTIPLIER``; a missing key gives 0.
        """
        stored = self.get(f"{word}:{tag}", 0)
        return stored / self.MULTIPLIER
64 | 
65 | 
class DawgPrefixMatcher(DAWG):
    """ Prefix matcher built on top of ``DAWG`` and its ``prefixes`` method """

    def is_prefixed(self, word):
        """ Return True when ``self.prefixes(word)`` gives a non-empty result """
        found = self.prefixes(word)
        return bool(found)
69 | 
70 | 
class PythonPrefixMatcher:
    """
    Pure-Python prefix matcher: keeps the prefixes in a tuple and
    relies on ``str.startswith``.
    """

    def __init__(self, prefixes):
        self._prefixes = tuple(prefixes)

    def is_prefixed(self, word):
        """ Return True if ``word`` starts with at least one known prefix """
        return word.startswith(self._prefixes)

    def prefixes(self, word):
        """
        Return a list of all known prefixes ``word`` starts with,
        in the order they were passed to the constructor.
        """
        if self.is_prefixed(word):
            return list(filter(word.startswith, self._prefixes))
        # nothing matches: skip the per-prefix scan
        return []
82 | 
83 | 
# Default matcher class: the DAWG-based one when the ``dawg`` C extension
# could be imported, the pure-Python fallback otherwise.
PrefixMatcher = DawgPrefixMatcher if EXTENSION_AVAILABLE else PythonPrefixMatcher
85 | 


--------------------------------------------------------------------------------
/pymorphy2/lang/__init__.py:
--------------------------------------------------------------------------------
1 | from . import ru, uk
2 | 


--------------------------------------------------------------------------------
/pymorphy2/lang/ru/__init__.py:
--------------------------------------------------------------------------------
1 | from .config import *
2 | 


--------------------------------------------------------------------------------
/pymorphy2/lang/ru/config.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Constants and configuration for Russian language.
  3 | """
  4 | from pymorphy2 import units
  5 | 
# paradigm prefixes used for dictionary compilation
# (the empty string stands for "no prefix")
PARADIGM_PREFIXES = ["", "по", "наи"]

# letters initials can start with
# (the Russian alphabet without "Ъ", "Ы" and "Ь")
INITIAL_LETTERS = 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЭЮЯ'

# a list of particles which can be attached to a word using a hyphen
# (each entry includes the leading hyphen)
PARTICLES_AFTER_HYPHEN = ["-то", "-ка", "-таки", "-де", "-тко", "-тка", "-с", "-ста"]

# "ё" is sometimes written as "е", but not the other way around
# (key: the character as written; value: the character it may stand for)
CHAR_SUBSTITUTES = {'е': 'ё'}
 17 | 
 18 | # Prefixes which don't change the word parse.
 19 | KNOWN_PREFIXES = [
 20 |     "авиа",
 21 |     "авто",
 22 |     "аква",
 23 |     "анти",
 24 |     "анти-",
 25 |     "антропо",
 26 |     "архи",
 27 |     "арт",
 28 |     "арт-",
 29 |     "астро",
 30 |     "аудио",
 31 |     "аэро",
 32 |     "без",
 33 |     "бес",
 34 |     "био",
 35 |     "вело",
 36 |     "взаимо",
 37 |     "вне",
 38 |     "внутри",
 39 |     "видео",
 40 |     "вице-",
 41 |     "вперед",
 42 |     "впереди",
 43 |     "гекто",
 44 |     "гелио",
 45 |     "гео",
 46 |     "гетеро",
 47 |     "гига",
 48 |     "гигро",
 49 |     "гипер",
 50 |     "гипо",
 51 |     "гомо",
 52 |     "дву",
 53 |     "двух",
 54 |     "де",
 55 |     "дез",
 56 |     "дека",
 57 |     "деци",
 58 |     "дис",
 59 |     "до",
 60 |     "евро",
 61 |     "за",
 62 |     "зоо",
 63 |     "интер",
 64 |     "инфра",
 65 |     "квази",
 66 |     "квази-",
 67 |     "кило",
 68 |     "кино",
 69 |     "контр",
 70 |     "контр-",
 71 |     "космо",
 72 |     "космо-",
 73 |     "крипто",
 74 |     "лейб-",
 75 |     "лже",
 76 |     "лже-",
 77 |     "макро",
 78 |     "макси",
 79 |     "макси-",
 80 |     "мало",
 81 |     "меж",
 82 |     "медиа",
 83 |     "медиа-",
 84 |     "мега",
 85 |     "мета",
 86 |     "мета-",
 87 |     "метео",
 88 |     "метро",
 89 |     "микро",
 90 |     "милли",
 91 |     "мини",
 92 |     "мини-",
 93 |     "моно",
 94 |     "мото",
 95 |     "много",
 96 |     "мульти",
 97 |     "нано",
 98 |     "нарко",
 99 |     "не",
100 |     "небез",
101 |     "недо",
102 |     "нейро",
103 |     "нео",
104 |     "низко",
105 |     "обер-",
106 |     "обще",
107 |     "одно",
108 |     "около",
109 |     "орто",
110 |     "палео",
111 |     "пан",
112 |     "пара",
113 |     "пента",
114 |     "пере",
115 |     "пиро",
116 |     "поли",
117 |     "полу",
118 |     "после",
119 |     "пост",
120 |     "пост-",
121 |     "порно",
122 |     "пра",
123 |     "пра-",
124 |     "пред",
125 |     "пресс-",
126 |     "противо",
127 |     "противо-",
128 |     "прото",
129 |     "псевдо",
130 |     "псевдо-",
131 |     "радио",
132 |     "разно",
133 |     "ре",
134 |     "ретро",
135 |     "ретро-",
136 |     "само",
137 |     "санти",
138 |     "сверх",
139 |     "сверх-",
140 |     "спец",
141 |     "суб",
142 |     "супер",
143 |     "супер-",
144 |     "супра",
145 |     "теле",
146 |     "тетра",
147 |     "топ-",
148 |     "транс",
149 |     "транс-",
150 |     "ультра",
151 |     "унтер-",
152 |     "штаб-",
153 |     "экзо",
154 |     "эко",
155 |     "эндо",
156 |     "эконом-",
157 |     "экс",
158 |     "экс-",
159 |     "экстра",
160 |     "экстра-",
161 |     "электро",
162 |     "энерго",
163 |     "этно",
164 | ]
165 | 
# default analyzer units
#
# Entries are either a single unit or a list of units.
# NOTE(review): presumably the order matters and units in a nested list
# form one group that is applied together - confirm against MorphAnalyzer.
DEFAULT_UNITS = [
    [
        units.DictionaryAnalyzer(),
        units.AbbreviatedFirstNameAnalyzer(INITIAL_LETTERS),
        units.AbbreviatedPatronymicAnalyzer(INITIAL_LETTERS),
    ],

    units.NumberAnalyzer(),
    units.PunctuationAnalyzer(),
    [
        units.RomanNumberAnalyzer(),
        units.LatinAnalyzer()
    ],

    units.HyphenSeparatedParticleAnalyzer(PARTICLES_AFTER_HYPHEN),
    units.HyphenAdverbAnalyzer(),
    # the same prefix list is used both for hyphenated words
    # and for known-prefix analysis
    units.HyphenatedWordsAnalyzer(skip_prefixes=KNOWN_PREFIXES),
    units.KnownPrefixAnalyzer(known_prefixes=KNOWN_PREFIXES),
    [
        units.UnknownPrefixAnalyzer(),
        units.KnownSuffixAnalyzer()
    ],
    units.UnknAnalyzer(),
]
191 | 


--------------------------------------------------------------------------------
/pymorphy2/lang/uk/__init__.py:
--------------------------------------------------------------------------------
1 | from .config import *
2 | 


--------------------------------------------------------------------------------
/pymorphy2/lang/uk/_prefixes.py:
--------------------------------------------------------------------------------
  1 | # Prefixes which don't change the word parse.
  2 | # The list is from
  3 | # https://github.com/languagetool-org/languagetool/blob/master/languagetool-language-modules/uk/src/main/resources/org/languagetool/resource/uk/dash_prefixes.txt
  4 | _DASH_PREFIXES = """
  5 | 2D
  6 | 2G
  7 | 3D
  8 | 3G
  9 | 4D
 10 | 4G
 11 | CAD
 12 | call
 13 | CD
 14 | CDMA
 15 | CFI
 16 | CNG
 17 | DDoS
 18 | DNS
 19 | DoS
 20 | DSL
 21 | dvd
 22 | e
 23 | fashion
 24 | FM
 25 | ftp
 26 | G
 27 | GMP
 28 | GPRS
 29 | GPS
 30 | grid
 31 | GSM
 32 | HD
 33 | HR
 34 | HSDPA
 35 | ID
 36 | IMEA
 37 | IP
 38 | IT
 39 | led
 40 | LCD
 41 | LNG
 42 | live
 43 | MLM
 44 | MTV
 45 | mp3
 46 | n
 47 | OSB
 48 | pdf
 49 | PhD
 50 | PIN
 51 | POS
 52 | pr
 53 | QR
 54 | R'n'B
 55 | R'N'B
 56 | R&B
 57 | R&D
 58 | s
 59 | sim
 60 | SOS
 61 | SPA
 62 | sms
 63 | TV
 64 | UMTS
 65 | USB
 66 | VIN
 67 | vip
 68 | VoIP
 69 | WAP
 70 | web
 71 | X
 72 | Y
 73 | аль
 74 | альфа
 75 | анти
 76 | АРВ
 77 | арт
 78 | аудіо
 79 | байк
 80 | байкер
 81 | бард
 82 | бас
 83 | бета
 84 | бізнес
 85 | бліц
 86 | блог
 87 | блок
 88 | блюз
 89 | бомж
 90 | бонус
 91 | ботокс
 92 | боулінг
 93 | брейк
 94 | бренд
 95 | бундес
 96 | вакуум
 97 | веб
 98 | велнес
 99 | ВІЛ
100 | віп
101 | віце
102 | гала
103 | гамма
104 | гей
105 | гейм
106 | генерал
107 | гештальт
108 | ГМ
109 | ГМО
110 | гольф
111 | гоп
112 | горе
113 | готик
114 | гранд
115 | ґранд
116 | графіті
117 | грид
118 | грумінг
119 | дайв
120 | дайвінг
121 | данс
122 | даун
123 | дельта
124 | денс
125 | дзен
126 | джаз
127 | диво
128 | дизайн
129 | дизель
130 | долбі
131 | допінг
132 | ДОТС
133 | драг
134 | дрес
135 | дубль
136 | дурман
137 | е
138 | екіпаж
139 | економ
140 | експерт
141 | екс
142 | експрес
143 | екстра
144 | екстрим
145 | екшн
146 | еліт
147 | ерзац
148 | ескорт
149 | євро
150 | жлоб
151 | зіц
152 | зомбі
153 | ЗПГ
154 | івент
155 | імідж
156 | інвест
157 | інді
158 | інсентив
159 | інтернет
160 | інтим
161 | інформ
162 | історико
163 | ІТ
164 | ІЧ
165 | йога
166 | камер
167 | кантрі
168 | караоке
169 | кастинг
170 | квазі
171 | кемпінг
172 | кваліфайн
173 | кібер
174 | кітч
175 | козак
176 | коктейль
177 | колл
178 | комік
179 | комікс
180 | майстер
181 | конгрес
182 | консалтинг
183 | контент
184 | контр
185 | конференц
186 | концепт
187 | кредит
188 | кремль
189 | крос
190 | КСВ
191 | лайт
192 | лаунж
193 | лейб
194 | лесбі
195 | лгбт
196 | лже
197 | ліберал
198 | лор
199 | люкс
200 | люмпен
201 | максі
202 | маркетинг
203 | мас
204 | мега
205 | медіа
206 | менеджмент
207 | метал
208 | міді
209 | мікс
210 | мілітарі
211 | міні
212 | МММ
213 | модерн
214 | мульт
215 | мультимедіа
216 | напів
217 | націонал
218 | нація
219 | НВЧ
220 | нокаут
221 | ностальжі
222 | нью
223 | обер
224 | онлайн
225 | офіс
226 | ОУН
227 | панк
228 | ПВХ
229 | ПЕТ
230 | піар
231 | пін
232 | плейбек
233 | ПЛР
234 | покер
235 | поп
236 | пост
237 | поттер
238 | постпродакшн
239 | прайм
240 | прайс
241 | прем'єр
242 | преміум
243 | прес
244 | приват
245 | продакшн
246 | профі
247 | псевдо
248 | реаліті
249 | реггі
250 | резус
251 | рейв
252 | рентген
253 | рейтинг
254 | реп
255 | ретро
256 | референс
257 | референц
258 | ритм
259 | РК
260 | рок
261 | ротарі
262 | РХБ
263 | салон
264 | саунд
265 | своп
266 | секонд
267 | секс
268 | сексі
269 | сервіс
270 | скейт
271 | скінхед
272 | скретч
273 | слем
274 | смарт
275 | смс
276 | СНІД
277 | соціал
278 | СОС
279 | соул
280 | софт
281 | спа
282 | спам
283 | спаринг
284 | СПГ
285 | спорт
286 | спрей
287 | стартап
288 | стоп
289 | стрес
290 | стрип
291 | стриптиз
292 | супер
293 | тайм
294 | талант
295 | тандем
296 | танц
297 | тату
298 | ТБ
299 | телеком
300 | тест
301 | топ
302 | топлес
303 | торент
304 | тренд
305 | тренінг
306 | треш
307 | триб'ют
308 | трофі
309 | тур
310 | тюнинг
311 | УЗД
312 | ура
313 | УФ
314 | фан
315 | фест
316 | фешн
317 | фітнес
318 | флеш
319 | ФМ
320 | фолк
321 | фольк
322 | хеш
323 | цар
324 | чудо
325 | хайтек
326 | хард
327 | хіпі
328 | хостел
329 | чіп
330 | шейпінг
331 | шенген
332 | шеф
333 | шопінг
334 | шоу
335 | штаб
336 | юніор
337 | """
338 | 
339 | # TODO: prefixes without a hyphen?
340 | KNOWN_PREFIXES = [
341 |     line.strip() + "-"
342 |     for line in _DASH_PREFIXES.split("\n")
343 |     if line.strip()
344 | ]
345 | _known1 = set(KNOWN_PREFIXES)
346 | 
347 | 
# These prefixes are adapted from pymorphy2 Russian prefixes list;
# see https://github.com/kmike/pymorphy2/issues/58#issuecomment-207670264
#
# Entries which are already present in the dash-prefix list above (_known1)
# are skipped, so that KNOWN_PREFIXES has no duplicates.
# Trailing comments are Ukrainian example words for a prefix.
KNOWN_PREFIXES += [_p for _p in [
    "авіа",  # авіаквиток
    "авто",  # автоперетворювач
    "аква",  # аквапарк
    "анти",  # антимонопольний
    "анти-", #
    "антропо", # антропогенний
    "архі",  # архіважливий
    "арт",   #
    "арт-",  # арт-майдан
    "астро", # астронавігація
    "аудіо", # аудіокнига
    "аеро",  # аеромобільний
    "без",   # безкоштовно
    "біо",   # біометричний
    "вело",  # велотренажер
    "взаємо", # взаємовиключний
    "поза",  # позаплановий
    "внутрішньо", # внутрішньовенно
    "відео", # відеоспостереження
    "віце-", #
    "вперед", # впередсмотрящий
    "гекто", # гектолітр
    "геліо", # геліоцентрична
    "гео",   # геолокація
    "гетеро", # гетерохромія
    "гіга",  # гігават
    "гігро", # гігроскопічність
    "гіпер", # гіперактивний
    "гіпо",  # гіпоалергенний
    "гомо",  # гомозигота
    "дво",   # двонаправлений, двоповерховий
    "де",    # декваліфікація
    "дез",   # дезінфекція
    # "дека", # ???
    "деци",  # дециметр
    "дис",   # дискваліфікація
    "до",    # доїхати
    "євро",  # євробачення
    "за",    # запрацювати
    "зоо",   # зоомагазин
    "інтер", # інтерактивний
    "інфра", # інфрачервоний
    "квазі", # квазікристал
    "квазі-", # квазі-заходи
    "кіло",  # кілограм
    "кіно",  # кінокамера
    "контр", # контрзаходи
    "контр-", # контр-адмірал
    "космо", # космологія
    "космо-", #
    "крипто", # криптозоологія
    "лейб-", # лейб-гвардія
    "лже",   # лжерелігія
    "лже-",  # лже-розтяжки
    "макро", # макросвіт
    # "макси", # ???
    # "макси-", #  ???
    "мало",  # малоймовірний
    "між",   # міжнаціональний
    "медіа", # медіапрогравач
    "медіа-", #
    "мега",  # мегават
    "мета",  # метапрограмування
    "мета-", #
    "метео", # метеосупутник
    "метро", # метросексуал
    "мікро", # мікросвіт
    "мілі",  # міліграм
    "міні",  #
    "міні-", #
    "моно",  # моновалентна
    "мото",  # мотоспорт
    "багато", # багатоповерховий
    # "мульті", # ???
    "нано",  # нанометр
    "нарко", # наркозалежність
    "не",    # ненадійний
    # "небез", #
    # "недо", #
    "нейро", # нейромедицина
    "нео",   #
    "низько", # низькокалорійний
    "обер-", # обер-офіцер
    "загально", # загальнонаціональний
    # "одно", # ???
    "навколо", # навколоплідний
    "орто",  # ортофосфатна
    "палео", #
    "пан",   # панамериканський
    "пара",  # паранормальний
    "пента", #
    "пере",  # переохолоджений
    "піро",  #
    "полі",  #
    "полу",  #
    "після", # післяопераційний
    "пост",  #
    "пост-", #
    "порно", #
    "пра",   # прадід
    "пра-",  #
    "перед", # передбачення
    "прес-", #
    "проти", # противірусні
    "проти-", #
    "прото", #
    "псевдо", # псевдокод
    "псевдо-", #
    "радіо", #
    "різно", # різнокаліберні
    "ре",    # ревакцинація
    "ретро", # ретроактивний
    "ретро-", #
    "само",  # самонавідний
    "санти", # сантиметр
    "над",   # надпровідний
    "над-",  #
    "спец",  #
    "суб",   # субтропічний
    "супер", # суперпозиція
    "супер-", #
    # "супра", #
    "теле",  # телеприсутність
    "тетра", #
    "топ-",  #
    "транс", # трансатлантичний
    "транс-", #
    "ультра", # ультрафіолет
    "унтер-", #
    "штаб-", # штаб-квартира
    "екзо",  #
    "еко",   #
    "ендо",  #
    "економ-", #
    "екс",   #
    "екс-",  #
    "екстра", #
    "екстра-", #
    "електро", #
    "енерго", #
    "етно",  #
] if _p not in _known1]
493 | 


--------------------------------------------------------------------------------
/pymorphy2/lang/uk/config.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Constants and configuration for Ukrainian language.
 3 | """
 4 | from pymorphy2 import units
 5 | from ._prefixes import KNOWN_PREFIXES
 6 | 
# paradigm prefixes used for dictionary compilation
# (the empty string stands for "no prefix")
PARADIGM_PREFIXES = ["", "най", "якнай", "щонай"]

# letters initials can start with
INITIAL_LETTERS = 'АБВГҐДЕЄЖЗІЇЙКЛМНОПРСТУФХЦЧШЩЮЯ'

# a list of particles which can be attached to a word using a hyphen
# (each entry includes the leading hyphen)
PARTICLES_AFTER_HYPHEN = ["-но", "-таки", "-бо", "-от"]

# "ґ" is sometimes written as "г", but not the other way around
# (key: the character as written; value: the character it may stand for)
CHAR_SUBSTITUTES = {'г': 'ґ'}

# default analyzer units
#
# Entries are either a single unit or a list of units.
# NOTE(review): presumably the order matters and units in a nested list
# form one group that is applied together - confirm against MorphAnalyzer.
DEFAULT_UNITS = [
    [
        units.DictionaryAnalyzer(),
        units.AbbreviatedFirstNameAnalyzer(INITIAL_LETTERS),
        units.AbbreviatedPatronymicAnalyzer(INITIAL_LETTERS),

        # "I" can be a Roman number or an English word
        units.RomanNumberAnalyzer(),
        units.LatinAnalyzer()
    ],

    units.NumberAnalyzer(),
    units.PunctuationAnalyzer(),

    units.HyphenSeparatedParticleAnalyzer(PARTICLES_AFTER_HYPHEN),
    # the same prefix list (imported from ._prefixes) is used both for
    # hyphenated words and for known-prefix analysis
    units.HyphenatedWordsAnalyzer(skip_prefixes=KNOWN_PREFIXES),
    units.KnownPrefixAnalyzer(known_prefixes=KNOWN_PREFIXES),
    [
        units.UnknownPrefixAnalyzer(),
        units.KnownSuffixAnalyzer()
    ],
    units.UnknAnalyzer(),
]
43 | 


--------------------------------------------------------------------------------
/pymorphy2/opencorpora_dict/__init__.py:
--------------------------------------------------------------------------------
1 | from .compile import convert_to_pymorphy2
2 | from .storage import load_dict as load
3 | from .wrapper import Dictionary
4 | 


--------------------------------------------------------------------------------
/pymorphy2/opencorpora_dict/parse.py:
--------------------------------------------------------------------------------
  1 | """
  2 | :mod:`pymorphy2.opencorpora_dict.parse` is a
  3 | module for OpenCorpora XML dictionaries parsing.
  4 | """
  5 | import collections
  6 | import logging
  7 | 
# Prefer lxml's ``iterparse`` when lxml is installed; otherwise fall back
# to the standard library implementation. ``xml_clear_elem`` is defined
# next to whichever ``iterparse`` was imported.
try:
    from lxml.etree import iterparse

    def xml_clear_elem(elem):
        """Clear ``elem`` and drop the siblings that precede it (lxml)."""
        elem.clear()
        # remove already-seen preceding siblings from the parent, one by
        # one, until ``elem`` has no previous sibling left
        while elem.getprevious() is not None:
            del elem.getparent()[0]

except ImportError:
    from xml.etree.ElementTree import iterparse

    def xml_clear_elem(elem):
        """Clear ``elem`` (standard library fallback)."""
        elem.clear()
 21 | 
 22 | from pymorphy2.utils import with_progress
 23 | 
 24 | 
 25 | logger = logging.getLogger(__name__)
 26 | 
# Result of :func:`parse_opencorpora_xml`:
#   lexemes   - dict: lexeme id -> list of (word, tag) pairs
#   links     - list of (from, to, type) tuples
#   grammemes - list of (name, parent, alias, description) tuples
#   version, revision - attributes of the root <dictionary> element
ParsedDictionary = collections.namedtuple('ParsedDictionary', 'lexemes links grammemes version revision')
 28 | 
 29 | 
 30 | def get_dictionary_info(filename, elem_limit=1000):
 31 |     """ Return dictionary version and revision """
 32 |     for idx, (ev, elem) in enumerate(iterparse(filename, events=('start',))):
 33 |         if elem.tag == 'dictionary':
 34 |             version = elem.get('version')
 35 |             revision = elem.get('revision')
 36 |             return version, revision
 37 |         if idx > elem_limit:
 38 |             return None, None
 39 |     return None, None
 40 | 
 41 | 
 42 | def parse_opencorpora_xml(filename):
 43 |     """
 44 |     Parse OpenCorpora dict XML and return a ``ParsedDictionary`` namedtuple.
 45 |     """
 46 | 
 47 |     links = []
 48 |     lexemes = {}
 49 |     grammemes = []
 50 | 
 51 |     version, revision = get_dictionary_info(filename)
 52 |     logger.info("dictionary v%s, rev%s", version, revision)
 53 |     interesting_tags = {'grammeme', 'lemma', 'link'}
 54 | 
 55 |     def _parse(filename):
 56 |         for ev, elem in iterparse(filename):
 57 |             if elem.tag not in interesting_tags:
 58 |                 continue
 59 |             yield ev, elem
 60 | 
 61 |     logger.info("parsing XML dictionary")
 62 | 
 63 |     for ev, elem in with_progress(_parse(filename), "XML parsing"):
 64 |         if elem.tag == 'grammeme':
 65 |             name = elem.find('name').text
 66 |             parent = elem.get('parent')
 67 |             alias = elem.find('alias').text
 68 |             description = elem.find('description').text
 69 | 
 70 |             grammeme = (name, parent, alias, description)
 71 |             grammemes.append(grammeme)
 72 |             xml_clear_elem(elem)
 73 | 
 74 |         if elem.tag == 'lemma':
 75 |             lex_id, word_forms = _word_forms_from_xml_elem(elem)
 76 |             lexemes[lex_id] = word_forms
 77 |             xml_clear_elem(elem)
 78 | 
 79 |         elif elem.tag == 'link':
 80 |             link_tuple = (
 81 |                 elem.get('from'),
 82 |                 elem.get('to'),
 83 |                 elem.get('type'),
 84 |             )
 85 |             links.append(link_tuple)
 86 |             xml_clear_elem(elem)
 87 | 
 88 |     return ParsedDictionary(
 89 |         lexemes=lexemes,
 90 |         links=links,
 91 |         grammemes=grammemes,
 92 |         version=version,
 93 |         revision=revision
 94 |     )
 95 | 
 96 | 
 97 | def _grammemes_from_elem(elem):
 98 |     return ",".join([g.get('v') for g in elem.iter('g')])
 99 | 
100 | 
def _word_forms_from_xml_elem(elem):
    """
    Return a ``(lex_id, word_forms)`` pair for a "lemma" XML element.

    ``lex_id`` is the value of the element's ``id`` attribute and
    ``word_forms`` is a list of ``(word, tag)`` pairs. Words are
    lowercased; a tag is the grammemes of the ``l`` element followed by a
    space and the grammemes of the ``f`` element (stripped).

    ``word_forms`` is empty when the lemma has no children, or when a
    form carries no grammemes at all - in that case the whole lexeme is
    dropped and a warning is logged.
    """
    lexeme = []
    lex_id = elem.get('id')

    if len(elem) == 0:  # deleted lexeme?
        return lex_id, lexeme

    base_info = list(elem.iter('l'))

    # exactly one <l> element is expected per lemma
    assert len(base_info) == 1
    base_grammemes = _grammemes_from_elem(base_info[0])

    for form_elem in elem.iter('f'):
        grammemes = _grammemes_from_elem(form_elem)
        form = form_elem.get('t').lower()
        if not (base_grammemes + grammemes):
            # let logging do the interpolation lazily
            logger.warning("no information provided for word %s, dropping the whole lexeme", form)
            return lex_id, []
        lexeme.append(
            (form, (base_grammemes + " " + grammemes).strip())
        )

    return lex_id, lexeme
129 | 


--------------------------------------------------------------------------------
/pymorphy2/opencorpora_dict/preprocess.py:
--------------------------------------------------------------------------------
  1 | """
  2 | :mod:`pymorphy2.opencorpora_dict.preprocess` is a
  3 | module for preprocessing parsed OpenCorpora dictionaries.
  4 | 
  5 | The presence of this module means that pymorphy2 dictionaries are
  6 | not fully compatible with OpenCorpora.
  7 | """
  8 | import collections
  9 | import logging
 10 | from functools import lru_cache
 11 | 
 12 | from pymorphy2.utils import with_progress
 13 | 
 14 | logger = logging.getLogger(__name__)
 15 | 
 16 | 
 17 | def simplify_tags(parsed_dict, skip_space_ambiguity=True):
 18 |     """
 19 |     This function simplifies tags in :param:`parsed_dict`.
 20 |     :param:`parsed_dict` is modified inplace.
 21 |     """
 22 |     logger.info("simplifying tags: looking for tag spellings")
 23 |     spellings = _get_tag_spellings(parsed_dict)
 24 | 
 25 |     logger.info("simplifying tags: looking for spelling duplicates "
 26 |                 "(skip_space_ambiguity: %s)", skip_space_ambiguity)
 27 |     tag_replaces = _get_duplicate_tag_replaces(spellings, skip_space_ambiguity)
 28 |     logger.debug("%d duplicate tags will be removed", len(tag_replaces))
 29 | 
 30 |     logger.info("simplifying tags: fixing")
 31 |     for lex_id in with_progress(parsed_dict.lexemes, "Simplifying tags"):
 32 |         new_lexeme = [
 33 |             (word, _simplify_tag(tag, tag_replaces))
 34 |             for word, tag in parsed_dict.lexemes[lex_id]
 35 |         ]
 36 |         parsed_dict.lexemes[lex_id] = new_lexeme
 37 | 
 38 | 
 39 | def drop_unsupported_parses(parsed_dict):
 40 |     """
 41 |     Remove unsupported parses from OpenCorpora dictionary.
 42 | 
 43 |     In particular, lexemes with Init tags are removed
 44 |     because pymorphy2 handles them differently.
 45 |     """
 46 |     logger.info("dropping unsupported parses")
 47 |     for lex_id in parsed_dict.lexemes:
 48 |         parsed_dict.lexemes[lex_id] = [
 49 |             (word, tag) for word, tag in parsed_dict.lexemes[lex_id]
 50 |             if 'Init' not in tag
 51 |         ]
 52 | 
 53 | 
 54 | @lru_cache()
 55 | def tag2grammemes(tag_str):
 56 |     """ Given tag string, return tag grammemes """
 57 |     return _split_grammemes(replace_redundant_grammemes(tag_str))
 58 | 
 59 | 
 60 | @lru_cache()
 61 | def replace_redundant_grammemes(tag_str):
 62 |     """ Replace 'loc1', 'gen1' and 'acc1' grammemes in ``tag_str`` """
 63 |     return tag_str.replace('loc1', 'loct').replace('gen1', 'gent').replace('acc1', 'accs')
 64 | 
 65 | 
 66 | def _split_grammemes(tag_str):
 67 |     return frozenset(tag_str.replace(' ', ',', 1).split(','))
 68 | 
 69 | 
 70 | def _get_tag_spellings(parsed_dict):
 71 |     """
 72 |     Return a dict where keys are sets of grammemes found in dictionary
 73 |     and values are counters of all tag spellings for these grammemes.
 74 |     """
 75 |     spellings = collections.defaultdict(lambda: collections.defaultdict(int))
 76 |     for tag in _itertags(parsed_dict):
 77 |         spellings[tag2grammemes(tag)][tag] += 1
 78 |     return spellings
 79 | 
 80 | 
 81 | def _get_duplicate_tag_replaces(spellings, skip_space_ambiguity):
 82 |     replaces = {}
 83 |     for grammemes in spellings:
 84 |         tags = spellings[grammemes]
 85 |         if _is_ambiguous(tags.keys(), skip_space_ambiguity):
 86 |             items = sorted(tags.items(), key=lambda it: it[1], reverse=True)
 87 |             top_tag = items[0][0]
 88 |             for tag, count in items[1:]:
 89 |                 replaces[tag] = top_tag
 90 |     return replaces
 91 | 
 92 | 
 93 | def _is_ambiguous(tags, skip_space_ambiguity=True):
 94 |     """
 95 |     >>> _is_ambiguous(['NOUN sing,masc'])
 96 |     False
 97 |     >>> _is_ambiguous(['NOUN sing,masc', 'NOUN masc,sing'])
 98 |     True
 99 |     >>> _is_ambiguous(['NOUN masc,sing', 'NOUN,masc sing'])
100 |     False
101 |     >>> _is_ambiguous(['NOUN masc,sing', 'NOUN,masc sing'], skip_space_ambiguity=False)
102 |     True
103 |     """
104 |     if len(tags) < 2:
105 |         return False
106 | 
107 |     if skip_space_ambiguity:
108 |         # if space position differs then skip this ambiguity
109 |         # XXX: this doesn't handle cases when space position difference
110 |         # is not the only ambiguity
111 |         space_pos = [tag.index(' ') if ' ' in tag else None
112 |                      for tag in map(str, tags)]
113 |         if len(space_pos) == len(set(space_pos)):
114 |             return False
115 | 
116 |     return True
117 | 
118 | 
def _simplify_tag(tag, tag_replaces):
    """ Normalize redundant grammemes in ``tag``, then apply ``tag_replaces`` if it has an entry """
    normalized = replace_redundant_grammemes(tag)
    if normalized in tag_replaces:
        return tag_replaces[normalized]
    return normalized
122 | 
123 | 
def _itertags(parsed_dict):
    """ Yield the tag of every word form of every lexeme in ``parsed_dict`` """
    lexemes = parsed_dict.lexemes
    for lex_id in with_progress(lexemes, "Looking for tag spellings"):
        word_forms = lexemes[lex_id]
        for _word, tag in word_forms:
            yield tag
128 | 


--------------------------------------------------------------------------------
/pymorphy2/opencorpora_dict/probability.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Module for estimating P(t|w) from partially annotated OpenCorpora XML dump
  3 | and saving this information to a file.
  4 | 
  5 | This module requires NLTK 3.x, opencorpora-tools>=0.4.4 and dawg >= 0.7
  6 | packages for probability estimation and resulting file creation.
  7 | """
  8 | import logging
  9 | import os
 10 | 
 11 | from pymorphy2 import MorphAnalyzer
 12 | from pymorphy2.dawg import ConditionalProbDistDAWG
 13 | from pymorphy2.opencorpora_dict.preprocess import tag2grammemes
 14 | from pymorphy2.opencorpora_dict.storage import update_meta
 15 | from pymorphy2.utils import with_progress
 16 | 
 17 | 
 18 | def add_conditional_tag_probability(corpus_filename, out_path, min_word_freq,
 19 |                                     logger=None, morph=None):
 20 |     """ Add P(t|w) estimates to a compiled dictionary """
 21 | 
 22 |     if morph is None:
 23 |         morph = MorphAnalyzer(out_path, probability_estimator_cls=None)
 24 | 
 25 |     if logger is None:
 26 |         logger = logging.getLogger(__name__)
 27 | 
 28 |     logger.info("Estimating P(t|w) from %s" % corpus_filename)
 29 |     cpd, cfd = estimate_conditional_tag_probability(morph, corpus_filename, logger)
 30 | 
 31 |     logger.info("Encoding P(t|w) as DAWG")
 32 |     d = build_cpd_dawg(morph, cpd, int(min_word_freq))
 33 |     dawg_filename = os.path.join(out_path, 'p_t_given_w.intdawg')
 34 |     d.save(dawg_filename)
 35 | 
 36 |     logger.info("Updating meta information")
 37 |     meta_filename = os.path.join(out_path, 'meta.json')
 38 |     update_meta(meta_filename, [
 39 |         ('P(t|w)', True),
 40 |         ('P(t|w)_unique_words', len(cpd.conditions())),
 41 |         ('P(t|w)_outcomes', cfd.N()),
 42 |         ('P(t|w)_min_word_freq', int(min_word_freq)),
 43 |     ])
 44 |     logger.info('\nDone.')
 45 | 
 46 | 
def estimate_conditional_tag_probability(morph, corpus_filename, logger=None):
    """
    Estimate P(t|w) based on OpenCorpora xml dump.

    Probability is estimated based on counts of disambiguated
    ambiguous words, using simple Laplace smoothing.

    ``morph`` is used to find out how many tags a word can have;
    ``corpus_filename`` is passed to ``opencorpora.CorpusReader``.
    Returns a ``(cpd, cfd)`` pair: the conditional probability
    distribution and the conditional frequency distribution it was
    computed from.
    """
    # optional dependencies are imported lazily, only when estimation runs
    import nltk
    import opencorpora

    if logger is None:
        logger = logging.getLogger(__name__)

    class _ConditionalProbDist(nltk.ConditionalProbDist):
        """
        This ConditionalProbDist subclass passes 'condition' variable to
        probdist_factory. See https://github.com/nltk/nltk/issues/500
        """
        def __init__(self, cfdist, probdist_factory):
            # the parent __init__ is not called here; this override builds
            # one distribution per condition, handing the condition to
            # the factory
            self._probdist_factory = probdist_factory
            for condition in cfdist:
                self[condition] = probdist_factory(cfdist[condition], condition)

    reader = opencorpora.CorpusReader(corpus_filename)

    # materialize the (word, tag) pairs of words having exactly one parse
    # in the corpus
    disambig_words = list(
        with_progress(
            _disambiguated_words(reader),
            "Reading disambiguated words from corpus"
        )
    )

    disambig_words = with_progress(disambig_words, "Filtering out non-ambiguous words")
    # keep only words for which ``morph`` returns more than one tag, and
    # skip corpus tags whose grammeme set is exactly {'UNKN'}
    ambiguous_words = [
        (w, gr) for (w, gr) in (
            (w.lower(), tag2grammemes(t))
            for (w, t) in disambig_words
            if len(morph.tag(w)) > 1
        ) if gr != {'UNKN'}
    ]

    logger.info("Computing P(t|w)")

    def probdist_factory(fd, condition):
        # number of bins: the larger of the number of tags ``morph`` knows
        # for the word and the number of bins of the frequency distribution
        bins = max(len(morph.tag(condition)), fd.B())
        return nltk.LaplaceProbDist(fd, bins=bins)

    cfd = nltk.ConditionalFreqDist(ambiguous_words)
    cpd = _ConditionalProbDist(cfd, probdist_factory)
    return cpd, cfd
 97 | 
 98 | 
 99 | def build_cpd_dawg(morph, cpd, min_word_freq):
100 |     """
101 |     Return conditional tag probability information encoded as DAWG.
102 | 
103 |     For each "interesting" word and tag the resulting DAWG
104 |     stores ``"word:tag"`` key with ``probability*1000000`` integer value.
105 |     """
106 |     words = [word for (word, fd) in cpd.items()
107 |              if fd.freqdist().N() >= min_word_freq]
108 | 
109 |     prob_data = filter(
110 |         lambda rec: not _all_the_same(rec[1]),
111 |         ((word, _tag_probabilities(morph, word, cpd)) for word in words)
112 |     )
113 |     dawg_data = (
114 |         ((word, tag), prob)
115 |         for word, probs in prob_data
116 |         for tag, prob in probs.items()
117 |     )
118 |     return ConditionalProbDistDAWG(dawg_data)
119 | 
120 | 
121 | def _disambiguated_words(reader):
122 |     return (
123 |         (word, parses[0][1])
124 |         for (word, parses) in reader.iter_parsed_words()
125 |         if len(parses) == 1
126 |     )
127 | 
128 | 
129 | def _all_the_same(probs):
130 |     return len(set(probs.values())) <= 1
131 | 
132 | 
133 | def _parse_probabilities(morph, word, cpd):
134 |     """
135 |     Return probabilities of word parses
136 |     according to CustomConditionalProbDist ``cpd``.
137 |     """
138 |     parses = morph.parse(word)
139 |     probabilities = [cpd[word].prob(p.tag.grammemes) for p in parses]
140 |     return list(zip(parses, probabilities))
141 | 
142 | 
def _tag_probabilities(morph, word, cpd):
    """ Return a ``{tag: probability}`` dict built from the parses of ``word`` """
    return {
        parse.tag: prob
        for parse, prob in _parse_probabilities(morph, word, cpd)
    }
148 | 
149 | 
150 | 


--------------------------------------------------------------------------------
/pymorphy2/opencorpora_dict/storage.py:
--------------------------------------------------------------------------------
  1 | """
  2 | :mod:`pymorphy2.opencorpora_dict.storage` is a
  3 | module for saving and loading pymorphy2 dictionaries.
  4 | """
  5 | import array
  6 | import collections
  7 | import datetime
  8 | import logging
  9 | import os
 10 | import struct
 11 | 
 12 | import pymorphy2
 13 | from pymorphy2 import dawg
 14 | from pymorphy2 import tagset
 15 | from pymorphy2.utils import json_write, json_read
 16 | 
 17 | logger = logging.getLogger(__name__)
 18 | 
# Version of the on-disk dictionary format written by save_compiled_dict;
# _assert_format_is_compatible compares its major part when loading.
CURRENT_FORMAT_VERSION = '2.4'

# Everything load_dict() returns, bundled into one object.
LoadedDictionary = collections.namedtuple('LoadedDictionary', [
    'meta',  # contents of meta.json
    'gramtab',  # list of Tag instances
    'suffixes',  # contents of suffixes.json
    'paradigms',  # list of array.array("H") read from paradigms.array
    'words',  # WordsDawg loaded from words.dawg
    'prediction_suffixes_dawgs',  # one PredictionSuffixesDAWG per paradigm prefix
    'Tag',  # Tag class matching the requested gramtab format
    'paradigm_prefixes',  # list of paradigm prefixes
])
 31 | 
 32 | 
 33 | def load_dict(path, gramtab_format='opencorpora-int'):
 34 |     """
 35 |     Load pymorphy2 dictionary.
 36 |     ``path`` is a folder name with dictionary data.
 37 |     """
 38 | 
 39 |     _f = lambda p: os.path.join(path, p)
 40 | 
 41 |     meta = load_meta(_f('meta.json'))
 42 |     _assert_format_is_compatible(meta, path)
 43 | 
 44 |     Tag = _load_tag_class(gramtab_format, _f('grammemes.json'))
 45 | 
 46 |     str_gramtab = _load_gramtab(meta, gramtab_format, path)
 47 |     gramtab = [Tag(tag_str) for tag_str in str_gramtab]
 48 | 
 49 |     suffixes = json_read(_f('suffixes.json'))
 50 |     paradigms = _load_paradigms(_f('paradigms.array'))
 51 |     words = dawg.WordsDawg().load(_f('words.dawg'))
 52 | 
 53 |     try:
 54 |         paradigm_prefixes = meta["compile_options"]["paradigm_prefixes"]
 55 |     except KeyError:
 56 |         # support dicts v2.4
 57 |         paradigm_prefixes = json_read(_f('paradigm-prefixes.json'))
 58 | 
 59 |     prediction_suffixes_dawgs = []
 60 |     for prefix_id in range(len(paradigm_prefixes)):
 61 |         fn = _f('prediction-suffixes-%s.dawg' % prefix_id)
 62 |         assert os.path.exists(fn)
 63 |         prediction_suffixes_dawgs.append(dawg.PredictionSuffixesDAWG().load(fn))
 64 | 
 65 |     return LoadedDictionary(
 66 |         meta=meta,
 67 |         gramtab=gramtab,
 68 |         suffixes=suffixes,
 69 |         paradigms=paradigms,
 70 |         words=words,
 71 |         prediction_suffixes_dawgs=prediction_suffixes_dawgs,
 72 |         Tag=Tag,
 73 |         paradigm_prefixes=paradigm_prefixes,
 74 |     )
 75 | 
 76 | 
 77 | def save_compiled_dict(compiled_dict, out_path, source_name, language_code):
 78 |     """
 79 |     Save a compiled_dict to ``out_path``
 80 |     ``out_path`` should be a name of folder where to put dictionaries.
 81 |     """
 82 |     logger.info("Saving...")
 83 |     _f = lambda path: os.path.join(out_path, path)
 84 | 
 85 |     json_write(_f('grammemes.json'), compiled_dict.parsed_dict.grammemes)
 86 | 
 87 |     gramtab_formats = {}
 88 |     for format, Tag in tagset.registry.items():
 89 |         Tag._init_grammemes(compiled_dict.parsed_dict.grammemes)
 90 |         new_gramtab = [Tag._from_internal_tag(tag) for tag in compiled_dict.gramtab]
 91 | 
 92 |         gramtab_name = "gramtab-%s.json" % format
 93 |         gramtab_formats[format] = gramtab_name
 94 | 
 95 |         json_write(_f(gramtab_name), new_gramtab)
 96 | 
 97 |     with open(_f('paradigms.array'), 'wb') as f:
 98 |         f.write(struct.pack("<H", len(compiled_dict.paradigms)))
 99 |         for para in compiled_dict.paradigms:
100 |             f.write(struct.pack("<H", len(para)))
101 |             para.tofile(f)
102 | 
103 |     json_write(_f('suffixes.json'), compiled_dict.suffixes)
104 |     compiled_dict.words_dawg.save(_f('words.dawg'))
105 | 
106 |     for prefix_id, prediction_suffixes_dawg in enumerate(compiled_dict.prediction_suffixes_dawgs):
107 |         prediction_suffixes_dawg.save(_f('prediction-suffixes-%s.dawg' % prefix_id))
108 | 
109 |     logger.debug("computing metadata..")
110 | 
111 |     def _dawg_len(dawg):
112 |         return sum(1 for k in dawg.iterkeys())  # method .keys() of DAWG/DAWG-Python returns a list
113 | 
114 |     logger.debug('  words_dawg_len')
115 |     words_dawg_len = _dawg_len(compiled_dict.words_dawg)
116 |     logger.debug('  prediction_suffixes_dawgs_len')
117 | 
118 |     prediction_suffixes_dawg_lenghts = []
119 |     for prediction_suffixes_dawg in compiled_dict.prediction_suffixes_dawgs:
120 |         prediction_suffixes_dawg_lenghts.append(_dawg_len(prediction_suffixes_dawg))
121 | 
122 |     write_meta(_f('meta.json'), [
123 |         ['language_code', language_code],
124 |         ['format_version', CURRENT_FORMAT_VERSION],
125 |         ['pymorphy2_version', pymorphy2.__version__],
126 |         ['compiled_at', datetime.datetime.utcnow().isoformat()],
127 | 
128 |         ['source', source_name],
129 |         ['source_version', compiled_dict.parsed_dict.version],
130 |         ['source_revision', compiled_dict.parsed_dict.revision],
131 |         ['source_lexemes_count', len(compiled_dict.parsed_dict.lexemes)],
132 |         ['source_links_count', len(compiled_dict.parsed_dict.links)],
133 | 
134 |         ['gramtab_length', len(compiled_dict.gramtab)],
135 |         ['gramtab_formats', gramtab_formats],
136 |         ['paradigms_length', len(compiled_dict.paradigms)],
137 |         ['suffixes_length', len(compiled_dict.suffixes)],
138 | 
139 |         ['words_dawg_length', words_dawg_len],
140 |         ['compile_options', compiled_dict.compile_options],
141 |         ['prediction_suffixes_dawg_lengths', prediction_suffixes_dawg_lenghts],
142 |     ])
143 | 
144 | 
def load_meta(filename):
    """
    Load metadata from the JSON file ``filename``.

    Floats are kept as strings (``parse_float=str``), so a value such as
    a format version is not altered by float conversion. The result is
    returned as an ``OrderedDict``.
    """
    meta = json_read(filename, parse_float=str)
    # OrderedDict is always available on the Python versions this module
    # supports (the module uses f-strings), so no plain-dict fallback
    # is needed.
    return collections.OrderedDict(meta)
151 | 
152 | 
def write_meta(filename, meta):
    """ Write metadata to ``filename``; a dict is stored as a list of its items. """
    payload = list(meta.items()) if isinstance(meta, dict) else meta
    json_write(filename, payload)
158 | 
159 | 
def update_meta(filename, extra):
    """ Read the metadata file, merge ``extra`` into it and write it back """
    current = load_meta(filename)
    current.update(extra)
    write_meta(filename, current)
165 | 
166 | 
def _load_tag_class(gramtab_format, grammemes_filename):
    """
    Return the Tag class registered for ``gramtab_format``, initialized
    with the grammemes read from ``grammemes_filename``.

    Raises ValueError for a format that is not in the tagset registry.
    """
    known_formats = tagset.registry
    if gramtab_format not in known_formats:
        raise ValueError("This gramtab format ('%s') is unsupported." % gramtab_format)

    # FIXME: clone the class
    Tag = known_formats[gramtab_format] #._clone_class()
    Tag._init_grammemes(json_read(grammemes_filename))
    return Tag
179 | 
180 | 
181 | def _load_gramtab(meta, gramtab_format, path):
182 |     """ Load gramtab (a list of tags) """
183 |     gramtab_formats = meta.get('gramtab_formats', {})
184 |     if gramtab_format not in gramtab_formats:
185 |         raise ValueError("This gramtab format ({}) is unavailable; available formats: {}".format(gramtab_format,
186 |                                                                                                  gramtab_formats.keys()))
187 | 
188 |     gramtab_filename = os.path.join(path, gramtab_formats[gramtab_format])
189 |     return json_read(gramtab_filename)
190 | 
191 | 
192 | def _load_paradigms(filename):
193 |     """ Load paradigms data """
194 |     paradigms = []
195 |     with open(filename, 'rb') as f:
196 |         paradigms_count = struct.unpack("<H", f.read(2))[0]
197 | 
198 |         for x in range(paradigms_count):
199 |             paradigm_len = struct.unpack("<H", f.read(2))[0]
200 | 
201 |             para = array.array("H")
202 |             para.fromfile(f, paradigm_len)
203 | 
204 |             paradigms.append(para)
205 |     return paradigms
206 | 
207 | 
def _assert_format_is_compatible(meta, path):
    """
    Raise an exception if dictionary format is not compatible.

    Only the major part of ``meta['format_version']`` (everything before
    the first dot) is compared with the major part of
    ``CURRENT_FORMAT_VERSION``. A missing version is treated as '0.0'.

    Raises ValueError when the version has no dot in it or when the
    major versions differ. ``path`` is only used in the error message.
    """
    format_version = str(meta.get('format_version', '0.0'))

    if '.' not in format_version:
        raise ValueError('Invalid format_version: %s' % format_version)

    # split only on the first dot, so versions with more than two
    # components (e.g. "2.4.1") do not fail with an unpacking error
    major = format_version.split('.', 1)[0]
    curr_major = CURRENT_FORMAT_VERSION.split('.', 1)[0]

    if major != curr_major:
        msg = (f"Error loading dictionaries from {path}: "
               f"the format ('{format_version}') is not supported; "
               f"required format is '{curr_major}.x'.")
        raise ValueError(msg)
223 | 
224 | 


--------------------------------------------------------------------------------
/pymorphy2/opencorpora_dict/wrapper.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | 
  3 | from .storage import load_dict
  4 | 
  5 | logger = logging.getLogger(__name__)
  6 | 
  7 | 
  8 | class Dictionary:
  9 |     """
 10 |     OpenCorpora dictionary wrapper class.
 11 |     """
 12 | 
 13 |     def __init__(self, path):
 14 | 
 15 |         logger.info("Loading dictionaries from %s", path)
 16 | 
 17 |         self._data = load_dict(path)
 18 | 
 19 |         logger.info("format: %(format_version)s, revision: %(source_revision)s, updated: %(compiled_at)s",
 20 |                     self._data.meta)
 21 | 
 22 |         # attributes from opencorpora_dict.storage.LoadedDictionary
 23 |         self.paradigms = self._data.paradigms
 24 |         self.gramtab = self._data.gramtab
 25 |         self.paradigm_prefixes = self._data.paradigm_prefixes
 26 |         self.suffixes = self._data.suffixes
 27 |         self.words = self._data.words
 28 |         self.prediction_suffixes_dawgs = self._data.prediction_suffixes_dawgs
 29 |         self.meta = self._data.meta
 30 |         self.Tag = self._data.Tag
 31 |         self.lang = self.meta.get('language_code')
 32 | 
 33 |         # extra attributes
 34 |         self.path = path
 35 | 
 36 |     def build_tag_info(self, para_id, idx):
 37 |         """
 38 |         Return tag as a string.
 39 |         """
 40 |         paradigm = self.paradigms[para_id]
 41 |         tag_info_offset = len(paradigm) // 3
 42 |         tag_id = paradigm[tag_info_offset + idx]
 43 |         return self.gramtab[tag_id]
 44 | 
 45 |     def build_paradigm_info(self, para_id):
 46 |         """
 47 |         Return a list of
 48 | 
 49 |             (prefix, tag, suffix)
 50 | 
 51 |         tuples representing the paradigm.
 52 |         """
 53 |         paradigm = self.paradigms[para_id]
 54 |         paradigm_len = len(paradigm) // 3
 55 |         res = []
 56 |         for idx in range(paradigm_len):
 57 |             prefix_id = paradigm[paradigm_len*2 + idx]
 58 |             prefix = self.paradigm_prefixes[prefix_id]
 59 | 
 60 |             suffix_id = paradigm[idx]
 61 |             suffix = self.suffixes[suffix_id]
 62 | 
 63 |             res.append(
 64 |                 (prefix, self.build_tag_info(para_id, idx), suffix)
 65 |             )
 66 |         return res
 67 | 
 68 |     def build_normal_form(self, para_id, idx, fixed_word):
 69 |         """
 70 |         Build a normal form.
 71 |         """
 72 | 
 73 |         if idx == 0:  # a shortcut: normal form is a word itself
 74 |             return fixed_word
 75 | 
 76 |         paradigm = self.paradigms[para_id]
 77 |         paradigm_len = len(paradigm) // 3
 78 | 
 79 |         stem = self.build_stem(paradigm, idx, fixed_word)
 80 | 
 81 |         normal_prefix_id = paradigm[paradigm_len*2 + 0]
 82 |         normal_suffix_id = paradigm[0]
 83 | 
 84 |         normal_prefix = self.paradigm_prefixes[normal_prefix_id]
 85 |         normal_suffix = self.suffixes[normal_suffix_id]
 86 | 
 87 |         return normal_prefix + stem + normal_suffix
 88 | 
 89 |     def build_stem(self, paradigm, idx, fixed_word):
 90 |         """
 91 |         Return word stem (given a word, paradigm and the word index).
 92 |         """
 93 |         paradigm_len = len(paradigm) // 3
 94 | 
 95 |         prefix_id = paradigm[paradigm_len*2 + idx]
 96 |         prefix = self.paradigm_prefixes[prefix_id]
 97 | 
 98 |         suffix_id = paradigm[idx]
 99 |         suffix = self.suffixes[suffix_id]
100 | 
101 |         if suffix:
102 |             return fixed_word[len(prefix):-len(suffix)]
103 |         else:
104 |             return fixed_word[len(prefix):]
105 | 
106 |     def word_is_known(self, word, substitutes_compiled=None):
107 |         """
108 |         Check if a ``word`` is in the dictionary.
109 | 
110 |         To allow some fuzzyness pass ``substitutes_compiled`` argument;
111 |         it should be a result of :meth:`DAWG.compile_replaces()`.
112 |         This way you can e.g. handle ё letters replaced with е in the
113 |         input words.
114 | 
115 |         .. note::
116 | 
117 |             Dictionary words are not always correct words;
118 |             the dictionary also contains incorrect forms which
119 |             are commonly used. So for spellchecking tasks this
120 |             method should be used with extra care.
121 | 
122 |         """
123 |         if substitutes_compiled:
124 |             return bool(self.words.similar_keys(word, substitutes_compiled))
125 |         else:
126 |             return word in self.words
127 | 
128 |     def iter_known_words(self, prefix=""):
129 |         """
130 |         Return an iterator over ``(word, tag, normal_form, para_id, idx)``
131 |         tuples with dictionary words that starts with a given prefix
132 |         (default empty prefix means "all words").
133 |         """
134 | 
135 |         for word, (para_id, idx) in self.words.iteritems(prefix):  # .items() of DAWG returns a list
136 |             tag = self.build_tag_info(para_id, idx)
137 |             normal_form = self.build_normal_form(para_id, idx, word)
138 |             yield word, tag, normal_form, para_id, idx
139 | 
140 |     def __repr__(self):
141 |         return "<%s>" % self.__class__.__name__
142 | 


--------------------------------------------------------------------------------
/pymorphy2/shapes.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import unicodedata
  3 | import warnings
  4 | 
  5 | _latin_letters_cache = {}
  6 | 
  7 | 
  8 | def is_latin_char(uchr):
  9 |     try:
 10 |         return _latin_letters_cache[uchr]
 11 |     except KeyError:
 12 |         if isinstance(uchr, bytes):
 13 |             uchr = uchr.decode('ascii')
 14 |         is_latin = 'LATIN' in unicodedata.name(uchr)
 15 |         return _latin_letters_cache.setdefault(uchr, is_latin)
 16 | 
 17 | 
 18 | def is_latin(token):
 19 |     """
 20 |     Return True if all token letters are latin and there is at
 21 |     least one latin letter in the token:
 22 | 
 23 |         >>> is_latin('foo')
 24 |         True
 25 |         >>> is_latin('123-FOO')
 26 |         True
 27 |         >>> is_latin('123')
 28 |         False
 29 |         >>> is_latin(':)')
 30 |         False
 31 |         >>> is_latin('')
 32 |         False
 33 | 
 34 |     """
 35 |     return (
 36 |             any(ch.isalpha() for ch in token) and
 37 |             all(is_latin_char(ch) for ch in token if ch.isalpha())
 38 |     )
 39 | 
 40 | 
 41 | def is_punctuation(token):
 42 |     """
 43 |     Return True if a word contains only spaces and punctuation marks
 44 |     and there is at least one punctuation mark:
 45 | 
 46 |         >>> is_punctuation(', ')
 47 |         True
 48 |         >>> is_punctuation('..!')
 49 |         True
 50 |         >>> is_punctuation('x')
 51 |         False
 52 |         >>> is_punctuation(' ')
 53 |         False
 54 |         >>> is_punctuation('')
 55 |         False
 56 | 
 57 |     """
 58 |     if isinstance(token, bytes):  # python 2.x ascii str
 59 |         token = token.decode('ascii')
 60 | 
 61 |     return (
 62 |             bool(token) and
 63 |             not token.isspace() and
 64 |             all(unicodedata.category(ch)[0] == 'P' for ch in token if not ch.isspace())
 65 |     )
 66 | 
 67 | 
 68 | # The regex is from "Dive into Python" book.
 69 | ROMAN_NUMBERS_RE = re.compile("""
 70 |     M{0,4}              # thousands - 0 to 4 M's
 71 |     (CM|CD|D?C{0,3})    # hundreds - 900 (CM), 400 (CD), 0-300 (0 to 3 C's),
 72 |                         #            or 500-800 (D, followed by 0 to 3 C's)
 73 |     (XC|XL|L?X{0,3})    # tens - 90 (XC), 40 (XL), 0-30 (0 to 3 X's),
 74 |                         #        or 50-80 (L, followed by 0 to 3 X's)
 75 |     (IX|IV|V?I{0,3})    # ones - 9 (IX), 4 (IV), 0-3 (0 to 3 I's),
 76 |                         #        or 5-8 (V, followed by 0 to 3 I's)
 77 |     $                   # end of string
 78 | """, re.VERBOSE | re.IGNORECASE)
 79 | 
 80 | 
 81 | def is_roman_number(token, _match=ROMAN_NUMBERS_RE.match):
 82 |     """
 83 |     Return True if token looks like a Roman number:
 84 | 
 85 |         >>> is_roman_number('II')
 86 |         True
 87 |         >>> is_roman_number('IX')
 88 |         True
 89 |         >>> is_roman_number('XIIIII')
 90 |         False
 91 |         >>> is_roman_number('')
 92 |         False
 93 | 
 94 |     """
 95 |     if not token:
 96 |         return False
 97 |     return _match(token) is not None
 98 | 
 99 | 
def restore_capitalization(word, example):
    """
    Give ``word`` the same capitalization as ``example``:

        >>> restore_capitalization('bye', 'Hello')
        'Bye'
        >>> restore_capitalization('half-an-hour', 'Minute')
        'Half-An-Hour'
        >>> restore_capitalization('usa', 'IEEE')
        'USA'
        >>> restore_capitalization('pre-world', 'anti-World')
        'pre-World'
        >>> restore_capitalization('123-do', 'anti-IEEE')
        '123-DO'
        >>> restore_capitalization('123--do', 'anti--IEEE')
        '123--DO'

    When ``example`` contains hyphens, the parts of ``word`` and
    ``example`` are aligned one by one. If the alignment fails (``word``
    has more parts than ``example``), the remainder is lower-cased:

        >>> restore_capitalization('foo-BAR-BAZ', 'Baz-Baz')
        'Foo-Bar-baz'
        >>> restore_capitalization('foo', 'foo-bar')
        'foo'

    .. note::

        Currently this function doesn't handle uppercase letters in
        the middle of the token (e.g. McDonald).

    """
    if '-' not in example:
        # Single-part example: the whole word follows its case.
        return _make_the_same_case(word, example)

    example_parts = example.split('-')
    restored = []
    for position, chunk in enumerate(word.split('-')):
        if position < len(example_parts):
            restored.append(_make_the_same_case(chunk, example_parts[position]))
        else:
            # No example part left to copy the case from.
            restored.append(chunk.lower())

    return '-'.join(restored)
144 | 
145 | 
def restore_word_case(word, example):
    """
    Deprecated alias of ``restore_capitalization``.

    Emits a warning and then returns
    ``restore_capitalization(word, example)``.
    """
    warnings.warn(
        "`restore_word_case` function is renamed to `restore_capitalization`; "
        "old alias will be removed in future releases.",
        # Attribute the warning to the caller's line, not to this wrapper.
        stacklevel=2,
    )
    return restore_capitalization(word, example)
153 | 
154 | 
155 | def _make_the_same_case(word, example):
156 |     if example.islower():
157 |         return word.lower()
158 |     elif example.isupper():
159 |         return word.upper()
160 |     elif example.istitle():
161 |         return word.title()
162 |     else:
163 |         return word.lower()
164 | 


--------------------------------------------------------------------------------
/pymorphy2/tokenizers.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | GROUPING_SPACE_REGEX = re.compile(r'([^\w_-]|[+])', re.UNICODE)
 4 | 
 5 | 
 6 | def simple_word_tokenize(text, _split=GROUPING_SPACE_REGEX.split):
 7 |     """
 8 |     Split text into tokens. Don't split by a hyphen.
 9 |     Preserve punctuation, but not whitespaces.
10 |     """
11 |     return [t for t in _split(text) if t and not t.isspace()]
12 | 


--------------------------------------------------------------------------------
/pymorphy2/units/__init__.py:
--------------------------------------------------------------------------------
 1 | from .abbreviations import (
 2 |     AbbreviatedFirstNameAnalyzer,
 3 |     AbbreviatedPatronymicAnalyzer
 4 | )
 5 | from .by_analogy import (
 6 |     KnownPrefixAnalyzer,
 7 |     KnownSuffixAnalyzer,
 8 |     UnknownPrefixAnalyzer
 9 | )
10 | from .by_hyphen import (
11 |     HyphenatedWordsAnalyzer,
12 |     HyphenAdverbAnalyzer,
13 |     HyphenSeparatedParticleAnalyzer
14 | )
15 | from .by_lookup import DictionaryAnalyzer
16 | from .by_shape import (
17 |     LatinAnalyzer,
18 |     PunctuationAnalyzer,
19 |     NumberAnalyzer,
20 |     RomanNumberAnalyzer
21 | )
22 | from .unkn import UnknAnalyzer
23 | 


--------------------------------------------------------------------------------
/pymorphy2/units/abbreviations.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Analyzer units for abbreviated words
 3 | ------------------------------------
 4 | """
 5 | from pymorphy2.units.base import BaseAnalyzerUnit
 6 | 
 7 | 
 8 | class _InitialsAnalyzer(BaseAnalyzerUnit):
 9 |     def __init__(self, letters, tag_pattern=None, score=0.1):
10 |         if tag_pattern is None:
11 |             if hasattr(self, 'TAG_PATTERN'):
12 |                 tag_pattern = self.TAG_PATTERN
13 |             else:
14 |                 raise ValueError("Please provide tag_pattern.")
15 |         self.tag_pattern = tag_pattern
16 |         self.score = score
17 |         self.letters = letters
18 |         self._letters_set = set(letters)
19 | 
20 |     def init(self, morph):
21 |         super().init(morph)
22 |         self._init_grammemes(self.morph.TagClass)
23 |         self._tags = self._get_gender_case_tags(self.tag_pattern)
24 | 
25 |     def _init_grammemes(self, tag_cls):
26 |         tag_cls.add_grammemes_to_known('Init', 'иниц', overwrite=False)
27 | 
28 |     def _get_gender_case_tags(self, pattern):
29 |         return [
30 |             self.morph.TagClass(pattern % {'gender': gender, 'case': case})
31 |             for gender in ['masc', 'femn']
32 |             for case in ['nomn', 'gent', 'datv', 'accs', 'ablt', 'loct']
33 |         ]
34 | 
35 |     def parse(self, word, word_lower, seen_parses):
36 |         if word not in self._letters_set:
37 |             return []
38 |         return [
39 |             (word_lower, tag, word_lower, self.score, ((self, word),))
40 |             for tag in self._tags
41 |         ]
42 | 
43 |     def tag(self, word, word_lower, seen_tags):
44 |         if word not in self._letters_set:
45 |             return []
46 |         return self._tags[:]
47 | 
48 | 
class AbbreviatedFirstNameAnalyzer(_InitialsAnalyzer):
    """
    Initials unit whose tags carry the ``Name`` grammeme.

    Masculine and feminine readings are kept as two separate lexemes, so
    inflection and normalization never switch the gender of a parse.
    """
    TAG_PATTERN = 'NOUN,anim,%(gender)s,Sgtm,Name,Fixd,Abbr,Init sing,%(case)s'

    def init(self, morph):
        super().init(morph)
        self._tags_masc = [tag for tag in self._tags if 'masc' in tag]
        self._tags_femn = [tag for tag in self._tags if 'femn' in tag]
        # sanity check: the two gender groups partition self._tags in order
        assert self._tags_masc + self._tags_femn == self._tags

    def _init_grammemes(self, tag_cls):
        super()._init_grammemes(tag_cls)
        # Register on the class that was passed in (as the parent does)
        # instead of reaching for self.morph.TagClass again.
        tag_cls.add_grammemes_to_known('Name', 'имя', overwrite=False)

    def _same_gender_tags(self, form_tag):
        """ Return the tag list of the gender found in ``form_tag``. """
        return self._tags_masc if 'masc' in form_tag else self._tags_femn

    def get_lexeme(self, form):
        # 2 lexemes: masc and femn
        fixed_word, form_tag, normal_form, score, methods_stack = form
        return [
            (fixed_word, tag, normal_form, score, methods_stack)
            for tag in self._same_gender_tags(form_tag)
        ]

    def normalized(self, form):
        # don't normalize female names to male names
        fixed_word, form_tag, normal_form, score, methods_stack = form
        first_tag = self._same_gender_tags(form_tag)[0]
        return fixed_word, first_tag, normal_form, score, methods_stack
76 | 
77 | 
class AbbreviatedPatronymicAnalyzer(_InitialsAnalyzer):
    """
    Initials unit whose tags carry the ``Patr`` grammeme.

    All gender/case tags belong to a single lexeme; the first tag of the
    list is used as the normalized form's tag.
    """
    TAG_PATTERN = 'NOUN,anim,%(gender)s,Sgtm,Patr,Fixd,Abbr,Init sing,%(case)s'

    def _init_grammemes(self, tag_cls):
        super()._init_grammemes(tag_cls)
        # Register on the class that was passed in (as the parent does)
        # instead of reaching for self.morph.TagClass again.
        tag_cls.add_grammemes_to_known('Patr', 'отч', overwrite=False)

    def get_lexeme(self, form):
        fixed_word, _, normal_form, score, methods_stack = form
        return [
            (fixed_word, tag, normal_form, score, methods_stack)
            for tag in self._tags
        ]

    def normalized(self, form):
        fixed_word, _, normal_form, score, methods_stack = form
        return fixed_word, self._tags[0], normal_form, score, methods_stack
95 | 


--------------------------------------------------------------------------------
/pymorphy2/units/base.py:
--------------------------------------------------------------------------------
  1 | from inspect import getfullargspec
  2 | 
  3 | from pymorphy2.units.utils import (
  4 |     without_last_method,
  5 |     append_method,
  6 |     add_tag_if_not_seen,
  7 | )
  8 | from pymorphy2.utils import kwargs_repr
  9 | 
 10 | 
 11 | def inspect_args(func):
 12 |     return getfullargspec(func).args
 13 | 
 14 | 
 15 | class BaseAnalyzerUnit:
 16 |     """
 17 |     Base class for analyzer units.
 18 | 
 19 |     For parsing to work subclasses must implement `parse` method;
 20 |     as an optimization they may also override `tag` method.
 21 | 
 22 |     For inflection to work (this includes normalization) a subclass
 23 |     must implement `normalized` and `get_lexeme` methods.
 24 | 
 25 |     In __init__ method all parameters must be saved as instance variables
 26 |     for analyzer unit to work.
 27 |     """
 28 |     morph = None
 29 |     dict = None
 30 |     _repr_skip_value_params = None
 31 | 
 32 |     def init(self, morph):
 33 |         self.morph = morph
 34 |         self.dict = morph.dictionary
 35 | 
 36 |     def clone(self):
 37 |         return self.__class__(**self._get_params())
 38 | 
 39 |     def parse(self, word, word_lower, seen_parses):
 40 |         raise NotImplementedError()
 41 | 
 42 |     def tag(self, word, word_lower, seen_tags):
 43 |         # By default .tag() uses .parse().
 44 |         # Usually it is possible to write a more efficient implementation;
 45 |         # analyzers should do it when possible.
 46 |         result = []
 47 |         for p in self.parse(word, word_lower, set()):
 48 |             add_tag_if_not_seen(p[1], result, seen_tags)
 49 |         return result
 50 | 
 51 |     def normalized(self, form):
 52 |         raise NotImplementedError()
 53 | 
 54 |     def get_lexeme(self, form):
 55 |         raise NotImplementedError()
 56 | 
 57 |     def __repr__(self):
 58 |         cls_text = self.__class__.__name__
 59 |         kwargs_text = kwargs_repr(self._get_params(),
 60 |                                   self._repr_skip_value_params)
 61 |         return f"{cls_text}({kwargs_text})"
 62 | 
 63 |     @classmethod
 64 |     def _get_param_names(cls):
 65 |         """
 66 |         Get parameter names for the analyzer unit.
 67 |         It works by introspecting `__init__` arguments.
 68 |         `__init__` method must not use *args.
 69 |         """
 70 |         if cls.__init__ is object.__init__:
 71 |             return []
 72 |         args = inspect_args(cls.__init__)
 73 |         return sorted(args[1:])
 74 | 
 75 |     def _get_params(self):
 76 |         """ Return a dict with the parameters for this analyzer unit. """
 77 |         return dict(
 78 |             (key, getattr(self, key, None)) for key in self._get_param_names()
 79 |         )
 80 | 
 81 | 
 82 | class AnalogyAnalyzerUnit(BaseAnalyzerUnit):
 83 | 
 84 |     def normalized(self, form):
 85 |         base_analyzer, this_method = self._method_info(form)
 86 |         return self._normalized(form, base_analyzer, this_method)
 87 | 
 88 |     def _normalized(self, form, base_analyzer, this_method):
 89 |         normalizer = self.normalizer(form, this_method)
 90 | 
 91 |         form = without_last_method(next(normalizer))
 92 |         normal_form = normalizer.send(base_analyzer.normalized(form))
 93 |         return append_method(normal_form, this_method)
 94 | 
 95 |     def get_lexeme(self, form):
 96 |         base_analyzer, this_method = self._method_info(form)
 97 |         return self._get_lexeme(form, base_analyzer, this_method)
 98 | 
 99 |     def _get_lexeme(self, form, base_analyzer, this_method):
100 |         lexemizer = self.lexemizer(form, this_method)
101 |         form = without_last_method(next(lexemizer))
102 |         lexeme = lexemizer.send(base_analyzer.get_lexeme(form))
103 |         return [append_method(f, this_method) for f in lexeme]
104 | 
105 |     def normalizer(self, form, this_method):
106 |         """ A coroutine for normalization """
107 | 
108 |         # 1. undecorate form:
109 |         # form = undecorate(form)
110 | 
111 |         # 2. get normalized version of undecorated form:
112 |         normal_form = yield form
113 | 
114 |         # 3. decorate the normalized version:
115 |         # normal_form = decorate(normal_form)
116 | 
117 |         # 4. return the result
118 |         yield normal_form
119 | 
120 |     def lexemizer(self, form, this_method):
121 |         """ A coroutine for preparing lexemes """
122 |         lexeme = yield form
123 |         yield lexeme
124 | 
125 |     def _method_info(self, form):
126 |         methods_stack = form[4]
127 |         base_method, this_method = methods_stack[-2:]
128 |         base_analyzer = base_method[0]
129 |         return base_analyzer, this_method
130 | 


--------------------------------------------------------------------------------
/pymorphy2/units/by_lookup.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Dictionary analyzer unit
  3 | ------------------------
  4 | """
  5 | import logging
  6 | 
  7 | from pymorphy2.units.base import BaseAnalyzerUnit
  8 | 
  9 | logger = logging.getLogger(__name__)
 10 | 
 11 | 
class DictionaryAnalyzer(BaseAnalyzerUnit):
    """
    Analyzer unit that analyzes word using dictionary.

    Every parse it produces has score 1.0 and a one-element methods stack
    of the form ``((self, word, para_id, idx),)``.
    """

    def parse(self, word, word_lower, seen_parses):
        """
        Parse a word using this dictionary.

        Looks ``word_lower`` up (allowing ``morph.char_substitutes``
        replacements) and returns a list of
        ``(fixed_word, tag, normal_form, score, methods_stack)`` tuples,
        one per ``(para_id, idx)`` entry found. ``word`` and
        ``seen_parses`` are not used here.
        """
        res = []
        para_data = self.dict.words.similar_items(word_lower, self.morph.char_substitutes)

        for fixed_word, parses in para_data:
            # `fixed_word` is a word with proper substitute (e.g. ё) letters

            for para_id, idx in parses:
                normal_form = self.dict.build_normal_form(para_id, idx, fixed_word)
                tag = self.dict.build_tag_info(para_id, idx)
                method = ((self, fixed_word, para_id, idx),)
                res.append((fixed_word, tag, normal_form, 1.0, method))

        # res.sort(key=lambda p: len(p[1]))  #  prefer simple parses
        return res

    def tag(self, word, word_lower, seen_tags):
        """
        Tag a word using this dictionary.

        Returns the list of tags for every ``(para_id, idx)`` entry found
        for ``word_lower``. ``word`` and ``seen_tags`` are not used, so
        the result may contain duplicates.
        """
        para_data = self.dict.words.similar_item_values(word_lower, self.morph.char_substitutes)

        # avoid extra attribute lookups
        paradigms = self.dict.paradigms
        gramtab = self.dict.gramtab

        # tag known word
        result = []
        for parse in para_data:
            for para_id, idx in parse:
                # result.append(self.build_tag_info(para_id, idx))
                # .build_tag_info is unrolled for speed
                paradigm = paradigms[para_id]
                # A paradigm is a flat sequence of three equally long
                # sections; the tag id of form `idx` sits in the second one.
                paradigm_len = len(paradigm) // 3
                tag_id = paradigm[paradigm_len + idx]
                result.append(gramtab[tag_id])

        return result

    def get_lexeme(self, form):
        """
        Return a lexeme (given a parsed word).

        The stem is rebuilt from ``fixed_word`` and the form's paradigm
        position; then one parse is produced for each
        ``(prefix, tag, suffix)`` entry of the paradigm, in paradigm order.
        """
        fixed_word, tag, normal_form, score, methods_stack = form
        _, para_id, idx = self._extract_para_info(methods_stack)

        _para = self.dict.paradigms[para_id]
        stem = self.dict.build_stem(_para, idx, fixed_word)

        result = []
        paradigm = self.dict.build_paradigm_info(para_id)  # XXX: reuse _para?

        for index, (_prefix, _tag, _suffix) in enumerate(paradigm):
            word = _prefix + stem + _suffix
            # record the new word and its index in the first stack entry
            new_methods_stack = self._fix_stack(methods_stack, word, para_id, index)
            parse = (word, _tag, normal_form, 1.0, new_methods_stack)
            result.append(parse)

        return result

    def normalized(self, form):
        """
        Return the parse of the form's normal form.

        A form that already has paradigm index 0 is returned unchanged;
        otherwise a new parse is built from ``normal_form`` and the tag
        at index 0 of the same paradigm.
        """
        fixed_word, tag, normal_form, score, methods_stack = form
        original_word, para_id, idx = self._extract_para_info(methods_stack)

        # NOTE(review): index 0 is treated as the normal form's position
        # in the paradigm - confirm against the dictionary format.
        if idx == 0:
            return form

        tag = self.dict.build_tag_info(para_id, 0)
        new_methods_stack = self._fix_stack(methods_stack, normal_form, para_id, 0)

        return normal_form, tag, normal_form, 1.0, new_methods_stack

    def _extract_para_info(self, methods_stack):
        """ Return ``(word, para_id, idx)`` from the first stack entry. """
        # This method assumes that DictionaryAnalyzer is the first
        # and the only method in methods_stack.
        analyzer, original_word, para_id, idx = methods_stack[0]
        assert analyzer is self
        return original_word, para_id, idx

    def _fix_stack(self, methods_stack, word, para_id, idx):
        """
        Return a copy of ``methods_stack`` whose first entry is replaced
        with ``(self, word, para_id, idx)``; the other entries are kept.
        """
        method0 = self, word, para_id, idx
        return (method0,) + methods_stack[1:]
102 | 


--------------------------------------------------------------------------------
/pymorphy2/units/by_shape.py:
--------------------------------------------------------------------------------
  1 | """
Analyzer units that analyze non-word tokens
  3 | -------------------------------------------
  4 | """
  5 | from pymorphy2.shapes import is_latin, is_punctuation, is_roman_number
  6 | from pymorphy2.units.base import BaseAnalyzerUnit
  7 | 
  8 | 
  9 | class _ShapeAnalyzer(BaseAnalyzerUnit):
 10 |     EXTRA_GRAMMEMES = []
 11 |     EXTRA_GRAMMEMES_CYR = []
 12 | 
 13 |     def __init__(self, score=0.9):
 14 |         self.score = score
 15 | 
 16 |     def init(self, morph):
 17 |         super().init(morph)
 18 | 
 19 |         for lat, cyr in zip(self.EXTRA_GRAMMEMES, self.EXTRA_GRAMMEMES_CYR):
 20 |             self.morph.TagClass.add_grammemes_to_known(lat, cyr)
 21 | 
 22 |     def parse(self, word, word_lower, seen_parses):
 23 |         shape = self.check_shape(word, word_lower)
 24 |         if not shape:
 25 |             return []
 26 | 
 27 |         methods = ((self, word),)
 28 |         return [(word_lower, self.get_tag(word, shape), word_lower, self.score, methods)]
 29 | 
 30 |     def tag(self, word, word_lower, seen_tags):
 31 |         shape = self.check_shape(word, word_lower)
 32 |         if not shape:
 33 |             return []
 34 |         return [self.get_tag(word, shape)]
 35 | 
 36 |     def get_lexeme(self, form):
 37 |         return [form]
 38 | 
 39 |     def normalized(self, form):
 40 |         return form
 41 | 
 42 |     # implement these 2 methods in a subclass:
 43 |     def check_shape(self, word, word_lower):
 44 |         raise NotImplementedError()
 45 | 
 46 |     def get_tag(self, word, shape):
 47 |         raise NotImplementedError()
 48 | 
 49 | 
 50 | class _SingleShapeAnalyzer(_ShapeAnalyzer):
 51 |     TAG_STR = None
 52 |     TAG_STR_CYR = None
 53 | 
 54 |     def init(self, morph):
 55 |         assert self.TAG_STR is not None
 56 |         assert self.TAG_STR_CYR is not None
 57 |         self.EXTRA_GRAMMEMES = self.TAG_STR.split(',')
 58 |         self.EXTRA_GRAMMEMES_CYR = self.TAG_STR_CYR.split(',')
 59 |         super().init(morph)
 60 |         self._tag = self.morph.TagClass(self.TAG_STR)
 61 | 
 62 |     def get_tag(self, word, shape):
 63 |         return self._tag
 64 | 
 65 | 
 66 | class PunctuationAnalyzer(_SingleShapeAnalyzer):
 67 |     """
 68 |     This analyzer tags punctuation marks as "PNCT".
 69 |     Example: "," -> PNCT
 70 |     """
 71 |     TAG_STR = 'PNCT'
 72 |     TAG_STR_CYR = 'ЗПР'  # aot.ru uses this name
 73 | 
 74 |     def check_shape(self, word, word_lower):
 75 |         return is_punctuation(word)
 76 | 
 77 | 
 78 | class LatinAnalyzer(_SingleShapeAnalyzer):
 79 |     """
 80 |     This analyzer marks latin words with "LATN" tag.
 81 |     Example: "pdf" -> LATN
 82 |     """
 83 |     TAG_STR = 'LATN'
 84 |     TAG_STR_CYR = 'ЛАТ'
 85 | 
 86 |     def check_shape(self, word, word_lower):
 87 |         return is_latin(word)
 88 | 
 89 | 
 90 | class NumberAnalyzer(_ShapeAnalyzer):
 91 |     """
 92 |     This analyzer marks integer numbers with "NUMB,int" or "NUMB,real" tags.
 93 |     Example: "12" -> NUMB,int; "12.4" -> NUMB,real
 94 | 
 95 |     .. note::
 96 | 
 97 |         Don't confuse it with "NUMR": "тридцать" -> NUMR
 98 | 
 99 |     """
100 |     EXTRA_GRAMMEMES = ['NUMB', 'intg', 'real']
101 |     EXTRA_GRAMMEMES_CYR = ['ЧИСЛО', 'цел', 'вещ']
102 | 
103 |     def init(self, morph):
104 |         super().init(morph)
105 |         self._tags = {
106 |             'intg': morph.TagClass('NUMB,intg'),
107 |             'real': morph.TagClass('NUMB,real'),
108 |         }
109 | 
110 |     def check_shape(self, word, word_lower):
111 |         try:
112 |             int(word)
113 |             return 'intg'
114 |         except ValueError:
115 |             try:
116 |                 float(word.replace(',', '.'))
117 |                 return 'real'
118 |             except ValueError:
119 |                 pass
120 |         return False
121 | 
122 |     def get_tag(self, word, shape):
123 |         return self._tags[shape]
124 | 
125 | 
class RomanNumberAnalyzer(_SingleShapeAnalyzer):
    """
    Tags tokens that look like Roman numbers as "ROMN".
    Example: "XIV" -> ROMN
    """
    TAG_STR = 'ROMN'
    TAG_STR_CYR = 'РИМ'

    def check_shape(self, word, word_lower):
        """ True when the original token passes ``is_roman_number``. """
        looks_roman = is_roman_number(word)
        return looks_roman
132 | 


--------------------------------------------------------------------------------
/pymorphy2/units/unkn.py:
--------------------------------------------------------------------------------
 1 | from pymorphy2.units.base import BaseAnalyzerUnit
 2 | 
 3 | 
 4 | class UnknAnalyzer(BaseAnalyzerUnit):
 5 |     """
 6 |     Add an UNKN parse if other analyzers returned nothing.
 7 |     This allows to always have at least one parse result.
 8 |     """
 9 |     def init(self, morph):
10 |         super().init(morph)
11 |         self.morph.TagClass.add_grammemes_to_known('UNKN', 'НЕИЗВ')
12 |         self._tag = self.morph.TagClass('UNKN')
13 | 
14 |     def parse(self, word, word_lower, seen_parses):
15 |         if seen_parses:
16 |             return []
17 | 
18 |         methods = ((self, word),)
19 |         return [(word_lower, self._tag, word_lower, 1.0, methods)]
20 | 
21 |     def tag(self, word, word_lower, seen_tags):
22 |         if seen_tags:
23 |             return []
24 |         return [self._tag]
25 | 
26 |     def get_lexeme(self, form):
27 |         return [form]
28 | 
29 |     def normalized(self, form):
30 |         return form
31 | 


--------------------------------------------------------------------------------
/pymorphy2/units/utils.py:
--------------------------------------------------------------------------------
 1 | def add_parse_if_not_seen(parse, result_list, seen_parses):
 2 |     try:
 3 |         para_id = parse[4][0][2]
 4 |     except IndexError:
 5 |         para_id = None
 6 | 
 7 |     word = parse[0]
 8 |     tag = parse[1]
 9 | 
10 |     reduced_parse = word, tag, para_id
11 | 
12 |     if reduced_parse in seen_parses:
13 |         return
14 |     seen_parses.add(reduced_parse)
15 |     result_list.append(parse)
16 | 
17 | 
def add_tag_if_not_seen(tag, result_list, seen_tags):
    """
    Append ``tag`` to ``result_list`` and remember it in the ``seen_tags``
    set, unless the set already contains it.
    """
    if tag not in seen_tags:
        seen_tags.add(tag)
        result_list.append(tag)
23 | 
24 | 
def with_suffix(form, suffix):
    """ Build a copy of ``form`` whose word and normal form end with ``suffix`` """
    word, tag, normal_form, score, methods_stack = form
    suffixed_word = word + suffix
    suffixed_normal_form = normal_form + suffix
    return suffixed_word, tag, suffixed_normal_form, score, methods_stack
29 | 
30 | 
def without_fixed_suffix(form, suffix_length):
    """
    Return a new form with ``suffix_length`` chars removed from right
    (both from the word and from the normal form).

    A ``suffix_length`` of 0 leaves the word and the normal form intact.
    """
    word, tag, normal_form, score, methods_stack = form
    if not suffix_length:
        # word[:-0] is word[:0], i.e. '' - nothing must be cut here
        return word, tag, normal_form, score, methods_stack
    return (word[:-suffix_length], tag, normal_form[:-suffix_length],
            score, methods_stack)
36 | 
37 | 
def without_fixed_prefix(form, prefix_length):
    """
    Build a copy of ``form`` whose word and normal form have lost their
    first ``prefix_length`` characters
    """
    word, tag, normal_form, score, methods_stack = form
    trimmed_word = word[prefix_length:]
    trimmed_normal_form = normal_form[prefix_length:]
    return trimmed_word, tag, trimmed_normal_form, score, methods_stack
43 | 
44 | 
def with_prefix(form, prefix):
    """ Build a copy of ``form`` whose word and normal form start with ``prefix`` """
    word, tag, normal_form, score, methods_stack = form
    prefixed_word = prefix + word
    prefixed_normal_form = prefix + normal_form
    return prefixed_word, tag, prefixed_normal_form, score, methods_stack
49 | 
50 | 
def replace_methods_stack(form, new_methods_stack):
    """
    Build a copy of ``form`` that keeps the first four items and uses
    ``new_methods_stack`` as its ``methods_stack``
    """
    head = form[:4]
    return head + (new_methods_stack,)
57 | 
58 | 
def without_last_method(form):
    """ Build a copy of ``form`` whose methods_stack lacks its last entry """
    head, methods_stack = form[:4], form[4]
    return head + (methods_stack[:-1],)
63 | 
64 | 
def append_method(form, method):
    """ Build a copy of ``form`` with ``method`` pushed onto its methods_stack """
    extended_stack = form[4] + (method,)
    return form[:4] + (extended_stack,)
69 | 


--------------------------------------------------------------------------------
/pymorphy2/utils.py:
--------------------------------------------------------------------------------
  1 | import codecs
  2 | import heapq
  3 | import itertools
  4 | import json
  5 | import os
  6 | 
  7 | 
  8 | def get_mem_usage():
  9 |     """
 10 |     Return memory usage of the current process, in bytes.
 11 |     Requires psutil Python package.
 12 |     """
 13 |     import psutil
 14 |     proc = psutil.Process(os.getpid())
 15 |     return proc.memory_info().rss
 16 | 
 17 | 
 18 | def combinations_of_all_lengths(it):
 19 |     """
 20 |     Return an iterable with all possible combinations of items from ``it``:
 21 | 
 22 |         >>> for comb in combinations_of_all_lengths('ABC'):
 23 |         ...     print("".join(comb))
 24 |         A
 25 |         B
 26 |         C
 27 |         AB
 28 |         AC
 29 |         BC
 30 |         ABC
 31 | 
 32 |     """
 33 |     return itertools.chain(
 34 |         *(itertools.combinations(it, num+1) for num in range(len(it)))
 35 |     )
 36 | 
 37 | 
 38 | def longest_common_substring(data):
 39 |     """
 40 |     Return a longest common substring of a list of strings:
 41 | 
 42 |         >>> longest_common_substring(["apricot", "rice", "cricket"])
 43 |         'ric'
 44 |         >>> longest_common_substring(["apricot", "banana"])
 45 |         'a'
 46 |         >>> longest_common_substring(["foo", "bar", "baz"])
 47 |         ''
 48 |         >>> longest_common_substring(["", "foo"])
 49 |         ''
 50 |         >>> longest_common_substring(["apricot"])
 51 |         'apricot'
 52 |         >>> longest_common_substring([])
 53 |         ''
 54 | 
 55 |     See http://stackoverflow.com/questions/2892931/.
 56 |     """
 57 |     if len(data) == 1:
 58 |         return data[0]
 59 |     if not data or len(data[0]) == 0:
 60 |         return ''
 61 |     substr = ''
 62 |     for i in range(len(data[0])):
 63 |         for j in range(len(data[0])-i+1):
 64 |             if j > len(substr) and all(data[0][i:i+j] in x for x in data):
 65 |                 substr = data[0][i:i+j]
 66 |     return substr
 67 | 
 68 | 
 69 | def json_write(filename, obj, **json_options):
 70 |     """ Create file ``filename`` with ``obj`` serialized to JSON """
 71 | 
 72 |     json_options.setdefault('ensure_ascii', False)
 73 |     json_options.setdefault('indent', 2)
 74 |     with codecs.open(filename, 'w', 'utf8') as f:
 75 |         json.dump(obj, f, **json_options)
 76 | 
 77 | 
 78 | def json_read(filename, **json_options):
 79 |     """ Read an object from a json file ``filename`` """
 80 |     with codecs.open(filename, 'r', 'utf8') as f:
 81 |         return json.load(f, **json_options)
 82 | 
 83 | 
 84 | def largest_elements(iterable, key, n=1):
 85 |     """
 86 |     Return a list of large elements of the ``iterable``
 87 |     (according to ``key`` function).
 88 | 
 89 |     ``n`` is a number of top element values to consider; when n==1
 90 |     (default) only largest elements are returned; when n==2 - elements
 91 |     with one of the top-2 values, etc.
 92 | 
 93 |     >>> s = [-4, 3, 5, 7, 4, -7]
 94 |     >>> largest_elements(s, abs)
 95 |     [7, -7]
 96 |     >>> largest_elements(s, abs, 2)
 97 |     [5, 7, -7]
 98 |     >>> largest_elements(s, abs, 3)
 99 |     [-4, 5, 7, 4, -7]
100 | 
101 |     """
102 |     it1, it2 = itertools.tee(iterable)
103 |     top_keys = set(heapq.nlargest(n, set(map(key, it1))))
104 |     return [el for el in it2 if key(el) in top_keys]
105 | 
106 | 
def word_splits(word, min_reminder=3, max_prefix_length=5):
    """
    List the ``(prefix, remainder)`` splits of ``word``, shortest prefix
    first. A prefix has from 1 to ``max_prefix_length`` characters and
    the remainder keeps at least ``min_reminder`` characters.
    """
    longest_prefix = min(max_prefix_length, len(word) - min_reminder)
    splits = []
    for cut in range(1, longest_prefix + 1):
        splits.append((word[:cut], word[cut:]))
    return splits
115 | 
116 | 
def kwargs_repr(kwargs=None, dont_show_value=None):
    """
    Format ``kwargs`` as a ``name=value`` list sorted by name; values of
    the names listed in ``dont_show_value`` are replaced by a placeholder.

    >>> kwargs_repr(dict(foo="123", a=5, x=8))
    "a=5, foo='123', x=8"
    >>> kwargs_repr(dict(foo="123", a=5, x=8), dont_show_value=['foo'])
    'a=5, foo=<...>, x=8'
    >>> kwargs_repr()
    ''
    """
    hidden = set(dont_show_value or [])
    pieces = []
    for name, value in sorted((kwargs or {}).items()):
        shown = "<...>" if name in hidden else repr(value)
        pieces.append(f"{name}={shown}")
    return ", ".join(pieces)
132 | 
133 | 
def with_progress(iterable, desc=None, total=None, leave=True):
    """
    Wrap ``iterable`` into an iterator that reports the iteration
    progress through the tqdm package.
    When tqdm can't be imported, ``iterable`` itself is returned.
    """
    try:
        from tqdm import tqdm
    except ImportError:
        return iterable

    # workarounds for tqdm bugs
    def _tracked(items, desc, total, leave):
        if total is None:
            try:
                total = len(items)
            except Exception:
                # no usable length (e.g. a generator)
                total = 0
        yield from tqdm(items, desc=desc, total=total, leave=leave)
        if leave:
            print("")

    return _tracked(iterable, desc, total, leave)
157 | 


--------------------------------------------------------------------------------
/pymorphy2/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.9.1"
2 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal=0
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import platform
 3 | 
 4 | from setuptools import setup
 5 | 
 6 | 
 7 | # from Cython.Build import cythonize
 8 | 
 9 | 
def get_version():
    """
    Return the package version declared in ``pymorphy2/version.py``.

    The file is scanned for the ``__version__ = "..."`` assignment instead
    of assuming it is the very first line, so a leading comment or blank
    line does not break the build. Both single- and double-quoted values
    are accepted and a trailing ``#`` comment is ignored.

    Raises RuntimeError if no ``__version__`` assignment is found.
    """
    with open("pymorphy2/version.py", encoding="utf-8") as f:
        for line in f:
            name, sep, value = line.partition("=")
            if sep and name.strip() == "__version__":
                # Drop an inline comment, then surrounding blanks and quotes.
                return value.split("#", 1)[0].strip(" \t\r\n\"'")
    raise RuntimeError("__version__ is not defined in pymorphy2/version.py")
13 | 
14 | 
15 | # TODO: use environment markers instead of Python code in order to
16 | # allow building proper wheels. Markers are not enabled right now because
17 | # of setuptools/wheel incompatibilities and the 'pip >= 6.0' requirement.
18 | 
19 | # extras_require = {
20 | #     'fast:platform_python_implementation==CPython': ["DAWG>=0.7.7"],
21 | # }
22 | 
is_cpython = platform.python_implementation() == 'CPython'


# Requirements installed unconditionally with the package.
install_requires = [
    'dawg2-python >= 0.8.0',
    'pymorphy2-dicts-ru >=2.4, <3.0',
    'docopt-ng >= 0.6',
    'setuptools >= 68.2.2 ; python_version >= "3.12"',
]

# The "fast" extra lists DAWG2 only when running on CPython; on other
# implementations the extra exists but is empty.
extras_require = {
    'fast': ["DAWG2 >= 0.9.0, < 1.0.0"] if is_cpython else [],
}
36 | 
# Read the long description up front so that the file handle is closed
# deterministically and the text is decoded as UTF-8 regardless of the
# locale's default encoding (assumes README.rst is UTF-8 - TODO confirm).
with open('README.rst', encoding='utf-8') as readme_file:
    long_description = readme_file.read()

setup(
    name='pymorphy2',
    version=get_version(),
    author='Mikhail Korobov',
    author_email='kmike84@gmail.com',
    url='https://github.com/kmike/pymorphy2/',

    description='Morphological analyzer (POS tagger + inflection engine) for Russian language.',
    long_description=long_description,

    license='MIT license',
    packages=[
        'pymorphy2',
        'pymorphy2.units',
        'pymorphy2.lang',
        'pymorphy2.lang.ru',
        'pymorphy2.lang.uk',
        'pymorphy2.opencorpora_dict',
    ],
    entry_points={
        'console_scripts': ['pymorphy = pymorphy2.cli:main']
    },
    install_requires=install_requires,
    extras_require=extras_require,
    zip_safe=False,

    # ext_modules=cythonize([
    #     'pymorphy2/*.py',
    #     'pymorphy2/units/*.py',
    #     'pymorphy2/opencorpora_dict/*.py',
    # ], annotate=True, profile=True),

    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: MIT License',
        'Natural Language :: Russian',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
        'Programming Language :: Python :: 3.12',
        'Programming Language :: Python :: Implementation :: CPython',
        'Programming Language :: Python :: Implementation :: PyPy',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'Topic :: Text Processing :: Linguistic',
    ],
)
88 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | 
 4 | @pytest.fixture(scope='session')
 5 | def morph():
 6 |     import pymorphy2
 7 |     return pymorphy2.MorphAnalyzer()
 8 | 
 9 | 
@pytest.fixture(scope='session')
def Tag(morph):
    """Session-scoped fixture exposing ``morph.TagClass``."""
    return morph.TagClass
13 | 


--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | import docopt
 4 | import pytest
 5 | 
 6 | from pymorphy2 import cli
 7 | 
 8 | 
def run_pymorphy2(args=(), stdin=None):
    """
    Run the pymorphy2 command-line entry point with ``args``.

    ``stdin`` is accepted but is not used by this helper.
    """
    cli.main(args)
11 | 
12 | 
def test_show_usage():
    """Running the CLI without arguments exits with the usage message."""
    with pytest.raises(docopt.DocoptExit) as exc_info:
        run_pymorphy2([])
    message = str(exc_info.value)
    assert 'Usage:' in message
17 | 
18 | 
def test_show_memory_usage(capsys):
    """``dict mem_usage`` reports memory usage (skipped without psutil)."""
    pytest.importorskip("psutil")

    run_pymorphy2(['dict', 'mem_usage'])
    captured = capsys.readouterr()
    # stdout and stderr are checked together
    combined = ' '.join(captured)
    assert 'Memory usage:' in combined
25 | 
26 | 
def test_show_dict_meta(capsys, morph):
    """``dict meta`` output includes the dictionary's ``compiled_at`` value."""
    compiled_at = morph.dictionary.meta['compiled_at']
    run_pymorphy2(['dict', 'meta'])
    captured = capsys.readouterr()
    # stdout and stderr are checked together
    combined = ' '.join(captured)
    assert compiled_at in combined
32 | 
33 | 
def test_parse_basic(tmpdir, capsys):
    """
    ``parse <file>`` prints one analysis line per word of the input file.

    ``logging.raiseExceptions`` is switched off while the CLI runs and is
    put back to whatever value it had before the test, so the test does
    not leak a changed logging configuration into other tests.
    """
    previous_raise_exceptions = logging.raiseExceptions
    logging.raiseExceptions = False
    try:
        p = tmpdir.join('words.txt')
        p.write_text("""
        крот пришел
        """, encoding='utf8')
        run_pymorphy2(["parse", str(p)])
        out, err = capsys.readouterr()
        # Echo the captured streams; pytest shows them if the assert fails.
        print(out)
        print(err)
        assert out.strip() == """
крот{крот:1.000=NOUN,anim,masc sing,nomn}
пришел{прийти:1.000=VERB,perf,intr masc,sing,past,indc}
        """.strip()
    finally:
        logging.raiseExceptions = previous_raise_exceptions
51 | 


--------------------------------------------------------------------------------
/tests/test_dict_loading.py:
--------------------------------------------------------------------------------
 1 | from importlib.util import find_spec
 2 | 
 3 | import pytest
 4 | 
 5 | import pymorphy2
 6 | from pymorphy2.analyzer import lang_dict_path
 7 | 
 8 | 
 9 | def test_old_dictionaries_supported():
10 |     pytest.importorskip("pymorphy2_dicts")
11 |     m = pymorphy2.MorphAnalyzer(lang='ru-old')
12 |     assert m.lang == 'ru-old'
13 |     assert m.tag('стиль')[0].POS == 'NOUN'
14 | 
15 | 
def test_old_dictionaries_not_installed():
    """Requesting 'ru-old' without ``pymorphy2_dicts`` raises ValueError."""
    dicts_spec = find_spec("pymorphy2_dicts")
    if dicts_spec:
        pytest.skip("pymorphy2_dicts package is installed")

    with pytest.raises(ValueError):
        pymorphy2.MorphAnalyzer(lang='ru-old')
22 | 
23 | 
def test_old_dictionaries_supported_by_path():
    """An analyzer built from the ``pymorphy2_dicts`` path reports lang 'ru'."""
    old_dicts = pytest.importorskip("pymorphy2_dicts")
    dict_path = old_dicts.get_path()
    analyzer = pymorphy2.MorphAnalyzer(dict_path)
    assert analyzer.lang == 'ru'
    first_tag = analyzer.tag('стиль')[0]
    assert first_tag.POS == 'NOUN'
29 | 
30 | 
def test_morph_analyzer_bad_path():
    """A dictionary path that does not exist raises IOError."""
    missing_path = "/sdfgsd/gdsfgsdfg/dfgdsfg/dsfgsdfg/as"
    with pytest.raises(IOError):
        pymorphy2.MorphAnalyzer(missing_path)
34 | 
35 | 
def test_language_from_dict():
    """Without an explicit ``lang`` the analyzer reports 'ru' for the 'ru' dictionary."""
    analyzer = pymorphy2.MorphAnalyzer(path=lang_dict_path('ru'))
    assert analyzer.lang == 'ru'
40 | 
41 | 
def test_bad_language():
    """An unsupported language code raises ValueError."""
    unsupported = 'something-unsupported'
    with pytest.raises(ValueError):
        pymorphy2.MorphAnalyzer(lang=unsupported)
45 | 
46 | 
def test_nonmatching_language():
    """An explicit ``lang`` is kept even when the dictionary path is for 'ru'."""
    analyzer = pymorphy2.MorphAnalyzer(path=lang_dict_path('ru'), lang='uk')
    first_parse = analyzer.parse('Ї')[0]
    assert 'Init' in first_parse.tag
    assert analyzer.lang == 'uk'
52 | 


--------------------------------------------------------------------------------
/tests/test_inflection.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | from pymorphy2.shapes import restore_capitalization
  4 | 
  5 | 
  6 | def with_test_data(data):
  7 |     return pytest.mark.parametrize(
  8 |         ("word", "grammemes", "result"),
  9 |         data
 10 |     )
 11 | 
 12 | 
 13 | def assert_first_inflected_variant(word, grammemes, result, morph):
 14 |     inflected_variants = [p.inflect(set(grammemes)) for p in morph.parse(word)]
 15 |     inflected_variants = [v for v in inflected_variants if v]
 16 |     # inflected_variants = morph.inflect(word, grammemes)
 17 |     assert len(inflected_variants)
 18 | 
 19 |     inflected = inflected_variants[0]
 20 |     assert restore_capitalization(inflected.word, word) == result
 21 | 
 22 | 
@with_test_data([
    # nouns: "суслик" and "бутявка"
    ("суслик", ["datv"], "суслику"),
    ("суслики", ["datv"], "сусликам"),
    ("сусликов", ["datv"], "сусликам"),
    ("суслика", ["datv"], "суслику"),
    ("суслик", ["datv", "plur"], "сусликам"),

    ("бутявка", ["datv"], "бутявке"),
    ("бутявок", ["datv"], "бутявкам"),

    # verbs, participles, gerunds
    ("гуляю", ["past"], "гулял"),
    ("гулял", ["pres"], "гуляю"),
    ("гулял", ["INFN"], "гулять"),
    ("гулял", ["GRND"], "гуляв"),
    ("гулял", ["PRTF"], "гулявший"),
    ("гуляла", ["PRTF"], "гулявшая"),
    ("гуляю", ["PRTF", "datv"], "гуляющему"),
    ("гулявший", ["VERB"], "гулял"),
    ("гулявший", ["VERB", "femn"], "гуляла"),
    ("иду", ["2per"], "идёшь"),
    ("иду", ["2per", "plur"], "идёте"),
    ("иду", ["3per"], "идёт"),
    ("иду", ["3per", "plur"], "идут"),
    ("иду", ["impr", "excl"], "иди"),

    # a bug from pymorphy
    ('киев', ['loct'], 'киеве'),

    # animacy
    ('слабый', ['accs', 'inan'], 'слабый'),
    ('слабый', ['accs', 'anim'], 'слабого'),

    # comparative forms of adjectives
    ('быстрый', ['COMP'], 'быстрее'),
    ('хорошая', ['COMP'], 'лучше'),

    # particles are not cut off
    ('скажи-ka', ['futr'], 'скажу-ка'),
])
def test_first_inflected_value(word, grammemes, result, morph):
    """The first inflected variant of ``word`` for ``grammemes`` is ``result``."""
    assert_first_inflected_variant(word, grammemes, result, morph)
 66 | 
 67 | 
def test_orel(morph):
    """'орел' inflected to the genitive case gives 'орла'."""
    assert_first_inflected_variant('орел', ['gent'], 'орла', morph)
 70 | 
 71 | 
@with_test_data([
    ('снег', ['gent'], 'снега'),
    ('снег', ['gen2'], 'снегу'),
    ('Боря', ['voct'], 'Борь'),
])
def test_second_cases(word, grammemes, result, morph):
    """Words inflect to the gen2 and voct cases when such forms exist."""
    assert_first_inflected_variant(word, grammemes, result, morph)
 79 | 
 80 | 
@with_test_data([
    ('валенок', ['gent', 'sing'], 'валенка'),
    ('валенок', ['gen2', 'sing'], 'валенка'),  # there is no gen2
    ('велосипед', ['loct'], 'велосипеде'),  # "о велосипеде"
    ('велосипед', ['loc2'], 'велосипеде'),  # there is no second locative here: "в велосипеде"
    ('хомяк', ['voct'], 'хомяк'),  # there is no voct, nomn should be used
    ('Геннадий', ['voct'], 'Геннадий'),  # there is no voct, nomn should be used
])
def test_case_substitution(word, grammemes, result, morph):
    """A missing gen2/loc2/voct form is replaced with the gent/loct/nomn one."""
    assert_first_inflected_variant(word, grammemes, result, morph)
 91 | 
 92 | 
@pytest.mark.xfail
@with_test_data([
    # additional cases, fixme
    ('лес', ['loct'], 'лесе'),   # "о лесе"
    ('лес', ['loc2'], 'лесу'),   # "в лесу"
    ('острова', ['datv'], 'островам'),
])
def test_best_guess(word, grammemes, result, morph):
    """Inflections marked as expected failures (see the fixme above)."""
    assert_first_inflected_variant(word, grammemes, result, morph)
102 | 
103 | 
@with_test_data([
    ('заснеженный', ['gent'], 'заснеженного'),  # not "заснежённого"
    ('ведро', ['gent'], 'ведра'),  # not "вёдра"
])
def test_not_informal(word, grammemes, result, morph):
    """The inflected form is the expected one, not the variant noted in the data comments."""
    assert_first_inflected_variant(word, grammemes, result, morph)
110 | 


--------------------------------------------------------------------------------
/tests/test_lexemes.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | 
# lexemes are separated by blank lines;
# lines that start with "#" are comments;
# a lexeme whose first word starts with "XFAIL" is marked as an expected failure.
  7 | 
  8 | def parse_lexemes(lexemes_txt):
  9 |     lexemes_txt = "".join(
 10 |         line for line in lexemes_txt.strip().splitlines(True)
 11 |              if not line.startswith("#")
 12 |     )
 13 |     return lexemes_txt.split("\n\n")
 14 | 
 15 | 
 16 | def get_lexeme_words(lexeme):
 17 |     lexeme_words = tuple(lexeme.split())
 18 |     if lexeme_words[0].startswith('XFAIL'):
 19 |         pytest.xfail()
 20 |     return lexeme_words
 21 | 
 22 | 
 23 | def parse_full_lexeme(lexeme):
 24 |     forms = lexeme.strip().splitlines()
 25 |     return [form.split(None, 1) for form in forms]
 26 | 
 27 | 
 28 | LEXEMES = parse_lexemes("""
 29 | # =========== noun
 30 | кот кота коту кота котом коте
 31 | коты котов котам котов котами котах
 32 | 
 33 | # =========== pronoun
 34 | он его него ему нему его него
 35 | им ним нём
 36 | 
 37 | # =========== pronoun with a particle
 38 | он-то его-то него-то ему-то нему-то его-то него-то
 39 | им-то ним-то нём-то
 40 | 
 41 | # =========== noun with a known prefix
 42 | лжекот лжекота лжекоту лжекота лжекотом лжекоте
 43 | лжекоты лжекотов лжекотам лжекотов лжекотами лжекотах
 44 | 
 45 | # =========== noun with two known prefixes (hyphenated)
 46 | экс-лжекот экс-лжекота экс-лжекоту экс-лжекота экс-лжекотом экс-лжекоте
 47 | экс-лжекоты экс-лжекотов экс-лжекотам экс-лжекотов экс-лжекотами экс-лжекотах
 48 | 
 49 | # =========== noun with two known prefixes
 50 | экслжекот экслжекота экслжекоту экслжекота экслжекотом экслжекоте экслжекоты
 51 | экслжекотов экслжекотам экслжекотов экслжекотами экслжекотах
 52 | 
 53 | # =========== noun witn a guessed prefix
 54 | буропёс буропса буропсу буропса буропсом буропсе
 55 | буропсы буропсов буропсам буропсов буропсами буропсах
 56 | 
 57 | # =========== both parts can be inflected the same way
 58 | кот-маг кота-мага коту-магу кота-мага котом-магом коте-маге
 59 | коты-маги котов-магов котам-магам котов-магов котами-магами котах-магах
 60 | 
 61 | команда-участница команды-участницы команде-участнице команду-участницу командой-участницей командою-участницею команде-участнице
 62 | команды-участницы команд-участниц командам-участницам команды-участниц командами-участницами командах-участницах
 63 | 
 64 | # =========== prediction using suffix
 65 | йотка йотки йотке йотку йоткой йоткою йотке
 66 | йотки йоток йоткам йотки йотками йотках
 67 | 
 68 | # =========== left part is fixed
 69 | кото-пёс кото-пса кото-псу кото-пса кото-псом кото-псе
 70 | кото-псы кото-псов кото-псам кото-псов кото-псами кото-псах
 71 | 
 72 | # =========== left part is fixed, right is with known prefix
 73 | кото-псевдопёс кото-псевдопса кото-псевдопсу кото-псевдопса кото-псевдопсом кото-псевдопсе
 74 | кото-псевдопсы кото-псевдопсов кото-псевдопсам кото-псевдопсов кото-псевдопсами кото-псевдопсах
 75 | 
 76 | # =========== numeral with gender
 77 | два двух двум два двух двумя двух две две два два
 78 | 
 79 | # =========== two adverbs
 80 | красиво-туманно
 81 | 
 82 | # =========== adverb ПО-..
 83 | по-театральному
 84 | 
 85 | по-западному
 86 | 
 87 | # =========== two numerals: one depends on gender, the other doesn't
 88 | XFAIL: see https://github.com/kmike/pymorphy2/issues/18
 89 | два-три двух-трёх двум-трем два-три двух-трёх двумя-тремя двух-трёх
 90 | две-три двух-трёх двум-трем две-три двух-трёх двумя-тремя двух-трёх
 91 | два-три двух-трёх двум-трём два-три двумя-тремя двух-трёх
 92 | 
 93 | # =========== two nouns that parses differently
 94 | человек-гора человека-горы человеку-горе человека-гору человеком-горой человеком-горою человеке-горе
 95 | люди-горы людей-гор людям-горам людей-горы людьми-горами людях-горах
 96 | 
 97 | XFAIL
 98 | гора-человек горы-человека горе-человеку гору-человека горой-человеком горе-человеке
 99 | горы-люди гор-людей гор-человек горам-людям горам-человекам горы-людей горами-людьми горами-человеками горах-людях горах-человеках
100 | 
101 | XFAIL: this is currently too complex
102 | человек-гора человека-горы человеку-горе человека-гору человеком-горой человеком-горою человеке-горе
103 | люди-горы людей-гор человек-гор людям-горам человекам-горам людей-гор людьми-горами человеками-горами людях-горах человеках-горах
104 | 
105 | # =========== two nouns, one of which has gen1/gen2 forms
106 | лес-колдун леса-колдуна лесу-колдуну лес-колдуна лесом-колдуном лесе-колдуне
107 | леса-колдуны лесов-колдунов лесам-колдунам леса-колдунов лесами-колдунами лесах-колдунах
108 | 
109 | """)
110 | 
111 | 
112 | LEXEMES_FULL = parse_lexemes("""
113 | # ============ noun, a sanity check
114 | кот        NOUN,anim,masc sing,nomn
115 | кота       NOUN,anim,masc sing,gent
116 | коту       NOUN,anim,masc sing,datv
117 | кота       NOUN,anim,masc sing,accs
118 | котом      NOUN,anim,masc sing,ablt
119 | коте       NOUN,anim,masc sing,loct
120 | коты       NOUN,anim,masc plur,nomn
121 | котов      NOUN,anim,masc plur,gent
122 | котам      NOUN,anim,masc plur,datv
123 | котов      NOUN,anim,masc plur,accs
124 | котами     NOUN,anim,masc plur,ablt
125 | котах      NOUN,anim,masc plur,loct
126 | 
127 | # =========== adverb
128 | театрально ADVB
129 | 
130 | по-театральному ADVB
131 | 
132 | # =========== pronoun with a particle
133 | он-то      NPRO,masc,3per,Anph sing,nomn
134 | его-то     NPRO,masc,3per,Anph sing,gent
135 | него-то    NPRO,masc,3per,Anph sing,gent,Af-p
136 | ему-то     NPRO,masc,3per,Anph sing,datv
137 | нему-то    NPRO,masc,3per,Anph sing,datv,Af-p
138 | его-то     NPRO,masc,3per,Anph sing,accs
139 | него-то    NPRO,masc,3per,Anph sing,accs,Af-p
140 | им-то      NPRO,masc,3per,Anph sing,ablt
141 | ним-то     NPRO,masc,3per,Anph sing,ablt,Af-p
142 | нём-то     NPRO,masc,3per,Anph sing,loct,Af-p
143 | 
144 | # ========== initials
145 | И  NOUN,anim,masc,Sgtm,Name,Fixd,Abbr,Init sing,nomn
146 | И  NOUN,anim,masc,Sgtm,Name,Fixd,Abbr,Init sing,gent
147 | И  NOUN,anim,masc,Sgtm,Name,Fixd,Abbr,Init sing,datv
148 | И  NOUN,anim,masc,Sgtm,Name,Fixd,Abbr,Init sing,accs
149 | И  NOUN,anim,masc,Sgtm,Name,Fixd,Abbr,Init sing,ablt
150 | И  NOUN,anim,masc,Sgtm,Name,Fixd,Abbr,Init sing,loct
151 | 
152 | И  NOUN,anim,femn,Sgtm,Name,Fixd,Abbr,Init sing,nomn
153 | И  NOUN,anim,femn,Sgtm,Name,Fixd,Abbr,Init sing,gent
154 | И  NOUN,anim,femn,Sgtm,Name,Fixd,Abbr,Init sing,datv
155 | И  NOUN,anim,femn,Sgtm,Name,Fixd,Abbr,Init sing,accs
156 | И  NOUN,anim,femn,Sgtm,Name,Fixd,Abbr,Init sing,ablt
157 | И  NOUN,anim,femn,Sgtm,Name,Fixd,Abbr,Init sing,loct
158 | 
159 | И  NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr,Init sing,nomn
160 | И  NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr,Init sing,gent
161 | И  NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr,Init sing,datv
162 | И  NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr,Init sing,accs
163 | И  NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr,Init sing,ablt
164 | И  NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr,Init sing,loct
165 | И  NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr,Init sing,nomn
166 | И  NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr,Init sing,gent
167 | И  NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr,Init sing,datv
168 | И  NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr,Init sing,accs
169 | И  NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr,Init sing,ablt
170 | И  NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr,Init sing,loct
171 | 
172 | # ============ UNKN
173 | ьё UNKN
174 | """)
175 | 
176 | 
177 | # ============ Tests:
178 | 
@pytest.mark.parametrize("lexeme", LEXEMES)
def test_has_proper_lexemes(lexeme, morph):
    """
    The expected lexeme must be among the lexemes produced for its first word.
    """
    lexeme_words = get_lexeme_words(lexeme)
    first_word = lexeme_words[0]

    variants = _lexemes_for_word(first_word, morph)
    assert lexeme_words in variants, "{} not in \n{}".format(
        lexeme, "\n".join([" ".join(words) for words in variants])
    )
188 | 
189 | 
@pytest.mark.parametrize("lexeme", LEXEMES)
def test_lexemes_sanity(lexeme, morph):
    """
    The lexeme of a lexeme's first form must equal the lexeme itself,
    for every parse of every word.
    """
    for word in get_lexeme_words(lexeme):
        for parse in morph.parse(word):
            first_form = parse.lexeme[0]
            assert first_form.lexeme == parse.lexeme
200 | 
201 | 
@pytest.mark.parametrize("lexeme", LEXEMES)
def test_normalized_is_first(lexeme, morph):
    """
    The first parse of the lexeme's first word must appear, as a
    (word, POS) pair, among the normalized parses of every word.
    """
    lexeme_words = get_lexeme_words(lexeme)

    head = morph.parse(lexeme_words[0])[0]
    expected = (head.word, head.tag.POS)

    for word in lexeme_words:
        normalized = []
        for parse in morph.parse(word):
            normal = parse.normalized
            normalized.append((normal.word, normal.tag.POS))
        assert expected in normalized
216 | 
217 | 
@pytest.mark.parametrize("lexeme", LEXEMES_FULL)
def test_full_lexemes(lexeme, morph):
    """
    Every word of a fully specified lexeme must have a parse whose lexeme
    matches the expected (lower-cased word, tag) list.
    """
    forms = parse_full_lexeme(lexeme)
    expected = [(form_word.lower(), tag) for form_word, tag in forms]
    for word, _tag in forms:
        assert_has_full_lexeme(word, expected, morph)
227 | 
228 | 
def assert_has_full_lexeme(word, forms, morph):
    """
    Assert that some parse of ``word`` has exactly the lexeme ``forms``.

    ``forms`` is a list of ``(word, tag string)`` tuples; a parse matches
    when its lexeme, converted to the same shape, equals ``forms``.
    """
    found_lexemes = (
        [(form.word, str(form.tag)) for form in parse.lexeme]
        for parse in morph.parse(word)
    )
    assert any(found == forms for found in found_lexemes), \
        f"Word {word} doesn't have lexeme {forms}"
232 | 
233 | 
234 | def _lexemes_for_word(word, morph):
235 |     res = []
236 |     for p in morph.parse(word):
237 |         res.append(tuple(f.word for f in p.lexeme))
238 |     return res
239 | 


--------------------------------------------------------------------------------
/tests/test_numeral_agreement.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | 
  4 | @pytest.mark.parametrize(('word', 'result'), [
  5 |     # прилагательные
  6 |     ("бесплатная", ["бесплатная", "бесплатные", "бесплатных"]),
  7 |     ("бесплатный", ["бесплатный", "бесплатных", "бесплатных"]),
  8 | 
  9 |     # числительные
 10 |     ("первый", ["первый", "первых", "первых"]),
 11 |     ("первая", ["первая", "первые", "первых"]),
 12 | 
 13 |     # существительные
 14 |     ("книга", ["книга", "книги", "книг"]),
 15 |     ("болт", ["болт", "болта", "болтов"]),
 16 | 
 17 |     # причастия
 18 |     ("летящий", ["летящий", "летящих", "летящих"]),
 19 |     ("летящая", ["летящая", "летящие", "летящих"]),
 20 | 
 21 |     # остальное части речи мы никак не согласовываем с числами
 22 |     ("играет", ["играет", "играет", "играет"])
 23 | ])
 24 | def test_plural_forms(word, result, morph):
 25 |     parsed = morph.parse(word)
 26 |     assert len(parsed)
 27 |     for plural, num in zip(result, [1, 2, 5]):
 28 |         assert parsed[0].make_agree_with_number(num).word == plural
 29 | 
 30 | 
 31 | @pytest.mark.parametrize(('word', 'form', 'result'), [
 32 |     ("книга", 'gent', ["книги", "книг", "книг"]),
 33 |     ("книга", 'datv', ["книге", "книгам", "книгам"]),
 34 |     ("книга", 'accs', ["книгу", "книги", "книг"]),
 35 |     ("книга", 'ablt', ["книгой", "книгами", "книгами"]),
 36 |     ("книга", 'loct', ["книге", "книгах", "книгах"]),
 37 | 
 38 |     ("час", "accs", ["час", "часа", "часов"]), # see https://github.com/kmike/pymorphy2/issues/32
 39 |     ("день", "accs", ["день", "дня", "дней"]),
 40 |     ("минута", "accs", ["минуту", "минуты", "минут"]),
 41 | 
 42 |     ("бесплатный", "gent", ["бесплатного", "бесплатных", "бесплатных"]),
 43 |     ("бесплатный", "datv", ["бесплатному", "бесплатным", "бесплатным"]),
 44 |     ("бесплатный", "accs,anim", ["бесплатного", "бесплатных", "бесплатных"]), # animacy make sense in accs
 45 |     ("бесплатный", "accs,inan", ["бесплатный", "бесплатных", "бесплатных"]),
 46 |     ("бесплатный", "ablt", ["бесплатным", "бесплатными", "бесплатными"]),
 47 |     ("бесплатный", "loct", ["бесплатном", "бесплатных", "бесплатных"]),
 48 | 
 49 |     ("бесплатная", "gent", ["бесплатной", "бесплатных", "бесплатных"]),
 50 |     ("бесплатная", "datv", ["бесплатной", "бесплатным", "бесплатным"]),
 51 |     ("бесплатная", "accs,anim", ["бесплатную", "бесплатных", "бесплатных"]),
 52 |     ("бесплатная", "accs,inan", ["бесплатную", "бесплатные", "бесплатных"]),
 53 |     ("бесплатная", "ablt", ["бесплатной", "бесплатными", "бесплатными"]),
 54 |     ("бесплатная", "loct", ["бесплатной", "бесплатных", "бесплатных"]),
 55 | 
 56 |     ("летящий", "gent", ["летящего", "летящих", "летящих"]),
 57 |     ("летящий", "datv", ["летящему", "летящим", "летящим"]),
 58 |     ("летящий", "accs,anim", ["летящего", "летящих", "летящих"]),
 59 |     ("летящий", "accs,inan", ["летящий", "летящих", "летящих"]),
 60 |     ("летящий", "ablt", ["летящим", "летящими", "летящими"]),
 61 |     ("летящий", "loct", ["летящем", "летящих", "летящих"]),
 62 | 
 63 |     ("летящая", "gent", ["летящей", "летящих", "летящих"]),
 64 |     ("летящая", "datv", ["летящей", "летящим", "летящим"]),
 65 |     ("летящая", "accs,anim", ["летящую", "летящих", "летящих"]),
 66 |     ("летящая", "accs,inan", ["летящую", "летящие", "летящих"]),
 67 |     ("летящая", "ablt", ["летящей", "летящими", "летящими"]),
 68 |     ("летящая", "loct", ["летящей", "летящих", "летящих"]),
 69 | 
 70 |     ("белка", "accs", ["белку", "белок", "белок"]),
 71 |     ("бобер", "accs", ["бобра", "бобров", "бобров"]),
 72 |     ("камень", "accs", ["камень", "камня", "камней"]),
 73 |     ("лопата", "accs", ["лопату", "лопаты", "лопат"])
 74 | ])
 75 | def test_plural_inflected(word, form, result, morph):
 76 |     parsed = [p for p in morph.parse(word) if p.tag.case == 'nomn']
 77 |     assert len(parsed)
 78 |     gram_tag = morph.TagClass(form)
 79 |     inflected_word = parsed[0].inflect({gram_tag.case})
 80 |     if gram_tag.animacy and inflected_word.tag.animacy:
 81 |         # morph.parse('летящая')[0].inflect({'accs','inan'}).word == 'летящий'
 82 |         inflected_word = inflected_word.inflect({gram_tag.animacy})
 83 |     assert inflected_word.word == result[0]
 84 |     for plural, num in zip(result, [1, 2, 5]):
 85 |         assert inflected_word.make_agree_with_number(num, gram_tag.animacy).word == plural
 86 | 
 87 | 
 88 | @pytest.mark.parametrize(('word', 'num', 'result'), [
 89 |     ("лопата", 0, "лопат"),
 90 |     ("лопата", 1, "лопата"),
 91 |     ("лопата", 2, "лопаты"),
 92 |     ("лопата", 4, "лопаты"),
 93 |     ("лопата", 5, "лопат"),
 94 |     ("лопата", 6, "лопат"),
 95 |     ("лопата", 11, "лопат"),
 96 |     ("лопата", 12, "лопат"),
 97 |     ("лопата", 15, "лопат"),
 98 |     ("лопата", 21, "лопата"),
 99 |     ("лопата", 24, "лопаты"),
100 |     ("лопата", 25, "лопат"),
101 |     ("лопата", 101, "лопата"),
102 |     ("лопата", 103, "лопаты"),
103 |     ("лопата", 105, "лопат"),
104 |     ("лопата", 111, "лопат"),
105 |     ("лопата", 112, "лопат"),
106 |     ("лопата", 151, "лопата"),
107 |     ("лопата", 122, "лопаты"),
108 |     ("лопата", 5624, "лопаты"),
109 |     ("лопата", 5431, "лопата"),
110 |     ("лопата", 7613, "лопат"),
111 |     ("лопата", 2111, "лопат"),
112 | ])
113 | def test_plural_num(word, num, result, morph):
114 |     parsed = morph.parse(word)
115 |     assert len(parsed)
116 |     assert parsed[0].make_agree_with_number(num).word == result
117 | 


--------------------------------------------------------------------------------
/tests/test_opencorpora_dict.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | import pytest
  4 | 
  5 | import pymorphy2
  6 | from pymorphy2 import lang
  7 | from pymorphy2.dawg import assert_can_create
  8 | from pymorphy2.opencorpora_dict.compile import (
  9 |     _to_paradigm,
 10 |     convert_to_pymorphy2
 11 | )
 12 | from pymorphy2.opencorpora_dict.parse import parse_opencorpora_xml
 13 | 
 14 | 
 15 | class TestToyDictionary:
 16 | 
 17 |     XML_PATH = os.path.join(
 18 |         os.path.dirname(__file__),
 19 |         '..',
 20 |         'dev_data',
 21 |         'toy_dict.xml'
 22 |     )
 23 | 
 24 |     def test_parse_xml(self):
 25 |         dct = parse_opencorpora_xml(self.XML_PATH)
 26 |         assert dct.version == '0.92'
 27 |         assert dct.revision == '389440'
 28 | 
 29 |         assert dct.links[0] == ('5', '6', '1')
 30 |         assert len(dct.links) == 13
 31 | 
 32 |         assert dct.grammemes[1] == ('NOUN', 'POST', 'СУЩ', 'имя существительное')
 33 |         assert len(dct.grammemes) == 114
 34 | 
 35 |         assert dct.lexemes['14'] == [('ёжиться', 'INFN,impf,intr')]
 36 | 
 37 |         # bad values should be dropped
 38 |         assert dct.lexemes['111111'] == []
 39 |         assert dct.lexemes['222222'] == []
 40 | 
 41 |     def test_convert_to_pymorphy2(self, tmpdir):
 42 | 
 43 |         # import logging
 44 |         # from pymorphy2.opencorpora_dict.compile import logger
 45 |         # logger.setLevel(logging.DEBUG)
 46 |         # logger.addHandler(logging.StreamHandler())
 47 | 
 48 |         try:
 49 |             assert_can_create()
 50 |         except NotImplementedError as e:
 51 |             raise pytest.skip(str(e))
 52 | 
 53 |         # create a dictionary
 54 |         out_path = str(tmpdir.join('dicts'))
 55 |         options = {
 56 |             'min_paradigm_popularity': 0,
 57 |             'min_ending_freq': 0,
 58 |             'paradigm_prefixes': lang.ru.PARADIGM_PREFIXES,
 59 |         }
 60 |         convert_to_pymorphy2(self.XML_PATH, out_path,
 61 |                              source_name='toy', language_code='ru',
 62 |                              overwrite=True, compile_options=options)
 63 | 
 64 |         # use it
 65 |         morph = pymorphy2.MorphAnalyzer(out_path)
 66 |         assert morph.tag('ёжиться') == [morph.TagClass('INFN,impf,intr')]
 67 | 
 68 |         # tag simplification should work
 69 |         assert morph.tag("ёж")[0] == morph.tag("ванька-встанька")[0]
 70 | 
 71 |         # Init tags should be handled correctly
 72 |         assert 'Init' in morph.tag("Ц")[0]
 73 |         assert 'Init' not in morph.tag("ц")[0]
 74 | 
 75 |         # normalization tests
 76 |         assert morph.normal_forms('абсурднее') == ['абсурдный']
 77 |         assert morph.normal_forms('а') == ['а']
 78 | 
 79 | 
 80 | class TestToParadigm:
 81 | 
 82 |     def test_simple(self):
 83 |         lexeme = [
 84 |             ["ярче", "COMP,Qual"],
 85 |             ["ярчей", "COMP,Qual V-ej"],
 86 |         ]
 87 |         stem, forms = _to_paradigm(lexeme, lang.ru.PARADIGM_PREFIXES)
 88 |         assert stem == "ярче"
 89 |         assert forms == (
 90 |             ("", "COMP,Qual", ""),
 91 |             ("й", "COMP,Qual V-ej", ""),
 92 |         )
 93 | 
 94 |     def test_single_prefix(self):
 95 |         lexeme = [
 96 |             ["ярче", "COMP,Qual"],
 97 |             ["поярче", "COMP,Qual Cmp2"],
 98 |         ]
 99 |         stem, forms = _to_paradigm(lexeme, lang.ru.PARADIGM_PREFIXES)
100 |         assert stem == "ярче"
101 |         assert forms == (
102 |             ("", "COMP,Qual", ""),
103 |             ("", "COMP,Qual Cmp2", "по"),
104 |         )
105 | 
106 |     def test_multiple_prefixes(self):
107 |         lexeme = [
108 |             ["ярче", "COMP,Qual"],
109 |             ["ярчей", "COMP,Qual V-ej"],
110 |             ["поярче", "COMP,Qual Cmp2"],
111 |             ["поярчей", "COMP,Qual Cmp2,V-ej"],
112 |             ["наиярчайший", "ADJF,Supr,Qual masc,sing,nomn"],
113 |         ]
114 |         stem, forms = _to_paradigm(lexeme, lang.ru.PARADIGM_PREFIXES)
115 |         assert stem == 'ярч'
116 | 
117 |     def test_multiple_prefixes_2(self):
118 |         lexeme = [
119 |             ["подробнейший", 1],
120 |             ["наиподробнейший", 2],
121 |             ["поподробнее", 3]
122 |         ]
123 |         stem, forms = _to_paradigm(lexeme, lang.ru.PARADIGM_PREFIXES)
124 |         assert stem == 'подробне'
125 |         assert forms == (
126 |             ("йший", 1, ""),
127 |             ("йший", 2, "наи"),
128 |             ("е", 3, "по"),
129 |         )
130 | 
131 |     def test_platina(self):
132 |         lexeme = [
133 |             ["платиновее", 1],
134 |             ["платиновей", 2],
135 |             ["поплатиновее", 3],
136 |             ["поплатиновей", 4],
137 |         ]
138 |         stem, forms = _to_paradigm(lexeme, lang.ru.PARADIGM_PREFIXES)
139 |         assert forms == (
140 |             ("е", 1, ""),
141 |             ("й", 2, ""),
142 |             ("е", 3, "по"),
143 |             ("й", 4, "по"),
144 |         )
145 |         assert stem == 'платинове'
146 | 
147 |     def test_no_prefix(self):
148 |         lexeme = [["английский", 1], ["английского", 2]]
149 |         stem, forms = _to_paradigm(lexeme, lang.ru.PARADIGM_PREFIXES)
150 |         assert stem == 'английск'
151 |         assert forms == (
152 |             ("ий", 1, ""),
153 |             ("ого", 2, ""),
154 |         )
155 | 
156 |     def test_single(self):
157 |         lexeme = [["английски", 1]]
158 |         stem, forms = _to_paradigm(lexeme, lang.ru.PARADIGM_PREFIXES)
159 |         assert stem == 'английски'
160 |         assert forms == (("", 1, ""),)
161 | 
162 | 
163 | 


--------------------------------------------------------------------------------
/tests/test_parsing.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | from utils import assert_parse_is_correct
  4 | 
  5 | 
  6 | def _to_test_data(text):
  7 |     """
  8 |     Lines should be of this format: <word> <normal_form> <tag>.
  9 |     Lines that starts with "#" and blank lines are skipped.
 10 |     Lines that starts with "XFAIL" excludes the next line from testing.
 11 |     """
 12 | 
 13 |     def generator():
 14 |         xfail = False
 15 |         for line in text.splitlines():
 16 |             if not line.strip() or line.startswith("#"):
 17 |                 continue
 18 |             elif line.startswith("XFAIL"):
 19 |                 xfail = True
 20 |                 continue
 21 | 
 22 |             parts = line.split(None, 2)
 23 |             if xfail:
 24 |                 # skip
 25 |                 xfail = False
 26 |             else:
 27 |                 yield parts
 28 | 
 29 |     return list(generator())
 30 | 
 31 | 
 32 | PARSES = _to_test_data("""
 33 | # ========= nouns
 34 | кошка       кошка       NOUN,inan,femn sing,nomn
 35 | 
 36 | # ========= adjectives
 37 | хорошему            хороший     ADJF,Qual masc,sing,datv
 38 | лучший              хороший     ADJF,Supr,Qual masc,sing,nomn
 39 | XFAIL
 40 | наиневероятнейший   вероятный   ADJF,Supr,Qual masc,sing,nomn
 41 | наистарейший        старый      ADJF,Supr,Qual masc,sing,nomn
 42 | 
 43 | # ========= е/ё
 44 | котенок     котёнок     NOUN,anim,masc sing,nomn
 45 | котёнок     котёнок     NOUN,anim,masc sing,nomn
 46 | озера       озеро       NOUN,inan,neut sing,gent
 47 | озера       озеро       NOUN,inan,neut plur,nomn
 48 | 
 49 | # ========= particle after a hyphen
 50 | ей-то               она-то              NPRO,femn,3per,Anph sing,datv
 51 | скажи-ка            сказать-ка          VERB,perf,tran sing,impr,excl
 52 | измохратился-таки   измохратиться-таки  VERB,perf,intr masc,sing,past,indc
 53 | 
 54 | # ========= compound words with hyphen and immutable left
 55 | интернет-магазина       интернет-магазин    NOUN,inan,masc sing,gent
 56 | pdf-документов          pdf-документ        NOUN,inan,masc plur,gent
 57 | аммиачно-селитрового    аммиачно-селитровый ADJF,Qual masc,sing,gent
 58 | быстро-быстро           быстро-быстро       ADVB
 59 | 
 60 | # ========= compound words with hyphen and mutable left
 61 | команд-участниц     команда-участница   NOUN,inan,femn plur,gent
 62 | бегает-прыгает      бегать-прыгать      VERB,impf,intr sing,3per,pres,indc
 63 | дул-надувался       дуть-надуваться     VERB,impf,tran masc,sing,past,indc
 64 | 
 65 | # ПО- (there were bugs for such words in pymorphy 0.5.6)
 66 | почтово-банковский  почтово-банковский  ADJF masc,sing,nomn
 67 | по-прежнему         по-прежнему         ADVB
 68 | 
 69 | # other old bugs
 70 | поездов-экспрессов          поезд-экспресс          NOUN,inan,masc plur,gent
 71 | подростками-практикантами   подросток-практикант    NOUN,anim,masc plur,ablt
 72 | подводников-североморцев    подводник-североморец   NOUN,anim,masc plur,gent
 73 | 
 74 | # issue with normal form caching
 75 | залом   зал     NOUN,inan,masc sing,ablt
 76 | 
 77 | # cities
 78 | санкт-петербурга    санкт-петербург     NOUN,inan,masc,Geox sing,gent
 79 | ростове-на-дону     ростов-на-дону      NOUN,inan,masc,Sgtm,Geox sing,loct
 80 | 
 81 | # ========= non-dictionary adverbs
 82 | по-западному        по-западному        ADVB
 83 | по-театральному     по-театральному     ADVB
 84 | по-воробьиному      по-воробьиному      ADVB
 85 | 
 86 | # ========= hyphenated words with non-cyrillic parts
 87 | # this used to raise an exception
 88 | 
 89 | Ретро-FM    ретро-fm    LATN
 90 | 
 91 | # ====================== non-words
 92 | .       .       PNCT
 93 | ,       ,       PNCT
 94 | ...     ...     PNCT
 95 | ?!      ?!      PNCT
 96 | -       -       PNCT
 97 | …       …       PNCT
 98 | 
 99 | 123         123         NUMB,intg
100 | 0           0           NUMB,intg
101 | 123.1       123.1       NUMB,real
102 | 123,1       123,1       NUMB,real
103 | I           i           ROMN
104 | MCMLXXXIX   mcmlxxxix   ROMN
105 | XVIII       xviii       ROMN
106 | 
107 | # ========= LATN
108 | Foo     foo     LATN
109 | I       i       LATN
110 | 
111 | # ========= UNKN
112 | ьё      ьё      UNKN
113 | 
114 | # ============== common lowercased abbreviations
115 | 
116 | руб     руб     NOUN,inan,masc,Fixd,Abbr plur,gent
117 | млн     млн     NOUN,inan,masc,Fixd,Abbr plur,gent
118 | тыс     тыс     NOUN,inan,femn,Fixd,Abbr plur,gent
119 | ст      ст      NOUN,inan,femn,Fixd,Abbr sing,accs
120 | """)
121 | 
# The same expectations as in PARSES, but with the input word upper-cased
# or title-cased; the expected normal form and tag are left as they are.
PARSES_UPPER = [
    (word.upper(), normal_form, tag_text)
    for word, normal_form, tag_text in PARSES
]
PARSES_TITLE = [
    (word.title(), normal_form, tag_text)
    for word, normal_form, tag_text in PARSES
]
124 | 
125 | SYSTEMATIC_ERRORS = _to_test_data("""
126 | # ============== foreign first names
127 | Уилл    уилл        NOUN,anim,masc,Name sing,nomn
128 | Джеф    джеф        NOUN,anim,masc,Name sing,nomn
129 | 
130 | # ============== last names
131 | Сердюков    сердюков    NOUN,anim,masc,Surn sing,nomn
132 | Третьяк     третьяк     NOUN,anim,masc,Surn sing,nomn
133 | 
134 | # ============== common lowercased abbreviations
135 | # should normal forms be expanded?
136 | 
137 | г       г       NOUN,inan,masc,Fixd,Abbr sing,loc2
138 | п       п       NOUN,inan,masc,Fixd,Abbr sing,accs
139 | 
140 | # ============== uppercased abbreviations
141 | # it seems is not possible to properly guess gender and number
142 | 
143 | ГКРФ        гкрф    NOUN,inan,masc,Sgtm,Fixd,Abbr sing,nomn
144 | ПДД         пдд     NOUN,inan,neut,Pltm,Fixd,Abbr plur,nomn
145 | ФП          фп      NOUN,inan,neut,Sgtm,Fixd,Abbr sing,nomn
146 | ООП         ооп     NOUN,inan,neut,Sgtm,Fixd,Abbr sing,nomn
147 | ПИН         пин     NOUN,inan,masc,Sgtm,Fixd,Abbr sing,nomn
148 | УБРиР       убрир   NOUN,inan,masc,Abbr sing,nomn
149 | УБРиРе      убрир   NOUN,inan,masc,Abbr sing,ablt
150 | УБРиР-е     убрир   NOUN,inan,masc,Abbr sing,ablt
151 | 
152 | # =============== numerals
153 | 3-го        3-й     ADJF,Anum masc,sing,gent
154 | 41-й        41-й    ADJF,Anum masc,sing,nomn
155 | 41-м        41-м    ADJF,Anum masc,sing,loct
156 | 2001-й      2001-й  ADJF,Anum masc,sing,nomn
157 | 8-му        8-й     ADJF,Anum masc,sing,datv
158 | 3-х         3       NUMR,gent
159 | 
160 | уловка-22   уловка-22   NOUN,inan,femn sing,nomn
161 | 
162 | """)
163 | 
164 | 
def run_for_all(parses):
    """
    Return a ``pytest.mark.parametrize`` mark that passes each row of
    ``parses`` to a test as ``word``, ``normal_form`` and ``tag``.
    """
    argnames = ("word", "normal_form", "tag")
    return pytest.mark.parametrize(argnames, parses)
167 | 
168 | 
169 | # ====== Tests:
def _test_has_parse(parses):
    """
    Build a parametrized test which checks, for every row of ``parses``,
    that the expected parse is among the results of ``morph.parse``.
    """
    def test_case(word, normal_form, tag, morph):
        results = morph.parse(word)
        assert_parse_is_correct(results, word, normal_form, tag)

    # same as decorating test_case with @run_for_all(parses)
    return run_for_all(parses)(test_case)


test_has_parse = _test_has_parse(PARSES)
test_has_parse_title = _test_has_parse(PARSES_TITLE)
test_has_parse_upper = _test_has_parse(PARSES_UPPER)

# these rows are expected to fail, so the whole test is marked as xfail
test_has_parse_systematic_errors = pytest.mark.xfail(
    _test_has_parse(SYSTEMATIC_ERRORS)
)
183 | 
184 | 
def _test_tag(parses):
    """
    Build a parametrized test which compares ``morph.tag`` with the tags
    of ``morph.parse`` for every word of ``parses``.
    """
    def test_tag_produces_the_same_as_parse(word, normal_form, tag, morph):
        """
        Check if morph.tag produces the same results as morph.parse.
        """
        tags_from_tag = set(morph.tag(word))
        tags_from_parse = {p.tag for p in morph.parse(word)}
        assert tags_from_tag == tags_from_parse

    # same as decorating the inner test with @run_for_all(parses)
    return run_for_all(parses)(test_tag_produces_the_same_as_parse)


test_tag = _test_tag(PARSES)
test_tag_title = _test_tag(PARSES_TITLE)
test_tag_upper = _test_tag(PARSES_UPPER)
198 | 


--------------------------------------------------------------------------------
/tests/test_prefix_matching.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from pymorphy2 import lang
 4 | from pymorphy2.dawg import PythonPrefixMatcher, PrefixMatcher
 5 | 
 6 | MATCHERS = [PythonPrefixMatcher, PrefixMatcher]
 7 | HAS_PREFIXES = [
 8 |     ["псевдокот", True],
 9 |     ["кот", False],
10 | ]
11 | PREFIXES = [
12 |     ['псевдокот', ['псевдо']],
13 |     ['супер-кот', ['супер', 'супер-']],
14 |     ['кот', []],
15 | ]
16 | 
17 | 
@pytest.mark.parametrize('matcher_cls', MATCHERS)
@pytest.mark.parametrize(['word', 'is_prefixed'], HAS_PREFIXES)
def test_prefix_matcher_is_prefixed(matcher_cls, word, is_prefixed):
    """Every matcher class should give the expected ``is_prefixed`` answer."""
    prefix_matcher = matcher_cls(lang.ru.KNOWN_PREFIXES)
    answer = prefix_matcher.is_prefixed(word)
    assert answer == is_prefixed
23 | 
24 | 
@pytest.mark.parametrize('matcher_cls', MATCHERS)
@pytest.mark.parametrize(['word', 'prefixes'], PREFIXES)
def test_prefix_matcher_prefixes(matcher_cls, word, prefixes):
    """Every matcher class should find the expected prefixes (order is ignored)."""
    prefix_matcher = matcher_cls(lang.ru.KNOWN_PREFIXES)
    found = set(prefix_matcher.prefixes(word))
    assert found == set(prefixes)
30 | 


--------------------------------------------------------------------------------
/tests/test_result_wrapper.py:
--------------------------------------------------------------------------------
 1 | def test_indexing(morph):
 2 |     assert len(morph.parse('стреляли')) == 1
 3 |     p = morph.parse('стреляли')[0]
 4 | 
 5 |     assert p[0] == 'стреляли' # word
 6 |     assert p[1].POS == 'VERB' # tag
 7 |     assert p[2] == 'стрелять'
 8 | 
 9 |     assert p[0] == p.word
10 |     assert p[1] == p.tag
11 |     assert p[2] == p.normal_form
12 | 
13 | 
14 | def test_inflect_valid(morph):
15 |     p = morph.parse('стреляли')[0]
16 |     assert p.inflect({'femn'}).word == 'стреляла'
17 | 
18 | 
19 | def test_inflect_invalid(morph):
20 |     p = morph.parse('стреляли')[0]
21 |     assert p.inflect({'NOUN'}) is None
22 | 
23 | 
24 | def test_is_known(morph):
25 |     assert morph.parse('стреляли')[0].is_known
26 |     assert not morph.parse('сптриояли')[0].is_known
27 | 
28 | 
29 | def test_normalized(morph):
30 |     assert morph.parse('стреляли')[0].normalized.word == 'стрелять'
31 | 


--------------------------------------------------------------------------------
/tests/test_tagset.py:
--------------------------------------------------------------------------------
  1 | import pickle
  2 | 
  3 | import pytest
  4 | 
  5 | import pymorphy2
  6 | from pymorphy2.tagset import OpencorporaTag
  7 | 
  8 | 
  9 | def test_hashing(Tag):
 10 |     tag1 = Tag('NOUN')
 11 |     tag2 = Tag('NOUN')
 12 |     tag3 = Tag('VERB')
 13 | 
 14 |     assert tag1 == tag2
 15 |     assert tag1 != tag3
 16 |     assert {tag1} == {tag2}
 17 |     assert {tag3} != {tag1}
 18 | 
 19 | 
 20 | @pytest.mark.parametrize(("tag", "cls"), [
 21 |         ['NOUN', 'NOUN'],
 22 |         ['NOUN,sing', 'NOUN'],
 23 |         ['NOUN sing', 'NOUN'],
 24 |     ])
 25 | def test_cls(tag, cls, Tag):
 26 |     assert Tag(tag).POS == cls
 27 | 
 28 | 
 29 | def test_repr(Tag):
 30 |     assert repr(Tag('NOUN anim,plur')) == "OpencorporaTag('NOUN anim,plur')"
 31 | 
 32 | 
 33 | # Cloning of the Tag class is disabled to allow pickling
 34 | @pytest.mark.xfail
 35 | def test_extra_grammemes(Tag):
 36 |     m = pymorphy2.MorphAnalyzer()
 37 | 
 38 |     assert m.TagClass.KNOWN_GRAMMEMES is not Tag.KNOWN_GRAMMEMES
 39 |     assert m.TagClass.KNOWN_GRAMMEMES is not OpencorporaTag.KNOWN_GRAMMEMES
 40 | 
 41 |     assert 'new_grammeme' not in Tag.KNOWN_GRAMMEMES
 42 |     assert 'new_grammeme' not in m.TagClass.KNOWN_GRAMMEMES
 43 | 
 44 |     m.TagClass.KNOWN_GRAMMEMES.add('new_grammeme')
 45 | 
 46 |     new_tag = m.TagClass('NOUN,sing,new_grammeme')
 47 | 
 48 |     assert 'new_grammeme' in new_tag
 49 |     assert 'new_grammeme' in m.TagClass.KNOWN_GRAMMEMES
 50 |     assert 'new_grammeme' not in OpencorporaTag.KNOWN_GRAMMEMES
 51 |     assert 'new_grammeme' not in Tag.KNOWN_GRAMMEMES
 52 | 
 53 | 
 54 | def test_len(Tag):
 55 |     assert len(Tag('NOUN')) == 1
 56 |     assert len(Tag('NOUN plur')) == 2
 57 |     assert len(Tag('NOUN plur,masc')) == 3
 58 |     assert len(Tag('NOUN,plur,masc')) == 3
 59 | 
 60 | 
 61 | def test_pickle(Tag):
 62 |     tag = Tag('NOUN')
 63 |     data = pickle.dumps(tag, pickle.HIGHEST_PROTOCOL)
 64 |     tag_unpickled = pickle.loads(data)
 65 |     assert tag == tag_unpickled
 66 | 
 67 | 
 68 | def test_pickle_custom():
 69 |     m = pymorphy2.MorphAnalyzer()
 70 |     m.TagClass.KNOWN_GRAMMEMES.add('new_grammeme')
 71 |     tag = m.TagClass('new_grammeme')
 72 |     data = pickle.dumps(tag, pickle.HIGHEST_PROTOCOL)
 73 |     tag_unpickled = pickle.loads(data)
 74 |     assert tag == tag_unpickled
 75 | 
 76 | 
 77 | class TestUpdated:
 78 | 
 79 |     def test_number(self, Tag):
 80 |         tag = Tag('NOUN,sing,masc')
 81 |         grammemes = tag.updated_grammemes(required={'plur'})
 82 |         assert grammemes == {'NOUN', 'plur'}
 83 | 
 84 |     def test_order(self, Tag):
 85 |         tag = Tag('VERB,impf,tran sing,3per,pres,indc')
 86 |         grammemes = tag.updated_grammemes(required={'1per'})
 87 |         assert grammemes == set('VERB,sing,impf,tran,1per,pres,indc'.split(','))
 88 | 
 89 | 
 90 | class TestAttributes:
 91 | 
 92 |     def test_attributes(self, Tag):
 93 |         tag = Tag('VERB,impf,tran sing,3per,pres,indc')
 94 |         assert tag.POS == 'VERB'
 95 |         assert tag.gender is None
 96 |         assert tag.animacy is None
 97 |         assert tag.number == 'sing'
 98 |         assert tag.case is None
 99 |         assert tag.tense == 'pres'
100 |         assert tag.aspect == 'impf'
101 |         assert tag.mood == 'indc'
102 |         assert tag.person == '3per'
103 |         assert tag.transitivity == 'tran'
104 |         assert tag.voice is None # ?
105 |         assert tag.involvement is None
106 | 
107 |     def test_attributes2(self, Tag):
108 |         tag = Tag('NOUN,inan,masc plur,accs')
109 |         assert tag.POS == 'NOUN'
110 |         assert tag.gender == 'masc'
111 |         assert tag.animacy == 'inan'
112 |         assert tag.number == 'plur'
113 |         assert tag.case == 'accs'
114 |         assert tag.tense is None
115 |         assert tag.aspect is None
116 |         assert tag.mood is None
117 |         assert tag.person is None
118 |         assert tag.transitivity is None
119 |         assert tag.voice is None
120 |         assert tag.involvement is None
121 | 
122 |     def test_attributes3(self, Tag):
123 |         tag = Tag('PRTF,impf,tran,pres,pssv inan,masc,sing,accs')
124 |         assert tag.voice == 'pssv'
125 | 
126 |     def test_attributes4(self, Tag):
127 |         tag = Tag('VERB,perf,tran plur,impr,excl')
128 |         assert tag.involvement == 'excl'
129 | 
130 |     def test_attribute_exceptions(self, Tag):
131 |         tag = Tag('NOUN,inan,masc plur,accs')
132 | 
133 |         with pytest.raises(ValueError):
134 |             tag.POS == 'hello'
135 | 
136 |         with pytest.raises(ValueError):
137 |             tag.POS == 'noun'
138 | 
139 |     def test_attributes_as_set_items(self, Tag):
140 |         tag = Tag('NOUN,inan,masc plur,accs')
141 | 
142 |         # this doesn't raise an exception
143 |         assert tag.gender in {'masc', 'sing'}
144 | 
145 | 
146 | class TestContains:
147 | 
148 |     def test_contains_correct(self, Tag):
149 |         tag_text = 'VERB,perf,tran plur,impr,excl'
150 |         tag = Tag(tag_text)
151 |         for grammeme in tag_text.replace(' ', ',').split(','):
152 |             assert grammeme in tag
153 | 
154 |     def test_not_contains(self, Tag):
155 |         # we need to use a prepared Tag class for this to work
156 |         tag = Tag('VERB,perf,tran plur,impr,excl')
157 | 
158 |         assert 'VERB' in tag
159 |         assert 'NOUN' not in tag
160 |         assert 'sing' not in tag
161 |         assert 'Dist' not in tag
162 | 
163 |     def test_contains_error(self, Tag):
164 |         # we need to use a prepared Tag class for this to work
165 |         tag = Tag('VERB,perf,tran plur,impr,excl')
166 | 
167 |         with pytest.raises(ValueError):
168 |             assert 'foo' in tag
169 | 
170 |         with pytest.raises(ValueError):
171 |             assert 'VERP' in tag
172 | 
173 |     def test_contains_set(self, Tag):
174 |         tag = Tag('VERB,perf,tran plur,impr,excl')
175 |         assert {'VERB', 'perf'} in tag
176 |         assert {'VERB', 'sing'} not in tag
177 | 
178 |         assert set() in tag  # ??
179 | 
180 |         with pytest.raises(ValueError):
181 |             assert {'VERB', 'pref'} in tag
182 | 
183 | 
class TestCyrillic:
    """Cyrillic representation of tags and conversion in both directions."""

    def test_cyr_repr(self, Tag):
        verb = Tag('VERB,perf,tran plur,impr,excl')
        assert verb.cyr_repr == 'ГЛ,сов,перех мн,повел,выкл'

    def test_grammemes_cyr(self, Tag):
        verb = Tag('VERB,perf,tran plur,impr,excl')
        expected = frozenset({'ГЛ', 'сов', 'перех', 'мн', 'повел', 'выкл'})
        assert verb.grammemes_cyr == expected

    def test_cyr_extra_grammemes(self, Tag):
        roman = Tag('ROMN')
        assert roman.cyr_repr == 'РИМ'

    @pytest.mark.parametrize(('lat', 'cyr'), [
        ('VERB,perf,tran plur,impr,excl', 'ГЛ,сов,перех мн,повел,выкл'),
        ('ROMN', 'РИМ'),
        ('ROMN,unknown_grammeme', 'РИМ,unknown_grammeme'),
        ('plur', 'мн'),
    ])
    def test_lat2cyr(self, lat, cyr, Tag, morph):
        # conversion through the tag class ...
        assert Tag.lat2cyr(lat) == cyr
        assert Tag.cyr2lat(cyr) == lat
        # ... and through the analyzer gives the same text
        assert morph.lat2cyr(lat) == cyr
        assert morph.cyr2lat(cyr) == lat
208 | 


--------------------------------------------------------------------------------
/tests/test_threading.py:
--------------------------------------------------------------------------------
 1 | import concurrent.futures
 2 | import random
 3 | 
 4 | import pymorphy2
 5 | from test_parsing import PARSES
 6 | from utils import assert_parse_is_correct
 7 | 
 8 | 
 9 | def _check_analyzer(morph, parses):
10 |     for word, normal_form, tag in parses:
11 |         parse = morph.parse(word)
12 |         assert_parse_is_correct(parse, word, normal_form, tag)
13 | 
14 | 
def _check_new_analyzer(parses):
    """
    Run the same checks as ``_check_analyzer``, but with a MorphAnalyzer
    that is created inside this call.
    """
    _check_analyzer(pymorphy2.MorphAnalyzer(), parses)
20 | 
21 | 
def _create_morph_analyzer(i):
    """
    Create a MorphAnalyzer and check one randomly chosen row of PARSES
    with it. The ``i`` argument is not used.
    """
    analyzer = pymorphy2.MorphAnalyzer()
    word, normal_form, tag = random.choice(PARSES)
    assert_parse_is_correct(analyzer.parse(word), word, normal_form, tag)
27 | 
28 | 
def test_threading_single_morph_analyzer(morph):
    """Run the PARSES checks 10 times on a pool of 3 threads, sharing one analyzer."""
    with concurrent.futures.ThreadPoolExecutor(3) as executor:
        # list() consumes the iterator returned by map(); an exception
        # raised in a worker is re-raised when its result is retrieved,
        # which makes the test fail. The results themselves are not needed.
        list(executor.map(_check_analyzer, [morph]*10, [PARSES]*10))
32 | 
33 | 
def test_threading_multiple_morph_analyzers():
    """Run the PARSES checks 10 times on a pool of 3 threads, each call with its own analyzer."""
    with concurrent.futures.ThreadPoolExecutor(3) as executor:
        # list() consumes the iterator returned by map(); an exception
        # raised in a worker is re-raised when its result is retrieved,
        # which makes the test fail. The results themselves are not needed.
        list(executor.map(_check_new_analyzer, [PARSES]*10))
37 | 
38 | 
def test_threading_create_analyzer():
    """Create and use an analyzer 10 times on a pool of 3 threads."""
    with concurrent.futures.ThreadPoolExecutor(3) as executor:
        # list() consumes the iterator returned by map(); an exception
        # raised in a worker is re-raised when its result is retrieved,
        # which makes the test fail. The results themselves are not needed.
        list(executor.map(_create_morph_analyzer, range(10)))
42 | 


--------------------------------------------------------------------------------
/tests/test_tokenizers.py:
--------------------------------------------------------------------------------
 1 | from pymorphy2.tokenizers import simple_word_tokenize
 2 | 
 3 | 
 4 | class TestSimpleWordTokenize:
 5 | 
 6 |     def test_split_simple(self):
 7 |         assert simple_word_tokenize('Мама мыла раму') == ['Мама', 'мыла', 'раму']
 8 |         assert simple_word_tokenize('Постой, паровоз!') == ['Постой', ',', 'паровоз', '!']
 9 | 
10 |     def test_split_hyphen(self):
11 |         assert simple_word_tokenize('Ростов-на-Дону') == ['Ростов-на-Дону']
12 |         assert simple_word_tokenize('Ура - победа') == ['Ура', '-', 'победа']
13 | 
14 |     def test_split_signs(self):
15 |         assert simple_word_tokenize('a+b=c_1') == ['a','+','b','=','c_1']
16 | 
17 |     def test_exctract_words(self):
18 |         text = '''Это  отразилось: на количественном,и на качествен_ном
19 |                 - росте карельско-финляндского сотрудничества - офигеть! кони+лошади=масло.
20 |                 -сказал кто-то --нет--'''
21 | 
22 |         assert simple_word_tokenize(text) == [
23 |             'Это', 'отразилось', ':', 'на', 'количественном', ',', 'и', 'на',
24 |             'качествен_ном', '-', 'росте', 'карельско-финляндского',
25 |             'сотрудничества', '-', 'офигеть', '!', 'кони', '+', 'лошади',
26 |             '=', 'масло', '.', '-сказал', 'кто-то', '--нет--',
27 |         ]
28 | 


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from pymorphy2.utils import get_mem_usage
 4 | 
 5 | 
 6 | def test_get_mem_usage():
 7 |     pytest.importorskip("psutil")
 8 |     rss = get_mem_usage()
 9 |     assert 1000000 < rss < 1000000000  # 1MB to 1GB
10 | 


--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
 1 | def assert_parse_is_correct(parses, word, normal_form, tag):
 2 |     """
 3 |     Check if one of the word parses has normal form ``normal_form``
 4 |     and tag ``tag``.
 5 |     """
 6 |     for p in parses:
 7 |         if p.normal_form == normal_form and str(p.tag) == tag:
 8 |             return
 9 |     assert False, parses
10 | 
11 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = {py37,py38,py39,py310,py311,pypy3}-{fast,slow},docs
 3 | minversion = 2.1.1
 4 | skip_missing_interpreters = True
 5 | 
 6 | [testenv]
 7 | deps=
 8 |     pytest
 9 |     pytest-cov
10 |     tqdm
11 |     coverage >= 4.0
12 |     psutil
13 | 
14 |     {py37,py38,py39,py310,py311}-fast: lxml
15 | 
16 |     ; keep support for pymorphy2==0.8 dictionaries
17 |     slow: pymorphy2-dicts==2.4.393442.3710985
18 | 
19 | 
commands=
    ; a workaround for tox bug: https://bitbucket.org/hpk42/tox/issue/176/
    pip install -I {toxinidir}

    fast: pip install pymorphy2[fast]

    py37,py38,py39,py310,py311,pypy3: pymorphy dict mem_usage

    ; "pytest" is the current name of the command; "py.test" is a legacy alias
    pytest \
        --doctest-modules \
        --cov=. \
        {posargs:pymorphy2 tests}
32 | 
33 | ; setenv=
34 | ;    PYMORPHY2_DICT_PATH = ../pymorphy2-dicts/pymorphy2_dicts/data
35 | 
36 | [testenv:docs]
37 | deps=
38 |     sphinx
39 |     sphinx-rtd-theme
40 | changedir=docs
41 | commands=
42 |     sphinx-build -W -b html . {envtmpdir}/html
43 | 


--------------------------------------------------------------------------------