├── rupo
├── __init__.py
├── dict
│ ├── __init__.py
│ ├── zaliznyak.py
│ ├── wiki.py
│ └── cmu.py
├── files
│ ├── __init__.py
│ ├── test_writer.py
│ ├── test_reader.py
│ ├── writer.py
│ └── reader.py
├── main
│ ├── __init__.py
│ ├── test_vocabulary.py
│ ├── test_markup.py
│ ├── morph.py
│ ├── test_tokenizer.py
│ ├── vocabulary.py
│ ├── tokenizer.py
│ └── markup.py
├── metre
│ ├── __init__.py
│ ├── test_pattern_analyzer.py
│ ├── test_metre_classifier.py
│ ├── pattern_analyzer.py
│ └── metre_classifier.py
├── rhymes
│ ├── __init__.py
│ ├── test_rhymes.py
│ └── rhymes.py
├── stress
│ ├── __init__.py
│ ├── test_dict.py
│ ├── test_predictor.py
│ ├── word.py
│ ├── predictor.py
│ └── dict.py
├── util
│ ├── __init__.py
│ ├── timeit.py
│ ├── tqdm_open.py
│ ├── data.py
│ ├── mixins.py
│ └── preprocess.py
├── generate
│ ├── __init__.py
│ ├── generator.py
│ └── transforms.py
├── data
│ └── examples
│ │ ├── text.txt
│ │ ├── text.xml
│ │ ├── morph_markup.txt
│ │ ├── markup.xml
│ │ └── markup.json
├── settings.py
├── test_api.py
└── api.py
├── setup.cfg
├── docs
├── source
│ ├── modules.rst
│ ├── index.rst
│ ├── rupo.rhymes.rst
│ ├── rupo.util.rst
│ ├── rupo.rst
│ ├── rupo.files.rst
│ ├── rupo.metre.rst
│ ├── rupo.main.rst
│ ├── rupo.generate.rst
│ └── conf.py
└── Makefile
├── .gitignore
├── .gitattributes
├── .codeclimate.yml
├── requirements.txt
├── download.sh
├── .travis.yml
├── generate_poem.py
├── setup.py
├── README.md
└── LICENSE
/rupo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/dict/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/files/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/main/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/metre/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/rhymes/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/stress/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/util/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/generate/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
--------------------------------------------------------------------------------
/docs/source/modules.rst:
--------------------------------------------------------------------------------
1 | rupo
2 | ====
3 |
4 | .. toctree::
5 | :maxdepth: 4
6 |
7 | rupo
8 | setup
9 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.pyc
4 | *~
5 | .idea
6 | *.trie
7 | *.pickle
8 | *.h5
9 | dist/
10 | rupo-*
11 | rupo.*
12 | rupo/data/generator_models/
--------------------------------------------------------------------------------
/rupo/data/examples/text.txt:
--------------------------------------------------------------------------------
1 | Забывши волнения жизни мятежной,
2 | Один жил в пустыне рыбак молодой.
3 | Однажды на скале прибрежной,
4 | Над тихой прозрачной рекой
5 | Он с удой беспечно
6 | Сидел
7 | И думой сердечной
8 | К прошедшему счастью летел.
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 |
2 | rupo/data/stress_models/stress_ru_word30_LSTM256_dropout0.4_acc99_wer3.h5 filter=lfs diff=lfs merge=lfs -text
3 | rupo/data/g2p_models/g2p_ru_maxlen40_BLSTM256_BLSTM256_dropout0.2_acc992_wer140.h5 filter=lfs diff=lfs merge=lfs -text
4 |
--------------------------------------------------------------------------------
/rupo/util/timeit.py:
--------------------------------------------------------------------------------
import functools
import logging
import time
3 |
4 |
def timeit(method):
    """Decorator that logs the wall-clock running time of *method*.

    :param method: callable to wrap.
    :return: wrapper that forwards all arguments and the return value,
        logging the elapsed time at DEBUG level.
    """
    @functools.wraps(method)  # preserve __name__/__doc__ of the wrapped callable
    def timed(*args, **kw):
        start = time.time()
        result = method(*args, **kw)
        elapsed = time.time() - start
        # Lazy %-args: message is only formatted if DEBUG logging is enabled.
        logging.debug('%s %2.2f sec', method.__name__, elapsed)
        return result
    return timed
--------------------------------------------------------------------------------
/.codeclimate.yml:
--------------------------------------------------------------------------------
1 | ---
2 | engines:
3 | csslint:
4 | enabled: false
5 | duplication:
6 | enabled: true
7 | config:
8 | languages:
9 | - python
10 | eslint:
11 | enabled: false
12 | fixme:
13 | enabled: true
14 | radon:
15 | enabled: true
16 | ratings:
17 | paths:
18 | - "**.py"
19 | exclude_paths: []
20 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | dicttoxml >= 1.7.4
2 | pygtrie >= 2.2
3 | numpy >= 1.12.1
4 | scipy >= 0.19.0
5 | scikit-learn >= 0.18.1
6 | jsonpickle >= 0.9.4
7 | pymorphy2 >= 0.8
8 | h5py >= 2.7.0
9 | russian-tagsets == 0.6
10 | tqdm >= 4.14.0
12 | rnnmorph >= 0.2.3
13 | sentence_splitter >= 1.2
14 | allennlp == 0.9.0
15 | overrides == 3.0.0
16 | git+https://github.com/IlyaGusev/rulm.git@4e78a49
17 | git+https://github.com/IlyaGusev/russ.git@288fe6a
18 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. rupo documentation master file, created by
2 | sphinx-quickstart on Mon Jul 24 20:49:37 2017.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to rupo's documentation!
7 | ================================
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 | :caption: Contents:
12 |
13 |
14 |
15 | Indices and tables
16 | ==================
17 |
18 | * :ref:`genindex`
19 | * :ref:`modindex`
20 | * :ref:`search`
21 |
--------------------------------------------------------------------------------
/rupo/rhymes/test_rhymes.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты для модуля рифм.
4 |
5 | import unittest
6 |
7 | from rupo.stress.word import StressedWord, Stress
8 | from rupo.rhymes.rhymes import Rhymes
9 |
10 |
class TestRhymes(unittest.TestCase):
    """Tests for the rhyme detection module."""

    def test_rhyme(self):
        rhyming_pair = (StressedWord("братишь", {Stress(4)}),
                        StressedWord("грустишь", {Stress(5)}))
        self.assertTrue(Rhymes.is_rhyme(*rhyming_pair))
        non_rhyming_pair = (StressedWord("наизусть", {Stress(4)}),
                            StressedWord("сестра", {Stress(5)}))
        self.assertFalse(Rhymes.is_rhyme(*non_rhyming_pair))
--------------------------------------------------------------------------------
/rupo/data/examples/text.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | -
4 |
5 | Михаил Лермонтов
6 | 1829
7 | 1829
8 | Забывши волнения жизни мятежной,
9 | Один жил в пустыне рыбак молодой.
10 | Однажды на скале прибрежной,
11 | Над тихой прозрачной рекой
12 | Он с удой беспечно
13 | Сидел
14 | И думой сердечной
15 | К прошедшему счастью летел.
16 | Забывши волнения жизни мятежной...
17 | 1829
18 |
19 |
--------------------------------------------------------------------------------
/docs/source/rupo.rhymes.rst:
--------------------------------------------------------------------------------
1 | rupo.rhymes package
2 | ===================
3 |
4 | Submodules
5 | ----------
6 |
7 | rupo.rhymes.rhymes module
8 | -------------------------
9 |
10 | .. automodule:: rupo.rhymes.rhymes
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | rupo.rhymes.test_rhymes module
16 | ------------------------------
17 |
18 | .. automodule:: rupo.rhymes.test_rhymes
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 |
24 | Module contents
25 | ---------------
26 |
27 | .. automodule:: rupo.rhymes
28 | :members:
29 | :undoc-members:
30 | :show-inheritance:
31 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = rupo
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/download.sh:
--------------------------------------------------------------------------------
1 | wget https://www.dropbox.com/s/dwkui2xqivzsyw5/generator_model.zip
2 | mkdir -p ./rupo/data/generator_models
3 | unzip generator_model.zip -d ./rupo/data/generator_models
4 | rm generator_model.zip
5 |
6 | wget https://www.dropbox.com/s/ajd8b7lpqaao7xt/stress_ru_main.tar.gz
7 | mkdir -p ./rupo/data/stress_models/ru_main
8 | tar -xzvf stress_ru_main.tar.gz --directory ./rupo/data/stress_models/ru_main
9 | rm stress_ru_main.tar.gz
10 |
11 | wget https://www.dropbox.com/s/7rk135fzd3i8kfw/g2p_models.zip
12 | mkdir -p ./rupo/data/g2p_models
13 | unzip g2p_models.zip -d ./rupo/data/g2p_models
14 | rm g2p_models.zip
15 |
16 | wget https://www.dropbox.com/s/znqlrb1xblh3amo/dict.zip
17 | mkdir -p ./rupo/data/dict
18 | unzip dict.zip -d ./rupo/data/dict
19 | rm dict.zip
20 |
--------------------------------------------------------------------------------
/docs/source/rupo.util.rst:
--------------------------------------------------------------------------------
1 | rupo.util package
2 | =================
3 |
4 | Submodules
5 | ----------
6 |
7 | rupo.util.data module
8 | ---------------------
9 |
10 | .. automodule:: rupo.util.data
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | rupo.util.mixins module
16 | -----------------------
17 |
18 | .. automodule:: rupo.util.mixins
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | rupo.util.preprocess module
24 | ---------------------------
25 |
26 | .. automodule:: rupo.util.preprocess
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 |
32 | Module contents
33 | ---------------
34 |
35 | .. automodule:: rupo.util
36 | :members:
37 | :undoc-members:
38 | :show-inheritance:
39 |
--------------------------------------------------------------------------------
/rupo/main/test_vocabulary.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты словаря.
4 |
5 | import os
6 | import unittest
7 |
8 | from rupo.main.vocabulary import StressVocabulary
9 | from rupo.settings import EXAMPLES_DIR, MARKUP_XML_EXAMPLE
10 |
11 |
class TestVocabulary(unittest.TestCase):
    """Tests for StressVocabulary parsing and serialization."""

    def test_vocabulary(self):
        dump_file = os.path.join(EXAMPLES_DIR, "temp.pickle")
        vocabulary = StressVocabulary()
        vocabulary.parse(MARKUP_XML_EXAMPLE)
        vocabulary.save(dump_file)
        self.assertTrue(os.path.exists(dump_file))
        os.remove(dump_file)
        try:
            # assertIsNotNone instead of assertTrue(... is not None): clearer failure message.
            self.assertIsNotNone(vocabulary.get_word(0))
        except IndexError:
            # self.fail instead of assertTrue(False): states intent explicitly.
            self.fail("vocabulary parsed from the example markup should contain at least one word")
24 |
--------------------------------------------------------------------------------
/docs/source/rupo.rst:
--------------------------------------------------------------------------------
1 | rupo package
2 | ============
3 |
4 | Subpackages
5 | -----------
6 |
7 | .. toctree::
8 |
9 | rupo.dict
10 | rupo.files
11 | rupo.g2p
12 | rupo.generate
13 | rupo.main
14 | rupo.metre
15 | rupo.morph
16 | rupo.rhymes
17 | rupo.stress
18 | rupo.util
19 |
20 | Submodules
21 | ----------
22 |
23 | rupo.api module
24 | ---------------
25 |
26 | .. automodule:: rupo.api
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | rupo.settings module
32 | --------------------
33 |
34 | .. automodule:: rupo.settings
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 | rupo.test_api module
40 | --------------------
41 |
42 | .. automodule:: rupo.test_api
43 | :members:
44 | :undoc-members:
45 | :show-inheritance:
46 |
47 |
48 | Module contents
49 | ---------------
50 |
51 | .. automodule:: rupo
52 | :members:
53 | :undoc-members:
54 | :show-inheritance:
55 |
--------------------------------------------------------------------------------
/docs/source/rupo.files.rst:
--------------------------------------------------------------------------------
1 | rupo.files package
2 | ==================
3 |
4 | Submodules
5 | ----------
6 |
7 | rupo.files.reader module
8 | ------------------------
9 |
10 | .. automodule:: rupo.files.reader
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | rupo.files.test_reader module
16 | -----------------------------
17 |
18 | .. automodule:: rupo.files.test_reader
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | rupo.files.test_writer module
24 | -----------------------------
25 |
26 | .. automodule:: rupo.files.test_writer
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | rupo.files.writer module
32 | ------------------------
33 |
34 | .. automodule:: rupo.files.writer
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 |
40 | Module contents
41 | ---------------
42 |
43 | .. automodule:: rupo.files
44 | :members:
45 | :undoc-members:
46 | :show-inheritance:
47 |
--------------------------------------------------------------------------------
/rupo/util/tqdm_open.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Авторы: Анастасьев Даниил
3 | # Описание: Обертка открытия больших файлов в счетчик tqdm
4 |
5 | from contextlib import contextmanager
6 | from os.path import getsize, basename
7 | from tqdm import tqdm
8 |
9 |
@contextmanager
def tqdm_open(filename: str, encoding: str = 'utf8'):
    """
    Open a text file with its line iterator wrapped in a tqdm progress bar.

    Yields an iterator over the file's lines; the bar's total is the file
    size in bytes and it is updated at most once per ~1 MiB of consumed
    input to keep per-line overhead low.

    :param filename: path of the file to open.
    :param encoding: text encoding passed to ``open``.
    """
    # File size in bytes defines the progress bar's 100% mark.
    total = getsize(filename)

    def wrapped_line_iterator(fd):
        with tqdm(total=total, unit="B", unit_scale=True, desc=basename(filename), miniters=1) as pb:
            processed_bytes = 0
            for line in fd:
                # NOTE(review): len(line) counts characters, not encoded bytes,
                # so the bar may undercount for multi-byte encodings — confirm intent.
                processed_bytes += len(line)
                # Flush accumulated progress roughly once per mebibyte.
                if processed_bytes >= 1024 * 1024:
                    pb.update(processed_bytes)
                    processed_bytes = 0
                yield line
            # Account for the remainder after the final line.
            pb.update(processed_bytes)

    with open(filename, encoding=encoding) as fd:
        yield wrapped_line_iterator(fd)
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | dist: xenial
2 | sudo: required
3 | language: python
4 | python: 3.6
5 | before_script:
6 | - git lfs pull
7 | install:
8 | - pip install --upgrade pip setuptools wheel
9 | - pip install -r requirements.txt
10 | - sh download.sh
11 | script:
12 | - pytest
13 | deploy:
14 | provider: pypi
15 | user: Phoenix120
16 | password:
17 | secure: ueaFMBiVlNSPmwivJ0uMGJw1ntj6nTuCqIuxYw0IXQDUebPkU6QZLH3o8k59BgD1o5+7yXmpWQHXb7B5UYr0pBC/4wEutatoGLzDmcTv1DaYn7kzrv5PTBSQVEvCQNzA8jNog0j6Ljg9Z7CN3H/vGIIdPRt1Gxmu0dPCrX3rGMlwKZLH5/gRaZlbgxtov/UGfIUEOgJmM1eJvZYS8Y5InmxlUBJmT0U1QDe1cBooax43KlspQzCJSJ6NciMGXSZUi5nPSb9sKbqvbOjRnCydcazeQwoRf14qIwFS3b7nL4TLb+rRSHFKuOJ9cmnAF+f5qo0ytJuYZqo3dNS8LqwuJH0tXyO4fo5T7Xe2k7eIfla4mg1T+uss5zIM0ttfW/ApKQanAr2kZ/tMl6ywWkWLJ1crSYM9RjUewZw8Z1qwYbEDJrcWIBZxkyPfEkzilgjAvlf4rmEUR3eJtm2YBgoz5XiNR2sdTeRFUgAcZUyC7nx+N15FJgw1HTtZeqbedPGgq84sMk31OxGfpGDJQ9iHqHavvTdxiRjA8YLNlxAeZ+Upop6zLznUM7iE742tNAjqjaXhGR6128Viggn4hL2PZuFYlmRx5Rt7LhCr1OgKViodNJwyoWZTFDl3p+b6GoJai9FPIewX6nmfQTAeYweFM8yz38akC8v21P3/kNeJ2/w=
18 | on:
19 | tags: true
20 |
--------------------------------------------------------------------------------
/docs/source/rupo.metre.rst:
--------------------------------------------------------------------------------
1 | rupo.metre package
2 | ==================
3 |
4 | Submodules
5 | ----------
6 |
7 | rupo.metre.metre_classifier module
8 | ----------------------------------
9 |
10 | .. automodule:: rupo.metre.metre_classifier
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | rupo.metre.pattern_analyzer module
16 | ----------------------------------
17 |
18 | .. automodule:: rupo.metre.pattern_analyzer
19 |     :members:
20 |     :undoc-members:
21 |     :show-inheritance:
22 |
23 | rupo.metre.test_metre_classifier module
24 | ---------------------------------------
25 |
26 | .. automodule:: rupo.metre.test_metre_classifier
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | rupo.metre.test_pattern_analyzer module
32 | ----------------------------------------
33 |
34 | .. automodule:: rupo.metre.test_pattern_analyzer
35 |     :members:
36 |     :undoc-members:
37 |     :show-inheritance:
38 |
39 |
40 | Module contents
41 | ---------------
42 |
43 | .. automodule:: rupo.metre
44 | :members:
45 | :undoc-members:
46 | :show-inheritance:
47 |
--------------------------------------------------------------------------------
/rupo/files/test_writer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты записи разметок.
4 |
5 | import unittest
6 | import os
7 |
8 | from rupo.main.markup import Markup
9 | from rupo.files.writer import Writer
10 | from rupo.files.reader import Reader, FileType
11 | from rupo.util.data import MARKUP_EXAMPLE
12 | from rupo.settings import EXAMPLES_DIR
13 |
14 |
class TestWriter(unittest.TestCase):
    """Round-trip tests for markup writing."""

    def test_write(self):
        markup = MARKUP_EXAMPLE

        xml_path = os.path.join(EXAMPLES_DIR, "temp.xml")
        Writer.write_markups(FileType.XML, [markup], xml_path)
        xml_markups = Reader.read_markups(xml_path, FileType.XML, is_processed=True)
        self.assertEqual(next(xml_markups), markup)
        xml_markups.close()
        os.remove(xml_path)

        raw_path = os.path.join(EXAMPLES_DIR, "temp.txt")
        Writer.write_markups(FileType.RAW, [markup], raw_path)
        raw_markups = Reader.read_markups(raw_path, FileType.RAW, is_processed=True)
        self.assertIsInstance(next(raw_markups), Markup)
        raw_markups.close()
        os.remove(raw_path)
--------------------------------------------------------------------------------
/rupo/util/data.py:
--------------------------------------------------------------------------------
1 | from rupo.main.markup import Markup, Line, Word, Syllable
2 |
# Hand-built markup fixture shared by the reader/writer and markup tests.
# Line/Word arguments look like (begin, end, text, children) with offsets into
# the full text; Syllable arguments look like (begin, end, number, text[, stress])
# with offsets relative to the word — TODO(review): confirm against
# rupo.main.markup's class definitions.
MARKUP_EXAMPLE = Markup("Соломка король себя.\n Пора виться майкой в.", [
    Line(0, 20, "Соломка король себя.", [
        Word(0, 7, "Соломка",
             [Syllable(0, 2, 0, "Со"),
              Syllable(2, 5, 1, "лом", 3),
              Syllable(5, 7, 2, "ка")]),
        Word(8, 14, "король",
             [Syllable(0, 2, 0, "ко"),
              Syllable(2, 6, 1, "роль", 3)]),
        Word(15, 19, "себя",
             [Syllable(0, 2, 0, "се"),
              Syllable(2, 4, 1, "бя", 3)])]),
    Line(21, 43, " Пора виться майкой в.",[
        Word(22, 26, "Пора",
             [Syllable(0, 2, 0, "По", 1),
              Syllable(2, 4, 1, "ра", 3)]),
        Word(27, 33, "виться",
             [Syllable(0, 2, 0, "ви", 1),
              Syllable(2, 6, 1, "ться")]),
        Word(34, 40, "майкой",
             [Syllable(0, 3, 0, "май", 1),
              Syllable(3, 6, 1, "кой")]),
        Word(41, 42, "в", [])
    ])])
--------------------------------------------------------------------------------
/rupo/dict/zaliznyak.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
class ZalyzniakDict:
    """Conversion helpers for a Zaliznyak-style dictionary file."""

    @staticmethod
    def convert_to_accent_only(dict_file, accent_file):
        """
        Convert a raw dictionary to a tab-separated stress file.

        Each input line has word forms after a "#", comma-separated; "'" marks
        a primary stress and "`" a secondary stress on the preceding letter,
        and "ё" is always treated as primary-stressed. Forms without any
        primary stress are skipped.
        """
        with open(dict_file, 'r', encoding='utf-8') as reader:
            lines = reader.readlines()
        with open(accent_file, 'w', encoding='utf-8') as writer:
            for line in lines:
                for raw_form in line.split("#")[1].split(","):
                    raw_form = raw_form.strip()
                    clean_word = ""
                    primary, secondary = [], []
                    pos = -1
                    for ch in raw_form:
                        if ch in ("'", "`"):
                            # Stress mark applies to the letter just consumed.
                            (secondary if ch == "`" else primary).append(pos)
                            continue
                        clean_word += ch
                        pos += 1
                        if ch == "ё":
                            primary.append(pos)
                    if primary:
                        record = "\t".join((clean_word,
                                            ",".join(str(p) for p in primary),
                                            ",".join(str(s) for s in secondary)))
                        writer.write(record + "\n")
--------------------------------------------------------------------------------
/docs/source/rupo.main.rst:
--------------------------------------------------------------------------------
1 | rupo.main package
2 | =================
3 |
4 | Submodules
5 | ----------
6 |
7 | rupo.main.markup module
8 | -----------------------
9 |
10 | .. automodule:: rupo.main.markup
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | rupo.main.test_markup module
16 | ----------------------------
17 |
18 | .. automodule:: rupo.main.test_markup
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | rupo.main.test_tokenizer module
24 | -------------------------------
25 |
26 | .. automodule:: rupo.main.test_tokenizer
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | rupo.main.test_vocabulary module
32 | --------------------------------
33 |
34 | .. automodule:: rupo.main.test_vocabulary
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 | rupo.main.tokenizer module
40 | --------------------------
41 |
42 | .. automodule:: rupo.main.tokenizer
43 | :members:
44 | :undoc-members:
45 | :show-inheritance:
46 |
47 | rupo.main.vocabulary module
48 | ---------------------------
49 |
50 | .. automodule:: rupo.main.vocabulary
51 | :members:
52 | :undoc-members:
53 | :show-inheritance:
54 |
55 |
56 | Module contents
57 | ---------------
58 |
59 | .. automodule:: rupo.main
60 | :members:
61 | :undoc-members:
62 | :show-inheritance:
63 |
--------------------------------------------------------------------------------
/generate_poem.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from rupo.api import Engine
4 | from rupo.settings import RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT, GENERATOR_MODEL_DIR
5 |
6 |
if __name__ == "__main__":
    # CLI for batch poem generation; each argument maps 1:1 onto an
    # Engine.generate_poem keyword, except --count (number of attempts).
    parser = argparse.ArgumentParser()
    for flag, arg_type, default in (
            ('--model-path', str, GENERATOR_MODEL_DIR),
            ('--token-vocab-path', str, None),
            ('--stress-vocab-path', str, None),
            ('--metre-schema', str, '+-'),
            ('--rhyme-pattern', str, 'abab'),
            ('--n-syllables', int, 8),
            ('--sampling-k', int, 50000),
            ('--beam-width', int, None),
            ('--temperature', float, 1.0),
            ('--last-text', str, ""),
            ('--count', int, 100)):
        parser.add_argument(flag, type=arg_type, default=default)

    params = vars(parser.parse_args())
    poem_count = params.pop('count')

    engine = Engine()
    engine.load(RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT)
    for seed in range(poem_count):
        print(seed)
        try:
            # Generation can fail for some seeds; report and move on.
            print(engine.generate_poem(seed=seed, **params))
        except AssertionError as error:
            print("Error: ", error)
34 |
--------------------------------------------------------------------------------
/rupo/stress/test_dict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты для словаря ударений.
4 |
5 | import unittest
6 |
7 | from rupo.stress.dict import StressDict
8 | from rupo.stress.word import Stress, StressedWord
9 | from rupo.util.preprocess import VOWELS
10 | from rupo.settings import RU_GRAPHEME_STRESS_PATH, ZALYZNYAK_DICT, RU_GRAPHEME_STRESS_TRIE_PATH
11 |
12 |
class TestStressDict(unittest.TestCase):
    """Tests for the stress dictionary."""

    @classmethod
    def setUpClass(cls):
        cls.dict = StressDict(
            language="ru",
            zalyzniak_dict=ZALYZNYAK_DICT,
            raw_dict_path=RU_GRAPHEME_STRESS_PATH,
            trie_path=RU_GRAPHEME_STRESS_TRIE_PATH)

    @classmethod
    def tearDownClass(cls):
        del cls.dict

    def test_get_stresses(self):
        cases = (
            ("данный", Stress.Type.PRIMARY, [1]),
            ("союза", Stress.Type.PRIMARY, [2]),
            ("англосакс", Stress.Type.SECONDARY, [0]),
            ("англосакс", Stress.Type.ANY, [0, 6]),
            ("пора", Stress.Type.PRIMARY, [1, 3]),
        )
        for word, stress_type, positions in cases:
            self.assertCountEqual(self.dict.get_stresses(word, stress_type), positions)

    def test_stress_only_in_vowels(self):
        # Every recorded stress position must land on a vowel letter.
        for word, stresses in self.dict.get_all():
            for stress in stresses:
                self.assertIn(word[stress.position], VOWELS)
35 |
--------------------------------------------------------------------------------
/rupo/main/test_markup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты для разметки.
4 |
5 | import unittest
6 |
7 | from rupo.util.data import MARKUP_EXAMPLE
8 | from rupo.main.markup import Markup
9 | from rupo.stress.predictor import CombinedStressPredictor
10 | from rupo.settings import RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT, CMU_DICT, \
11 | RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH
12 |
13 |
class TestMarkup(unittest.TestCase):
    """Round-trip and end-to-end tests for Markup."""

    @classmethod
    def setUpClass(cls):
        cls.stress_predictor = CombinedStressPredictor(
            stress_model_path=RU_STRESS_DEFAULT_MODEL,
            zalyzniak_dict=ZALYZNYAK_DICT,
            cmu_dict=CMU_DICT,
            raw_stress_dict_path=RU_GRAPHEME_STRESS_PATH,
            stress_trie_path=RU_GRAPHEME_STRESS_TRIE_PATH
        )

    @classmethod
    def tearDownClass(cls):
        del cls.stress_predictor

    def test_from_to(self):
        # XML and JSON serialization must round-trip losslessly.
        xml_dump = MARKUP_EXAMPLE.to_xml()
        self.assertEqual(MARKUP_EXAMPLE, Markup().from_xml(xml_dump))
        json_dump = MARKUP_EXAMPLE.to_json()
        self.assertEqual(MARKUP_EXAMPLE, Markup().from_json(json_dump))

    def test_process_text(self):
        source_text = "Соломка король себя.\n Пора виться майкой в."
        processed = Markup.process_text(source_text, self.stress_predictor)
        self.assertEqual(processed.to_json(), MARKUP_EXAMPLE.to_json())
39 |
40 |
--------------------------------------------------------------------------------
/rupo/files/test_reader.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты считывателя разметок.
4 |
5 | import unittest
6 |
7 | from rupo.files.reader import Reader, FileType
8 | from rupo.stress.predictor import CombinedStressPredictor
9 | from rupo.main.markup import Markup, Line, Word
10 | from rupo.settings import MARKUP_XML_EXAMPLE, TEXT_XML_EXAMPLE, MARKUP_JSON_EXAMPLE
11 |
12 |
class TestReader(unittest.TestCase):
    """Tests for the markup reader."""

    @classmethod
    def setUpClass(cls):
        cls.stress_predictor = CombinedStressPredictor()

    def test_read(self):
        # Generators are lazy, so reading still happens in this order.
        markup_generators = (
            Reader.read_markups(MARKUP_XML_EXAMPLE, FileType.XML, is_processed=True),
            Reader.read_markups(TEXT_XML_EXAMPLE, FileType.XML, is_processed=False,
                                stress_predictor=self.stress_predictor),
            Reader.read_markups(MARKUP_JSON_EXAMPLE, FileType.JSON, is_processed=True),
        )
        for markup_generator in markup_generators:
            self.__check_markup(next(markup_generator))

    def __check_markup(self, markup):
        # A well-formed markup has text and at least one line of words.
        self.assertIsInstance(markup, Markup)
        self.assertIsNotNone(markup.text)
        self.assertNotEqual(markup.text, "")
        self.assertNotEqual(markup.lines, [])
        self.assertIsInstance(markup.lines[0], Line)
        self.assertIsInstance(markup.lines[0].words[0], Word)
36 |
--------------------------------------------------------------------------------
/rupo/stress/test_predictor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты предсказателя ударений.
4 |
5 | import unittest
6 |
7 | from rupo.stress.predictor import CombinedStressPredictor
8 | from rupo.settings import RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT, CMU_DICT, \
9 | RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH
10 |
11 |
class TestStressPredictor(unittest.TestCase):
    """Tests for the combined stress predictor."""

    @classmethod
    def setUpClass(cls):
        cls.stress_predictor = CombinedStressPredictor(
            stress_model_path=RU_STRESS_DEFAULT_MODEL,
            zalyzniak_dict=ZALYZNYAK_DICT,
            cmu_dict=CMU_DICT,
            raw_stress_dict_path=RU_GRAPHEME_STRESS_PATH,
            stress_trie_path=RU_GRAPHEME_STRESS_TRIE_PATH
        )

    def test_stress(self):
        # word -> expected stressed letter positions (order-insensitive).
        checks = {
            'я': [0],
            'в': [],
            'он': [0],
            'майка': [1],
            'соломка': [3],
            'изжить': [3],
            'виться': [1],
            'данный': [1],
            'зорька': [1],
            'банка': [1],
            'оттечь': [3],
            'советского': [3],
            'союза': [2],
            'пора': [3, 1],
            'изжила': [5],
            'меда': [1],
            'автоподъёмник': [8],
            'каракуля': [3],
            'супервайзер': [6],
            'колесом': [5]
        }
        for word, expected_positions in checks.items():
            predicted = sorted(self.stress_predictor.predict(word))
            self.assertEqual(predicted, sorted(expected_positions))
48 |
--------------------------------------------------------------------------------
/rupo/util/mixins.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Служебные миксины для удобства сериализации.
4 |
5 |
def to_dict(obj):
    """
    Recursively convert an object into plain dicts/lists/scalars.

    Dicts keep their keys with converted values; non-string iterables become
    lists; objects with a ``__dict__`` become dicts of their public,
    non-callable attributes; everything else is returned unchanged.

    :param obj: object to convert.
    :return: the plain-data representation of ``obj``.
    """
    if isinstance(obj, dict):
        return {key: to_dict(value) for key, value in obj.items()}
    if hasattr(obj, "__iter__") and not isinstance(obj, str):
        return [to_dict(item) for item in obj]
    if hasattr(obj, "__dict__"):
        # Skip private (underscore) and callable attributes.
        return {key: to_dict(value)
                for key, value in obj.__dict__.items()
                if not callable(value) and not key.startswith('_')}
    return obj
26 |
27 |
class CommonMixin(object):
    """
    Mixin providing attribute-based equality, hashing and dict conversion.
    """
    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return NotImplemented
        return self.__dict__ == other.__dict__

    def __ne__(self, other):
        if not isinstance(other, self.__class__):
            return NotImplemented
        return not self.__eq__(other)

    def __hash__(self):
        # Sort items so hashing does not depend on attribute insertion order.
        return hash(tuple(sorted(self.__dict__.items())))

    def __repr__(self):
        return str(self.to_dict())

    # String form is identical to the debug representation.
    __str__ = __repr__

    def to_dict(self):
        return to_dict(self)
--------------------------------------------------------------------------------
/rupo/data/examples/morph_markup.txt:
--------------------------------------------------------------------------------
1 | забывши забыть VERB Aspect=Perf|Tense=Past|VerbForm=Trans
2 | волнения волнение NOUN Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing
3 | жизни жизнь NOUN Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing
4 | мятежной мятежный ADJ Case=Gen|Gender=Fem|Number=Sing
5 |
6 | один один DET Case=Nom|Gender=Masc|Number=Sing
7 | жил жить VERB Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin
8 | в в ADP _
9 | пустыне пустыня NOUN Animacy=Inan|Case=Loc|Gender=Fem|Number=Sing
10 | рыбак рыбак NOUN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing
11 | молодой молодая NOUN Animacy=Anim|Case=Gen|Gender=Fem|Number=Sing
12 |
13 | однажды однажды ADV _
14 | на на ADP _
15 | скале скал NOUN Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing
16 | прибрежной прибрежный ADJ Case=Gen|Gender=Fem|Number=Sing
17 |
18 | над над ADP _
19 | тихой тихий ADJ Case=Gen|Gender=Fem|Number=Sing
20 | прозрачной прозрачный ADJ Case=Gen|Gender=Fem|Number=Sing
21 | рекой река NOUN Animacy=Inan|Case=Ins|Gender=Fem|Number=Sing
22 |
23 | он он PRON Case=Nom|Gender=Masc|Number=Sing|Person=3
24 | с с ADP _
25 | удой уда PROPN Animacy=Inan|Case=Ins|Gender=Fem|Number=Sing
26 | беспечно беспечно ADV _
27 |
28 | сидел сидеть VERB Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin
29 |
30 | и и CONJ _
31 | думой дума NOUN Animacy=Inan|Case=Ins|Gender=Fem|Number=Sing
32 | сердечной сердечный ADJ Case=Gen|Gender=Fem|Number=Sing
33 |
34 | к к ADP _
35 | прошедшему прошедшее NOUN Animacy=Inan|Case=Dat|Gender=Neut|Number=Sing
36 | счастью счастие NOUN Animacy=Inan|Case=Dat|Gender=Neut|Number=Sing
37 | летел лететь VERB Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin
38 |
39 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 | from setuptools.command.develop import develop
3 | from setuptools.command.install import install
4 |
5 |
class PostDevelopCommand(develop):
    """`setup.py develop` hook; currently just delegates to the default command."""
    def run(self):
        super().run()
9 |
10 |
class PostInstallCommand(install):
    """`setup.py install` hook; currently just delegates to the default command."""
    def run(self):
        super().run()
14 |
15 |
setup(
    name='rupo',
    packages=find_packages(),
    version='0.2.8',
    description='RuPo: library for russian poetry analysis and generation',
    author='Ilya Gusev',
    author_email='phoenixilya@gmail.com',
    url='https://github.com/IlyaGusev/rupo',
    download_url='https://github.com/IlyaGusev/rupo/archive/0.2.8.tar.gz',
    keywords=['poetry', 'nlp', 'russian'],
    # Non-code files shipped inside the installed package.
    package_data={
        'rupo': ['data/examples/*', 'data/hyphen-tokens.txt']
    },
    install_requires=[
        'dicttoxml>=1.7.4',
        'pygtrie>=2.2',
        'numpy>=1.11.3',
        'scipy>=0.18.1',
        'scikit-learn>=0.18.1',
        'jsonpickle>=0.9.4',
        'pymorphy2>=0.8',
        'h5py>=2.7.0',
        'russian-tagsets==0.6',
        'tqdm>=4.14.0',
        'rnnmorph==0.2.3',
        'sentence_splitter>=1.2',
        'rulm==0.0.2',
        'russ==0.0.1'
    ],
    cmdclass={
        'develop': PostDevelopCommand,
        'install': PostInstallCommand,
    },
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',

        'Topic :: Text Processing :: Linguistic',

        'License :: OSI Approved :: Apache Software License',

        'Natural Language :: Russian',

        # Fix: '3.5' was accidentally listed twice.
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
    ],
)
64 |
--------------------------------------------------------------------------------
/rupo/util/preprocess.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Служебные функции и константы.
4 |
5 | import re
6 |
# NOTE(review): "CYRRILIC" is a misspelling of "CYRILLIC"; kept because other
# modules import these names.
CYRRILIC_LOWER_VOWELS = "аоэиуыеёюя"
# Also contains the soft/hard signs "ь"/"ъ" — TODO confirm that is intended.
CYRRILIC_LOWER_CONSONANTS = "йцкнгшщзхъфвпрлджчсмтьб"
# Latin and Cyrillic vowels, both cases.
VOWELS = "aeiouAEIOUаоэиуыеёюяАОЭИУЫЕЁЮЯ"
# Presumably the sonorants that may close a syllable — confirm against the
# syllable-splitting code that consumes this constant.
CLOSED_SYLLABLE_CHARS = "рлймнРЛЙМН"
11 |
12 |
def text_to_wordlist(sentence, cyrillic=False):
    """
    Split a sentence into lowercase words, dropping every non-letter character.

    :param sentence: input text.
    :param cyrillic: when True, keep only Cyrillic letters.
    :return: list of lowercase words.
    """
    pattern = "[^а-яА-Яё]" if cyrillic else "[^а-яА-Яёa-zA-Z]"
    return re.sub(pattern, " ", sentence).lower().split()
20 |
21 |
def text_to_sentences(text):
    """
    Split raw text into sentences with regex heuristics.

    Dots that are unlikely to end a sentence (initials, abbreviations, ".,")
    are temporarily replaced with "$" so they survive the split, then restored.

    :param text: raw text.
    :return: list of stripped sentence strings.
    """
    # Sentence boundary: ./?/! followed by whitespace and a capital letter,
    # or a semicolon / colon-plus-dash combination.
    boundary = r"[\.\?!](?=[\s\n]*[A-ZА-Я])|;|:-|:—|:—|: —|: —|: -"
    # NOTE(review): the [A-z] ranges also match a few ASCII punctuation
    # characters between 'Z' and 'a' — kept as-is to preserve behaviour.
    protected = [r"(?<=[^A-zА-я][A-ZА-Я])\.",
                 r"(?<=[^A-zА-я][A-zА-я])\.[ ]?(?=[A-zА-я][^A-zА-я])",
                 r"\.(?=,)"
                 ]
    for pattern in protected:
        text = "$".join(re.split(pattern, text))

    # Return a list instead of a lazy single-use `map` iterator so callers
    # can iterate (or index) the result more than once.
    return [part.strip().replace("$", ".") for part in re.split(boundary, text)]
34 |
35 |
def to_cyrrilic(text):
    """
    Map Latin lookalike letters onto their Cyrillic twins and fold "ё" into "е".

    :param text: input string.
    :return: converted string.
    """
    # Single-pass equivalent of the chained str.replace calls.
    return text.translate(str.maketrans("xayocё", "хауосе"))
43 |
44 |
def normilize_line(text):
    """
    Normalise a line for comparison: keep letters/digits only, lowercase,
    drop all whitespace and fold Latin lookalikes into Cyrillic.

    (The name keeps its historical misspelling for compatibility.)
    """
    letters_only = re.sub("[^а-яА-Яёa-zA-Z0-9]", " ", text)
    squashed = "".join(letters_only.lower().split())
    return to_cyrrilic(squashed)
50 |
51 |
def count_vowels(string):
    """Return how many characters of the string are vowels (Latin or Cyrillic)."""
    return sum(1 for symbol in string if symbol in VOWELS)
58 |
59 |
def get_first_vowel_position(string):
    """Return the index of the first vowel in the string, or -1 when there is none."""
    return next((position for position, symbol in enumerate(string)
                 if symbol in VOWELS), -1)
65 |
66 |
def etree_to_dict(t):
    """
    Recursively convert an element tree into nested dicts.

    Inner nodes map tag -> list of child dicts; leaves map tag -> text.
    The previous version wrapped the children in a lazy ``map`` object,
    which is always truthy, so the ``or t.text`` fallback for leaf nodes
    could never trigger (and the lazy object was single-use).
    """
    children = [etree_to_dict(child) for child in t.iterchildren()]
    return {t.tag: children or t.text}
--------------------------------------------------------------------------------
/rupo/main/morph.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Разметка по грамматическим значениям
4 |
5 | import os
6 | from typing import List, TextIO
7 |
8 | from sentence_splitter import SentenceSplitter
9 | from rnnmorph.predictor import RNNMorphPredictor
10 |
11 | from rupo.main.tokenizer import Tokenizer, Token
12 |
13 |
class Morph:
    """Morphological (grammatical) markup of plain-text files."""

    @staticmethod
    def get_morph_markup(input_filenames: List[str], output_filename: str):
        """
        Write grammatical markup for every sentence of the input files.

        :param input_filenames: input plain-text files.
        :param output_filename: path the markup is written to.
        """
        if os.path.exists(output_filename):
            os.remove(output_filename)

        sentence_splitter = SentenceSplitter(language='ru')
        morph_predictor = RNNMorphPredictor()

        # Open the output once for all inputs: the previous code reopened it
        # with mode "w+" for every input file, truncating everything written
        # for the earlier files.
        with open(output_filename, "w", encoding="utf-8") as w:
            for filename in input_filenames:
                with open(filename, "r", encoding="utf-8") as r:
                    for line in r:
                        Morph.__process_line(line, w, sentence_splitter, morph_predictor)

    @staticmethod
    def __process_line(line: str, output_file: TextIO, sentence_splitter: SentenceSplitter,
                       morph_predictor: RNNMorphPredictor):
        """Tag one line; write word<TAB>lemma<TAB>POS<TAB>grammemes rows, blank line per sentence."""
        sentences = sentence_splitter.split(line)
        for sentence in sentences:
            words = [token.text for token in Tokenizer.tokenize(sentence)
                     if token.text != '' and token.token_type != Token.TokenType.SPACE]
            if not words:
                continue
            forms = morph_predictor.predict_sentence_tags(words)
            for form in forms:
                # Punctuation carries no grammatical value in this markup.
                if form.pos == "PUNCT":
                    continue
                output_file.write("%s\t%s\t%s\t%s\n" % (form.word, form.normal_form, form.pos, form.tag))
            output_file.write("\n")
48 | output_file.write("\n")
49 |
--------------------------------------------------------------------------------
/rupo/stress/word.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Класс слова с ударением.
4 |
5 | from enum import Enum
6 | from typing import List, Set
7 | from russ.syllables import get_syllables
8 |
9 |
class Stress:
    """
    A stress mark: a character position plus a stress type.
    """

    class Type(Enum):
        ANY = -1
        PRIMARY = 0
        SECONDARY = 1

    def __init__(self, position: int, stress_type: Type=Type.PRIMARY) -> None:
        self.position = position
        self.type = stress_type

    def __eq__(self, other: 'Stress'):
        # Both the position and the type must match.
        return (self.position, self.type) == (other.position, other.type)

    def __hash__(self):
        # Hash only on position, so stresses of different types at the same
        # position land in the same bucket.
        return hash(self.position)

    def __str__(self):
        return "{}\t{}".format(self.position, self.type)

    def __repr__(self):
        return self.__str__()
35 |
36 |
class StressedWord:
    """
    A word together with the set of its stresses and derived syllables.
    """

    def __init__(self, text: str, stresses: Set[Stress]) -> None:
        self.text = text
        self.stresses = stresses
        self.syllables = get_syllables(text)
        self.__accent_syllables()

    def get_primary_stresses(self) -> List[int]:
        return self.__positions_of(Stress.Type.PRIMARY)

    def get_secondary_stresses(self) -> List[int]:
        return self.__positions_of(Stress.Type.SECONDARY)

    def add_stress(self, position: int, stress_type: Stress.Type=Stress.Type.PRIMARY) -> None:
        """Add a single stress and refresh syllable accents."""
        self.stresses.add(Stress(position, stress_type))
        self.__accent_syllables()

    def add_stresses(self, stresses: List[Stress]) -> None:
        """Merge several stresses in and refresh syllable accents."""
        self.stresses = set(self.stresses).union(set(stresses))
        self.__accent_syllables()

    def __positions_of(self, stress_type) -> List[int]:
        # Positions of all stresses of the given type.
        return [stress.position for stress in self.stresses if stress.type == stress_type]

    def __accent_syllables(self):
        # A syllable is accented when a stress sits on its vowel; the probe
        # Stress(...) defaults to Type.PRIMARY, so only primary stresses can
        # mark a syllable (set equality compares type as well as position).
        for syllable in self.syllables:
            vowel_position = syllable.vowel()
            syllable.stress = vowel_position if Stress(vowel_position) in self.stresses else -1

    def __str__(self):
        primary = ",".join(str(position) for position in self.get_primary_stresses())
        secondary = ",".join(str(position) for position in self.get_secondary_stresses())
        return self.text + "\t" + primary + "\t" + secondary

    def __repr__(self):
        return self.__str__()

    def __hash__(self):
        return hash(self.text)

    def __eq__(self, other: 'StressedWord'):
        return self.text == other.text
81 |
--------------------------------------------------------------------------------
/rupo/dict/wiki.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from rupo.settings import RU_GRAPHEME_SET
4 |
5 | from russ.syllables import VOWELS
6 |
7 |
class WikiDict:
    """Converters/cleaners for the Wiktionary-derived pronunciation dictionary."""

    @staticmethod
    def convert_to_g2p_only(source_file, destination_file):
        """
        Copy a word<TAB>transcription file, stripping the stress marks
        (' and ˌ) from the transcription column.

        :param source_file: input dictionary path.
        :param destination_file: output path (may equal source_file).
        """
        # Read everything first so that source and destination may point to
        # the same file; also split each line only once (the previous code
        # split every line twice and built two parallel lists needlessly).
        with open(source_file, 'r', encoding='utf-8') as r:
            lines = r.readlines()
        with open(destination_file, 'w', encoding='utf-8') as w:
            for line in lines:
                columns = line.split("\t")
                word = columns[0].strip()
                phonetic_word = columns[1].replace("'", "").replace("ˌ", "").strip()
                w.write(word + "\t" + phonetic_word + "\n")

    @staticmethod
    def first_clean_up(filename):
        """
        Filter and normalise a raw word#transcription dump in place.

        Keeps only entries that carry a stress mark, consist solely of known
        Russian graphemes and whose vowel counts agree between the word and
        its transcription.
        """
        words = []
        phonetic_words = []
        # Explicit encoding: the dump is UTF-8; the platform default (e.g.
        # cp1251/cp1252 on Windows) would corrupt the Cyrillic text.
        with open(filename, "r", encoding="utf-8") as f:
            lines = f.readlines()
            print(len(lines))
            for line in lines:
                word = line.split("#")[0]
                word = word.lower()
                phonetic_word = line.split("#")[1]
                # Entries without any stress mark are useless as stress data.
                if "'" not in phonetic_word and "ˈ" not in phonetic_word:
                    continue
                # Keep only the first of several transcription variants.
                phonetic_word = phonetic_word.split("/")[0].strip()
                phonetic_word = phonetic_word.split("~")[0].strip()
                phonetic_word = phonetic_word.split(";")[0].strip()
                phonetic_word = phonetic_word.split(",")[0].strip()
                phonetic_word = phonetic_word.replace("ˈ", "'")
                phonetic_word = phonetic_word.replace(":", "ː")
                phonetic_word = re.sub(r"[\s̟̥̻.̞]", "", phonetic_word)
                phonetic_word = re.sub(r"[(⁽][^)⁾]*[)⁾]", "", phonetic_word)
                # NOTE(review): `Phonemes` is not imported anywhere in this
                # module, so this line raises NameError at runtime — the
                # missing import must be restored before first_clean_up works.
                phonetic_word = Phonemes.clean(phonetic_word)
                wrong_chars = [ch for ch in word if ch not in RU_GRAPHEME_SET]
                if len(wrong_chars) != 0:
                    continue
                if len(word) == 0 or len(phonetic_word) == 0:
                    continue
                # Vowel counts must agree between spelling and transcription.
                if sum([1 for ch in word if ch in "еуаоэяиюёы"]) != \
                        sum([1 for ch in phonetic_word if ch in VOWELS]):
                    continue
                words.append(word)
                phonetic_words.append(phonetic_word)
        print(len(words))
        with open(filename, "w", encoding="utf-8") as f:
            for i, word in enumerate(words):
                f.write(word + "\t" + phonetic_words[i] + "\n")
--------------------------------------------------------------------------------
/docs/source/rupo.generate.rst:
--------------------------------------------------------------------------------
1 | rupo.generate package
2 | =====================
3 |
4 | Submodules
5 | ----------
6 |
7 | rupo.generate.corpora_information_loader module
8 | -----------------------------------------------
9 |
10 | .. automodule:: rupo.generate.corpora_information_loader
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | rupo.generate.filters module
16 | ----------------------------
17 |
18 | .. automodule:: rupo.generate.filters
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | rupo.generate.gen module
24 | ------------------------
25 |
26 | .. automodule:: rupo.generate.gen
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | rupo.generate.generator module
32 | ------------------------------
33 |
34 | .. automodule:: rupo.generate.generator
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 | rupo.generate.grammeme_vectorizer module
40 | ----------------------------------------
41 |
42 | .. automodule:: rupo.generate.grammeme_vectorizer
43 | :members:
44 | :undoc-members:
45 | :show-inheritance:
46 |
47 | rupo.generate.lstm module
48 | -------------------------
49 |
50 | .. automodule:: rupo.generate.lstm
51 | :members:
52 | :undoc-members:
53 | :show-inheritance:
54 |
55 | rupo.generate.markov module
56 | ---------------------------
57 |
58 | .. automodule:: rupo.generate.markov
59 | :members:
60 | :undoc-members:
61 | :show-inheritance:
62 |
63 | rupo.generate.model_container module
64 | ------------------------------------
65 |
66 | .. automodule:: rupo.generate.model_container
67 | :members:
68 | :undoc-members:
69 | :show-inheritance:
70 |
71 | rupo.generate.test_generator module
72 | -----------------------------------
73 |
74 | .. automodule:: rupo.generate.test_generator
75 | :members:
76 | :undoc-members:
77 | :show-inheritance:
78 |
79 | rupo.generate.test_markov module
80 | --------------------------------
81 |
82 | .. automodule:: rupo.generate.test_markov
83 | :members:
84 | :undoc-members:
85 | :show-inheritance:
86 |
87 | rupo.generate.tqdm_open module
88 | ------------------------------
89 |
90 | .. automodule:: rupo.generate.tqdm_open
91 | :members:
92 | :undoc-members:
93 | :show-inheritance:
94 |
95 | rupo.generate.word_form module
96 | ------------------------------
97 |
98 | .. automodule:: rupo.generate.word_form
99 | :members:
100 | :undoc-members:
101 | :show-inheritance:
102 |
103 | rupo.generate.word_form_vocabulary module
104 | -----------------------------------------
105 |
106 | .. automodule:: rupo.generate.word_form_vocabulary
107 | :members:
108 | :undoc-members:
109 | :show-inheritance:
110 |
111 |
112 | Module contents
113 | ---------------
114 |
115 | .. automodule:: rupo.generate
116 | :members:
117 | :undoc-members:
118 | :show-inheritance:
119 |
--------------------------------------------------------------------------------
/rupo/settings.py:
--------------------------------------------------------------------------------
# Resource paths for all bundled data, resolved relative to the installed package.
from pkg_resources import resource_filename
# NOTE(review): "foo.conf" does not exist in the package; this looks like a
# leftover from the pkg_resources documentation example — confirm and remove.
foo_config = resource_filename(__name__, 'foo.conf')

CLASSIFIER_DIR = resource_filename(__name__, "data/classifier/")

DATA_DIR = resource_filename(__name__, "data")

# Pronunciation / stress dictionaries.
DICT_DIR = resource_filename(__name__, "data/dict")
CMU_DICT = resource_filename(__name__, "data/dict/cmu.txt")
ZALYZNYAK_DICT = resource_filename(__name__, "data/dict/zaliznyak.txt")
RU_WIKI_DICT = resource_filename(__name__, "data/dict/wiki_ru.txt")

# Grapheme-to-phoneme aligner pickles.
RU_ALIGNER_DEFAULT_PATH = resource_filename(__name__, "data/g2p_models/ru_aligner.pickle")
EN_ALIGNER_DEFAULT_PATH = resource_filename(__name__, "data/g2p_models/en_aligner.pickle")

# Russian stress/G2P resources and models.
RU_GRAPHEME_STRESS_PATH = resource_filename(__name__, "data/dict/ru_grapheme_stress.txt")
RU_GRAPHEME_STRESS_TRIE_PATH = resource_filename(__name__, "data/dict/ru_grapheme_stress.trie")
RU_G2P_DICT_PATH = resource_filename(__name__, "data/dict/ru_g2p.txt")
RU_PHONEME_STRESS_PATH = resource_filename(__name__, "data/dict/ru_phoneme_stress.txt")
RU_PHONEME_STRESS_BIG_PATH = resource_filename(__name__, "data/dict/ru_phoneme_stress_big.txt")
RU_PHONEME_STRESS_TRIE_PATH = resource_filename(__name__, "data/dict/ru_phoneme_stress.trie")
RU_G2P_DEFAULT_MODEL = resource_filename(__name__, "data/g2p_models/g2p_ru_maxlen40_BLSTM256_BLSTM256_dropout0.2_acc992_wer140.h5")
RU_STRESS_DEFAULT_MODEL = resource_filename(__name__, "data/stress_models/ru_main")

# English stress/G2P resources and models.
EN_G2P_DICT_PATH = resource_filename(__name__, "data/dict/en_g2p.txt")
EN_PHONEME_STRESS_PATH = resource_filename(__name__, "data/dict/en_phoneme_stress.txt")
EN_PHONEME_STRESS_TRIE_PATH = resource_filename(__name__, "data/dict/en_phoneme_stress.trie")
EN_G2P_DEFAULT_MODEL = resource_filename(__name__, "data/g2p_models/g2p_en_maxlen40_BLSTM256+LSTM256_LSTM128_dropout0.4_acc977_wer379.h5")
EN_STRESS_DEFAULT_MODEL = resource_filename(__name__, "data/stress_models/stress_en_LSTM128_dropout0.2_acc99_wer10.h5")

# Example files used by the tests.
EXAMPLES_DIR = resource_filename(__name__, "data/examples/")
MARKUP_XML_EXAMPLE = resource_filename(__name__, "data/examples/markup.xml")
MARKUP_JSON_EXAMPLE = resource_filename(__name__, "data/examples/markup.json")
TEXT_XML_EXAMPLE = resource_filename(__name__, "data/examples/text.xml")
TEXT_TXT_EXAMPLE = resource_filename(__name__, "data/examples/text.txt")
HYPHEN_TOKENS = resource_filename(__name__, "data/hyphen-tokens.txt")

G2P_CURRENT_MODEL_DIR = resource_filename(__name__, "data/g2p_models/")
ACCENT_CURRENT_MODEL_DIR = resource_filename(__name__, "data/stress_models/")

# Poem-generator model artifacts.
GENERATOR_MODEL_DIR = resource_filename(__name__, "data/generator_models/")
GENERATOR_WORD_FORM_VOCAB_PATH = resource_filename(__name__, "data/generator_models/vocabulary")
GENERATOR_VOCAB_PATH = resource_filename(__name__, "data/generator_models/stress.pickle")

TEMP_PATH = resource_filename(__name__, "data/temp.txt")

# Alphabets accepted by the stress/G2P models.
RU_GRAPHEME_SET = " абвгдеёжзийклмнопрстуфхцчшщьыъэюя-"
EN_GRAPHEME_SET = " abcdefghijklmnopqrstuvwxyz.'-"
49 |
--------------------------------------------------------------------------------
/rupo/main/test_tokenizer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты для токенизатора.
4 |
5 | import unittest
6 |
7 | from rupo.main.tokenizer import Tokenizer, SentenceTokenizer, Token
8 |
9 |
class TestTokenizer(unittest.TestCase):
    """Tests of the word tokenizer: token text, type and [begin, end) offsets."""

    def test_tokenizer(self):
        """Words, punctuation and spaces are split apart; hyphenated words stay whole."""
        text = "О, когда-нибудь, когда?"
        self.assertEqual(Tokenizer.tokenize(text), [
            Token('О', Token.TokenType.WORD, 0, 1),
            Token(',', Token.TokenType.PUNCTUATION, 1, 2),
            Token(' ', Token.TokenType.SPACE, 2, 3),
            Token('когда-нибудь', Token.TokenType.WORD, 3, 15),
            Token(',', Token.TokenType.PUNCTUATION, 15, 16),
            Token(' ', Token.TokenType.SPACE, 16, 17),
            Token('когда', Token.TokenType.WORD, 17, 22),
            Token('?', Token.TokenType.PUNCTUATION, 22, 23)])

        text = " Пора"
        self.assertEqual(Tokenizer.tokenize(text), [
            Token(' ', Token.TokenType.SPACE, 0, 1),
            Token('Пора', Token.TokenType.WORD, 1, 5)])

    def test_numbers(self):
        """Numbers become NUMBER tokens; with replace_numbers=True they become WORD 'ЧИСЛО'."""
        text = "Очевидно, 1 января 1970 года..."
        self.assertEqual(Tokenizer.tokenize(text), [
            Token('Очевидно', Token.TokenType.WORD, 0, 8),
            Token(',', Token.TokenType.PUNCTUATION, 8, 9),
            Token(' ', Token.TokenType.SPACE, 9, 10),
            Token('1', Token.TokenType.NUMBER, 10, 11),
            Token(' ', Token.TokenType.SPACE, 11, 12),
            Token('января', Token.TokenType.WORD, 12, 18),
            Token(' ', Token.TokenType.SPACE, 18, 19),
            Token('1970', Token.TokenType.NUMBER, 19, 23),
            Token(' ', Token.TokenType.SPACE, 23, 24),
            Token('года', Token.TokenType.WORD, 24, 28),
            Token('...', Token.TokenType.PUNCTUATION, 28, 31)])

        # Replaced numbers keep their original character offsets.
        self.assertEqual(Tokenizer.tokenize(text, replace_numbers=True), [
            Token('Очевидно', Token.TokenType.WORD, 0, 8),
            Token(',', Token.TokenType.PUNCTUATION, 8, 9),
            Token(' ', Token.TokenType.SPACE, 9, 10),
            Token('ЧИСЛО', Token.TokenType.WORD, 10, 11),
            Token(' ', Token.TokenType.SPACE, 11, 12),
            Token('января', Token.TokenType.WORD, 12, 18),
            Token(' ', Token.TokenType.SPACE, 18, 19),
            Token('ЧИСЛО', Token.TokenType.WORD, 19, 23),
            Token(' ', Token.TokenType.SPACE, 23, 24),
            Token('года', Token.TokenType.WORD, 24, 28),
            Token('...', Token.TokenType.PUNCTUATION, 28, 31)])
55 |
56 |
class TestSentenceTokenizer(unittest.TestCase):
    """Tests of sentence-level tokenization (abbreviations must not split sentences)."""

    def test_tokenizer(self):
        text = "Конкурс учреждён в 2005 году!!! Официальный партнёр конкурса – Президентский центр Б.Н. Ельцина."
        expected = [
            'Конкурс учреждён в 2005 году!!!',
            'Официальный партнёр конкурса – Президентский центр Б.Н. Ельцина.',
        ]
        self.assertEqual(SentenceTokenizer.tokenize(text), expected)
63 |
--------------------------------------------------------------------------------
/rupo/test_api.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты для API библиотеки.
4 |
5 | import unittest
6 | import os
7 | import random
8 |
9 | from rupo.settings import MARKUP_XML_EXAMPLE, EXAMPLES_DIR, GENERATOR_MODEL_DIR, \
10 | GENERATOR_WORD_FORM_VOCAB_PATH, GENERATOR_VOCAB_PATH, RU_STRESS_DEFAULT_MODEL,\
11 | ZALYZNYAK_DICT
12 | from rupo.main.markup import Markup
13 | from rupo.api import Engine
14 |
15 |
class TestApi(unittest.TestCase):
    """End-to-end tests of the Engine facade: stress, syllables, rhyme, metre, generation."""

    @classmethod
    def setUpClass(cls):
        # One shared engine for the whole class: loading the stress model is expensive.
        cls.engine = Engine(language="ru")
        cls.engine.load(
            stress_model_path=RU_STRESS_DEFAULT_MODEL,
            zalyzniak_dict=ZALYZNYAK_DICT
        )

    @classmethod
    def tearDownClass(cls):
        del cls.engine

    def test_stress(self):
        """Stress positions are 0-based letter indices; ambiguous words return several."""
        self.assertEqual(self.engine.get_stresses("корова"), [3])
        self.assertEqual(self.engine.get_stresses("триплекс"), [2])
        self.assertEqual(self.engine.get_stresses("горит"), [3])
        self.assertEqual(self.engine.get_stresses("восток"), [4])
        self.assertEqual(self.engine.get_stresses("зарёю"), [3])
        self.assertEqual(self.engine.get_stresses("новой"), [1])
        self.assertEqual(self.engine.get_stresses("равнине"), [4])
        self.assertEqual(self.engine.get_stresses("холмам"), [4])
        self.assertEqual(self.engine.get_stresses("грохочут"), [4])
        # Order-insensitive: "пушки" legitimately allows two stresses.
        self.assertCountEqual(self.engine.get_stresses("пушки"), [4, 1])
        self.assertEqual(self.engine.get_stresses("багровый"), [4])
        self.assertEqual(self.engine.get_stresses("кругами"), [4])
        self.assertEqual(self.engine.get_stresses("уж"), [0])
        self.assertEqual(self.engine.get_stresses('колесом'), [5])

    def test_get_word_syllables(self):
        """Syllable split of a simple CV-CV-CV word."""
        self.assertEqual(self.engine.get_word_syllables("корова"), ["ко", "ро", "ва"])

    def test_count_syllables(self):
        self.assertEqual(self.engine.count_syllables("корова"), 3)

    def test_is_rhyme(self):
        self.assertTrue(self.engine.is_rhyme("корова", "здорова"))

    def test_get_markup(self):
        self.assertIsInstance(self.engine.get_markup("корова"), Markup)

    def test_get_improved_markup(self):
        self.assertIsInstance(self.engine.get_improved_markup("корова")[0], Markup)

    def test_classify_metre(self):
        """A classic iambic quatrain must be classified as iambos."""
        text = "Горит восток зарёю новой.\n" \
               "Уж на равнине, по холмам\n" \
               "Грохочут пушки. Дым багровый\n" \
               "Кругами всходит к небесам."
        self.assertEqual(self.engine.classify_metre(text), "iambos")

    def test_generate_poem(self):
        # Seeded so the sampling-based generator is deterministic.
        random.seed(42)
        model_path = GENERATOR_MODEL_DIR
        vocab_path = GENERATOR_WORD_FORM_VOCAB_PATH
        stress_path = GENERATOR_VOCAB_PATH
        poem = self.engine.generate_poem(
            model_path,
            vocab_path,
            stress_path,
            sampling_k=10000,
            n_syllables=8,
            rhyme_pattern="abab",
            metre_schema="-+")
        self.assertIsNotNone(poem)

    def test_get_word_rhymes(self):
        vocab_dump_file = os.path.join(EXAMPLES_DIR, "vocab_rhymes.pickle")
        self.assertEqual(self.engine.get_word_rhymes("глядел", vocab_dump_file, MARKUP_XML_EXAMPLE), ["сидел", "летел"])
85 |
86 |
--------------------------------------------------------------------------------
/rupo/stress/predictor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Класс для определения ударения.
4 |
5 | from typing import List
6 |
7 | from rupo.stress.dict import StressDict
8 | from rupo.util.preprocess import count_vowels, get_first_vowel_position
9 | from rupo.settings import CMU_DICT, ZALYZNYAK_DICT, RU_STRESS_DEFAULT_MODEL
10 | from rupo.stress.word import Stress
11 |
12 | from russ.stress.model import StressModel
13 |
14 |
class StressPredictor:
    """Interface of a stress predictor: maps a word to stressed-letter positions."""
    def predict(self, word: str) -> List[int]:
        """
        Return the 0-based positions of stressed letters in the word.

        :param word: word to analyse.
        :return: list of stress positions.
        """
        raise NotImplementedError()
18 |
19 |
class DictStressPredictor(StressPredictor):
    """Stress predictor backed by a pre-built stress dictionary."""
    def __init__(self, language="ru", raw_dict_path=None, trie_path=None,
                 zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT):
        self.stress_dict = StressDict(language, raw_dict_path=raw_dict_path, trie_path=trie_path,
                                      zalyzniak_dict=zalyzniak_dict, cmu_dict=cmu_dict)

    def predict(self, word: str) -> List[int]:
        """
        Dictionary-based stress detection. Several stress variants are possible.

        :param word: word to put stresses in.
        :return stresses: positions of the letters the stress falls on.
        """
        stresses = []
        if count_vowels(word) == 0:
            # No vowels — no stresses either.
            pass
        elif count_vowels(word) == 1:
            # A single vowel always carries the stress.
            stresses.append(get_first_vowel_position(word))
        elif word.find("ё") != -1:
            # If the word contains "ё", only it can be stressed.
            stresses.append(word.find("ё"))
        else:
            # Look up stressed forms of the word in the dictionary.
            stresses = self.stress_dict.get_stresses(word, Stress.Type.PRIMARY) +\
                       self.stress_dict.get_stresses(word, Stress.Type.SECONDARY)
            if 'е' not in word:
                return stresses
            # Generate every possible 'е' -> 'ё' substitution (2^k variants
            # for k letters 'е'), since dictionaries often store "ё" forms.
            positions = [i for i in range(len(word)) if word[i] == 'е']
            beam = [word[:positions[0]]]
            for i in range(len(positions)):
                new_beam = []
                for prefix in beam:
                    n = positions[i+1] if i+1 < len(positions) else len(word)
                    new_beam.append(prefix + 'ё' + word[positions[i]+1:n])
                    new_beam.append(prefix + 'е' + word[positions[i]+1:n])
                beam = new_beam
            # ...and check each variant against the dictionary: a matching
            # "ё" form pins the stress onto that "ё" position.
            for permutation in beam:
                if len(self.stress_dict.get_stresses(permutation)) != 0:
                    yo_pos = permutation.find("ё")
                    if yo_pos != -1:
                        stresses.append(yo_pos)
        return stresses
66 |
67 |
class CombinedStressPredictor(StressPredictor):
    """Dictionary lookup first, RNN stress model as the fallback."""

    def __init__(self, language="ru", stress_model_path: str=RU_STRESS_DEFAULT_MODEL, raw_stress_dict_path=None,
                 stress_trie_path=None, zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT):
        self.rnn = StressModel.load(stress_model_path)
        self.dict = DictStressPredictor(language, raw_stress_dict_path, stress_trie_path, zalyzniak_dict, cmu_dict)

    def predict(self, word: str) -> List[int]:
        """Return dictionary stresses when available, otherwise the RNN's prediction."""
        dictionary_stresses = self.dict.predict(word)
        if dictionary_stresses:
            return dictionary_stresses
        return self.rnn.predict(word)
80 |
--------------------------------------------------------------------------------
/rupo/main/vocabulary.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Индексы слов для языковой модели.
4 |
5 | from typing import Dict
6 | import pickle
7 |
8 | from allennlp.data.vocabulary import Vocabulary
9 |
10 | from rupo.main.markup import Markup
11 | from rupo.files.reader import Reader, FileType
12 | from rupo.stress.word import StressedWord, Stress
13 | from rupo.stress.predictor import StressPredictor
14 |
15 |
class StressVocabulary(object):
    """
    Indexed vocabulary: a bidirectional mapping between stressed words and
    integer indexes.
    """
    def __init__(self) -> None:
        self.word_to_index = {}  # type: Dict[StressedWord, int]
        self.index_to_word = {}  # type: Dict[int, StressedWord]

    def save(self, dump_filename: str) -> None:
        """
        Pickle the whole vocabulary to a file.
        """
        with open(dump_filename, "wb") as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    def load(self, dump_filename: str):
        """
        Load a vocabulary previously written by save().
        """
        with open(dump_filename, "rb") as f:
            vocab = pickle.load(f)
            self.__dict__.update(vocab.__dict__)

    def parse(self, markup_path: str, from_voc: bool=False):
        """
        Fill the vocabulary from either a ready vocabulary file or XML markups.

        :param markup_path: path to the vocabulary / markup file.
        :param from_voc: True when the file is a ready-made vocabulary.
        """
        if from_voc:
            word_indexes = Reader.read_vocabulary(markup_path)
            for word, index in word_indexes:
                self.add_word(word.to_stressed_word(), index)
        else:
            markups = Reader.read_markups(markup_path, FileType.XML, is_processed=True)
            for markup in markups:
                self.add_markup(markup)

    def add_markup(self, markup: Markup) -> None:
        """
        Add all words of a markup to the vocabulary.

        :param markup: the markup.
        """
        for line in markup.lines:
            for word in line.words:
                self.add_word(word.to_stressed_word())

    def add_word(self, word: StressedWord, index: int=-1) -> bool:
        """
        Add a word.

        :param word: the word.
        :param index: its index, when predetermined.
        :return: whether the word was new.
        """
        if word in self.word_to_index:
            # NOTE(review): for an already-known word only index_to_word is
            # updated; word_to_index keeps the word's first index — confirm
            # this asymmetry is intended.
            if index != -1:
                self.index_to_word[index] = word
            return False
        # size() is len(index_to_word), which the first assignment below does
        # not change, so both computations yield the same fresh index.
        self.word_to_index[word] = self.size() if index == -1 else index
        self.index_to_word[self.size() if index == -1 else index] = word
        return True

    def get_word_index(self, word: StressedWord) -> int:
        """
        Get the index of a word.

        :param word: the word (StressedWord).
        :return: its index.
        :raises IndexError: when the word is unknown.
        """
        if word in self.word_to_index:
            return self.word_to_index[word]
        raise IndexError("Can't find word: " + word.text)

    def get_word(self, index: int) -> StressedWord:
        """
        Get the word stored at an index.

        :param index: the index.
        :return: the word.
        """
        return self.index_to_word[index]

    def size(self):
        """
        :return: the number of indexed words.
        """
        return len(self.index_to_word)
100 |
101 |
def inflate_stress_vocabulary(vocabulary: Vocabulary, stress_predictor: StressPredictor):
    """
    Build a StressVocabulary from an AllenNLP token vocabulary by predicting
    the stresses of every token, preserving each token's index.
    """
    stress_vocabulary = StressVocabulary()
    index_to_token = vocabulary.get_index_to_token_vocabulary("tokens")
    for index, token in index_to_token.items():
        predicted = {Stress(position, Stress.Type.PRIMARY)
                     for position in stress_predictor.predict(token)}
        stress_vocabulary.add_word(StressedWord(token, predicted), index)
    return stress_vocabulary
109 |
--------------------------------------------------------------------------------
/rupo/files/writer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Запись в файлы разных расширений.
4 |
5 | import os
6 | from typing import List
7 |
8 | from rupo.files.reader import RAW_SEPARATOR
9 | from rupo.main.markup import Markup
10 | from rupo.files.reader import FileType
11 |
12 |
class Writer(object):
    """
    Writing markups to files of different formats (XML / JSON / RAW).
    """
    def __init__(self, destination_type: FileType, path: str) -> None:
        """
        Streaming writer: use it when markups are written one at a time
        (saves memory compared to the static write_markups).

        :param destination_type: type of the destination file.
        :param path: path to the destination file.
        """
        self.type = destination_type
        self.path = path
        self.file = None
        # Start from an empty file; it is fine if it does not exist yet.
        try:
            os.remove(path)
        except OSError:
            pass

    def open(self) -> None:
        """
        Open the file; must be called before any write_markup call.
        """
        self.file = open(self.path, "w", encoding="utf-8")
        if self.type == FileType.XML:
            self.file.write('')

    def write_markup(self, markup: Markup) -> None:
        """
        Write a single markup into the already opened file.

        :param markup: the markup to write.
        """
        assert self.file is not None
        if self.type == FileType.XML:
            # NOTE(review): replace(b'', b'') is a no-op; the original
            # search pattern appears to have been lost — confirm against VCS
            # history before cleaning this up.
            xml = markup.to_xml().encode('utf-8')\
                .replace(b'', b'').decode('utf-8')
            self.file.write(xml)
        elif self.type == FileType.RAW:
            Writer.__write_markup_raw(markup, self.file)

    def close(self) -> None:
        """
        Close the file.
        """
        if self.type == FileType.XML:
            self.file.write('')
        self.file.close()

    @staticmethod
    def write_markups(destination_type: FileType, markups: List[Markup], path: str) -> None:
        """
        Write a collection of markups to a file.

        :param destination_type: type of the destination file.
        :param markups: markups to write.
        :param path: path to the destination file.
        """
        with open(path, "w", encoding="utf-8") as file:
            if destination_type == FileType.XML:
                file.write('')
                for markup in markups:
                    xml = markup.to_xml().encode('utf-8')\
                        .replace(b'', b'').decode('utf-8')
                    file.write(xml)
                    file.write("\n")
                file.write('')
            elif destination_type == FileType.JSON:
                # Emit separators between items instead of writing a trailing
                # comma and truncating it afterwards: the old seek/truncate
                # trick produced invalid output ("]") for an empty sequence
                # and relied on byte offsets in a text-mode file, where tell()
                # returns an opaque cookie rather than a byte count.
                file.write("[")
                for i, markup in enumerate(markups):
                    if i > 0:
                        file.write(",")
                    file.write(markup.to_json())
                file.write("]")
            elif destination_type == FileType.RAW:
                for markup in markups:
                    Writer.__write_markup_raw(markup, file)

    @staticmethod
    def __write_markup_raw(markup: Markup, file) -> None:
        """
        Write a markup in plain-text form (word + stress).

        :param markup: the markup to write.
        :param file: an already opened file object.
        """
        lines = []
        for line in markup.lines:
            lines.append(" ".join([word.get_short() for word in line.words]))
        file.write("\n".join(lines))
        file.write(RAW_SEPARATOR)
105 |
--------------------------------------------------------------------------------
/rupo/generate/generator.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Модуль создания стихотворений.
4 |
5 | from typing import Optional
6 |
7 | from allennlp.data.vocabulary import Vocabulary
8 | from rulm.language_model import LanguageModel
9 |
10 | from rupo.main.vocabulary import StressVocabulary
11 | from rupo.generate.transforms import PoemTransform
12 |
13 |
class Generator(object):
    """
    Poem generator: samples text from a language model while a PoemTransform
    constrains the output to a metre pattern and a rhyme scheme.
    """
    def __init__(self,
                 model: LanguageModel,
                 token_vocabulary: Vocabulary,
                 stress_vocabulary: StressVocabulary,
                 eos_index: int):
        # Language model used for generation; its output is reversed word
        # order (see generate_poem, which un-reverses the result).
        self.model = model  # type: LanguageModel
        self.token_vocabulary = token_vocabulary  # type: Vocabulary
        # Maps the same token indices to stressed words.
        self.stress_vocabulary = stress_vocabulary  # type: StressVocabulary
        # Index of the end-of-sequence token in the model's vocabulary.
        self.eos_index = eos_index

    def generate_poem(self,
                      metre_schema: str="+-",
                      rhyme_pattern: str="aabb",
                      n_syllables: int=8,
                      letters_to_rhymes: dict=None,
                      beam_width: int=None,
                      sampling_k: int=None,
                      rhyme_score_border: int=4,
                      temperature: float=1.0,
                      seed: int=1337,
                      last_text: str="") -> Optional[str]:
        """
        Generate one poem obeying the given metre and rhyme scheme.

        :param metre_schema: feet pattern, e.g. "+-" (trochee) or "-+" (iambos).
        :param rhyme_pattern: rhyme scheme, e.g. "aabb".
        :param n_syllables: number of syllables per line.
        :param letters_to_rhymes: optional mapping from rhyme-pattern letters
            to words that must occupy those rhyme positions.
        :param beam_width: beam search width (beam-search mode).
        :param sampling_k: top-k for sampling (sampling mode); one of
            beam_width / sampling_k must be set.
        :param rhyme_score_border: rhyme strictness threshold.
        :param temperature: softmax temperature for decoding.
        :param seed: random seed for the model.
        :param last_text: optional custom last line of the poem.
        :return: the generated poem text.
        """
        assert beam_width or sampling_k, "Set sampling_k or beam_width"
        self.model.set_seed(seed)

        poem_transform = PoemTransform(
            stress_vocabulary=self.stress_vocabulary,
            metre_pattern=metre_schema,
            rhyme_pattern=rhyme_pattern,
            n_syllables=n_syllables,
            eos_index=self.eos_index,
            letters_to_rhymes=letters_to_rhymes,
            score_border=rhyme_score_border
        )

        if last_text:
            # Generation runs right-to-left, so the custom last line is fed
            # to the model reversed, and the transform's counters must be
            # advanced past the syllables/rhyme slot it already occupies.
            words = last_text.lower().split(" ")
            last_text = " ".join(words[::-1])
            filled_syllables = 0
            for word in last_text.split():
                index = self.token_vocabulary.get_token_index(word)
                word = self.stress_vocabulary.get_word(index)
                syllables_count = len(word.syllables)
                filled_syllables += syllables_count
            poem_transform.stress_position -= filled_syllables
            poem_transform.rhyme_position -= 1
            # The final word of last_text fixes the rhyme for its letter.
            last_index = self.token_vocabulary.get_token_index(words[-1])
            last_word = self.stress_vocabulary.get_word(last_index)
            poem_transform.letters_to_rhymes[rhyme_pattern[-1]].add(last_word)

        self.model.transforms.append(poem_transform)

        try:
            if beam_width:
                poem = self.model.beam_decoding(last_text, beam_width=beam_width, temperature=temperature)
            elif sampling_k:
                poem = self.model.sample_decoding(last_text, k=sampling_k, temperature=temperature)
            else:
                assert False
        except Exception as e:
            # Always restore the model's transform stack, even on failure.
            self.model.transforms.pop()
            raise e

        self.model.transforms.pop()

        # The model produced words in reverse; restore the order and insert a
        # line break after every n_syllables syllables.
        words = poem.split(" ")
        words = words[::-1]
        result_words = []
        current_n_syllables = 0
        for word in words:
            result_words.append(word)
            index = self.token_vocabulary.get_token_index(word)
            word = self.stress_vocabulary.get_word(index)
            syllables_count = len(word.syllables)
            current_n_syllables += syllables_count
            if n_syllables == current_n_syllables:
                current_n_syllables = 0
                result_words.append("\n")
        poem = " ".join(result_words)
        poem = "\n".join([line.strip() for line in poem.split("\n")])
        return poem
98 |
99 |
--------------------------------------------------------------------------------
/rupo/dict/cmu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Конвертер CMU словаря.
4 |
5 |
class CMUDict:
    """
    Converters for the CMU pronouncing dictionary (ARPAbet to IPA).
    """

    aprabet2ipa = {
        "AO": "ɔ",
        "AA": "ɑ",
        "IY": "i",
        "UW": "u",
        "EH": "ɛ",
        "IH": "ɪ",
        "UH": "ʊ",
        "AH": "ʌ",
        "AX": "ə",
        "AE": "æ",
        "EY": "eɪ",
        "AY": "aɪ",
        "OW": "oʊ",
        "AW": "aʊ",
        "OY": "ɔɪ",
        "ER": "ɝ",
        "AXR": "ɚ",
        "P": "p",
        "B": "b",
        "T": "t",
        "D": "d",
        "K": "k",
        "G": "ɡ",
        "CH": "ʦ",
        "JH": "ʤ",
        "F": "f",
        "V": "v",
        "TH": "θ",
        "DH": "ð",
        "S": "s",
        "Z": "z",
        "SH": "ʃ",
        "ZH": "ʒ",
        "HH": "h",
        "M": "m",
        "EM": "m",
        "N": "n",
        "EN": "n",
        "NG": "ŋ",
        "ENG": "ŋ",
        "L": "ɫ",
        "EL": "ɫ",
        "R": "r",
        "DX": "ɾ",
        "NX": "ɾ",
        "Y": "j",
        "W": "w",
        "Q": "ʔ"
    }

    # ARPAbet vowels that transcribe to two IPA characters.
    diphtongs = ["EY", "AY", "OW", "AW", "OY"]

    @staticmethod
    def convert_to_g2p_only(source_file, destination_file):
        """
        Dump grapheme<TAB>IPA pairs, dropping stress digits and variant
        entries (those with parentheses in the grapheme).
        """
        pairs = []
        with open(source_file, 'r', encoding="utf-8", errors="ignore") as reader:
            for line in reader:
                grapheme = line.split(" ")[0].lower()
                # Skip comment/symbol lines and alternate pronunciations.
                if not ("a" <= grapheme[0] <= "z") or "(" in grapheme:
                    continue
                # NOTE(review): split(" ")[1] keeps only the single token right
                # after the word — presumably the input file's phoneme field
                # matches this; confirm against the actual dictionary format.
                phonemes = line.split(" ")[1].strip().split(" ")
                for i, phoneme in enumerate(phonemes):
                    if not ("A" <= phoneme[-1] <= "Z"):
                        # Trailing character is a stress digit: drop it.
                        phonemes[i] = phoneme[:-1]
                ipa = "".join(CMUDict.aprabet2ipa[phoneme] for phoneme in phonemes)
                pairs.append((grapheme, ipa))
        with open(destination_file, 'w', encoding="utf-8") as writer:
            writer.writelines(g + "\t" + p + "\n" for g, p in pairs)

    @staticmethod
    def convert_to_phoneme_stress(source_file, destination_file):
        """
        Dump IPA transcription plus primary/secondary stress indices,
        counting each diphthong as two sound positions.
        """
        records = []
        with open(source_file, 'r', encoding="utf-8", errors="ignore") as reader:
            for line in reader:
                grapheme = line.split(" ")[0].lower()
                if not ("a" <= grapheme[0] <= "z"):
                    continue
                transcription = line.split(" ")[1].strip()
                # Keep the first two alternate pronunciations, drop the rest.
                grapheme = grapheme.replace("(1)", "").replace("(2)", "")
                if "(" in grapheme:
                    continue

                phonemes = transcription.split(" ")
                primary = []
                secondary = []
                shift = 0  # extra offset: every diphthong occupies two sounds
                for i, phoneme in enumerate(phonemes):
                    marker = phoneme[-1]
                    if not ("A" <= marker <= "Z"):
                        # Stress digit found: record position, then strip it.
                        if int(marker) == 1:
                            primary.append(str(i + shift))
                        if int(marker) == 2:
                            secondary.append(str(i + shift))
                        phonemes[i] = phoneme[:-1]
                        if phonemes[i] in CMUDict.diphtongs:
                            shift += 1
                ipa = "".join(CMUDict.aprabet2ipa[phoneme] for phoneme in phonemes)
                records.append((ipa, primary, secondary))
        with open(destination_file, 'w', encoding="utf-8") as writer:
            for ipa, primary, secondary in records:
                writer.write(ipa + "\t" + ",".join(primary) + "\t" + ",".join(secondary) + "\n")
--------------------------------------------------------------------------------
/rupo/rhymes/rhymes.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Класс рифм.
4 |
5 | from rupo.stress.word import StressedWord
6 | from rupo.util.preprocess import VOWELS
7 |
8 |
class RhymeProfile:
    """
    Set of features describing a word's ending, used to compare two words
    for rhyming.
    """
    def __init__(self, syllable_count: int, stressed_syllable_number: int,
                 stressed_syllable_text: str, next_syllable_text: str, next_char: str):
        # Total number of syllables in the word.
        self.syllable_count = syllable_count
        # Position of the stressed syllable, negative, counted from the end.
        self.stressed_syllable_number = stressed_syllable_number
        # Text of the stressed syllable.
        self.stressed_syllable_text = stressed_syllable_text
        # Text of the syllable right after the stressed one ("" if none).
        self.next_syllable_text = next_syllable_text
        # Character right after the stressed vowel ("" if none).
        self.next_char = next_char

    def __str__(self):
        return "Syllable count: {}; Stressed syllable: {}; " \
               "Stressed syllable text: {}; Next syllable: {}; " \
               "Next char: {}".format(self.syllable_count, self.stressed_syllable_number,
                                      self.stressed_syllable_text, self.next_syllable_text, self.next_char)

    def __repr__(self):
        return str(self)
26 |
27 |
class Rhymes(object):
    """
    Rhyme detection.
    """

    @staticmethod
    def is_rhyme(word1: StressedWord,
                 word2: StressedWord,
                 score_border: int=4,
                 syllable_number_border: int=4) -> bool:
        """
        Check whether two (already stressed) words rhyme.

        :param word1: first word to check, already stressed (Word).
        :param word2: second word to check, already stressed (Word).
        :param score_border: rhyme detection threshold; higher means stricter.
        :param syllable_number_border: limit on the stressed syllable's
            position counted from the end of the word.
        :return result: True if the words rhyme.
        """
        profile1 = Rhymes.__get_rhyme_profile(word1)
        profile2 = Rhymes.__get_rhyme_profile(word2)
        score = 0
        # Score character overlap of the stressed syllables; vowels weigh more.
        for i, ch1 in enumerate(profile1.stressed_syllable_text):
            for j, ch2 in enumerate(profile2.stressed_syllable_text[i:]):
                if ch1 != ch2:
                    continue
                if ch1 in VOWELS:
                    score += 3
                else:
                    score += 1
        # Bonus for a matching post-stress syllable, or at least a matching
        # character right after the stressed vowel.
        if profile1.next_syllable_text == profile2.next_syllable_text and profile1.next_syllable_text != '':
            score += 3
        elif profile1.next_char == profile2.next_char and profile1.next_char != '':
            score += 1
        return (profile1.stressed_syllable_number == profile2.stressed_syllable_number and
                profile1.syllable_count == profile2.syllable_count and
                profile1.stressed_syllable_number <= syllable_number_border and
                score >= score_border)

    @staticmethod
    def __get_rhyme_profile(word: StressedWord) -> 'RhymeProfile':
        """
        Build a rhyme profile (feature set used for comparison).

        :param word: an already stressed word (Word).
        :return profile: the rhyme profile.
        """
        # TODO: switch to a phonetic word, more features.

        profile = RhymeProfile(syllable_count=0,
                               stressed_syllable_number=-1,
                               stressed_syllable_text="",
                               next_syllable_text="",
                               next_char="")
        syllables = list(word.syllables)
        profile.syllable_count = len(syllables)
        # Walk from the end of the word to the last stressed syllable.
        for i, syllable in enumerate(reversed(syllables)):
            if syllable.stress == -1:
                continue
            profile.stressed_syllable_text = syllable.text
            profile.stressed_syllable_number = -i-1
            if i != 0:
                # Bug fix: this used to assign to a non-existent attribute
                # "next_syllable", leaving next_syllable_text always empty and
                # disabling the +3 bonus in is_rhyme.
                profile.next_syllable_text = syllables[-i].text
            if syllable.stress + 1 < len(word.text):
                profile.next_char = word.text[syllable.stress + 1]
            break
        return profile
95 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python library for analysis and generation of poems in Russian #
2 |
3 | [](https://pypi.python.org/pypi/rupo)
4 | [](https://pypi.python.org/pypi/rupo)
5 | [](https://travis-ci.org/IlyaGusev/rupo)
6 | [](https://codeclimate.com/github/IlyaGusev/rupo)
7 | [](http://rupo.readthedocs.io/en/latest/?badge=latest)
8 |
9 | ### Install ###
10 | Warning: Python 3.9+ is not supported! Use Python 3.8.
11 |
12 | ```
13 | git clone https://github.com/IlyaGusev/rupo
14 | cd rupo
15 | pip install -r requirements.txt
16 | sh download.sh
17 | ```
18 |
19 | ### Example ###
20 | https://colab.research.google.com/drive/1WBl9erJvC9Oc9PjCD8JyC_40TDUqahCx
21 |
22 | ### Usage manual ###
23 | #### Analysis ####
24 | ```
25 | >>> from rupo.api import Engine
26 | >>> engine = Engine(language="ru")
27 | >>> engine.load(<stress model path>, <zaliznyak dict path>)
28 | >>> engine.get_stresses("корова")
29 | [3]
30 |
31 | >>> engine.get_word_syllables("корова")
32 | ["ко", "ро", "ва"]
33 |
34 | >>> engine.is_rhyme("корова", "здорова")
35 | True
36 |
37 | >>> text = "Горит восток зарёю новой.\nУж на равнине, по холмам\nГрохочут пушки. Дым багровый\nКругами всходит к небесам."
38 | >>> engine.classify_metre(text)
39 | iambos
40 | ```
41 |
42 | #### Generation ####
43 | Script for poem generation. It can work in two different modes: sampling or beam search.
44 |
45 | ```
46 | python generate_poem.py
47 | ```
48 |
49 | | Argument | Default | Description |
50 | |:--------------------|:--------|:-------------------------------------------|
51 | | --metre-schema | +- | feet type: -+ (iambos), +- (trochee), ... |
52 | | --rhyme-pattern | abab | rhyme pattern |
53 | | --n-syllables | 8 | number of syllables in line |
54 | | --sampling-k | 50000 | top-k words to sample from (sampling mode) |
55 | | --beam-width | None | width of beam search (beam search mode) |
56 | | --temperature | 1.0 | sampling softmax temperature |
57 | | --last-text | None | custom last line |
58 | | --count | 100 | count of poems to generate |
59 | | --model-path | None | optional path to generator model directory |
60 | | --token-vocab-path | None | optional path to vocabulary |
61 | | --stress-vocab-path | None | optional path to stress vocabulary |
62 |
63 | ### Models ###
64 | * Generator: https://www.dropbox.com/s/dwkui2xqivzsyw5/generator_model.zip
65 | * Stress predictor: https://www.dropbox.com/s/i9tarc8pum4e40p/stress_models_14_05_17.zip
66 | * G2P: https://www.dropbox.com/s/7rk135fzd3i8kfw/g2p_models.zip
67 | * Dictionaries: https://www.dropbox.com/s/znqlrb1xblh3amo/dict.zip
68 |
69 | ### Литература ###
70 | * Брейдо, 1996, [Автоматический анализ метрики русского стиха](http://search.rsl.ru/ru/record/01000000124)
71 | * Каганов, 1996, [Лингвистическое конструирование в системах искусственного интеллекта](http://lleo.me/soft/text_dip.htm)
72 | * Козьмин, 2006, [Автоматический анализ стиха в системе Starling](http://www.dialog-21.ru/digests/dialog2006/materials/html/Kozmin.htm)
73 | * Гришина, 2008, [Поэтический корпус в рамках НКРЯ: общая структура и перспективы использования](http://ruscorpora.ru/sbornik2008/05.pdf)
74 | * Пильщиков, Старостин, 2012, [Автоматическое распознавание метра: проблемы и решения](http://www.academia.edu/11465228/%D0%90%D0%B2%D1%82%D0%BE%D0%BC%D0%B0%D1%82%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%BE%D0%B5_%D1%80%D0%B0%D1%81%D0%BF%D0%BE%D0%B7%D0%BD%D0%B0%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_%D0%BC%D0%B5%D1%82%D1%80%D0%B0_%D0%BF%D1%80%D0%BE%D0%B1%D0%BB%D0%B5%D0%BC%D1%8B_%D0%B8_%D1%80%D0%B5%D1%88%D0%B5%D0%BD%D0%B8%D1%8F)
75 | * Барахнин, 2015, [Алгоритмы комплексного анализа русских поэтических текстов с целью автоматизации процесса создания метрических справочников и конкордансов](http://ceur-ws.org/Vol-1536/paper21.pdf), [сама система](http://poem.ict.nsc.ru/)
76 |
--------------------------------------------------------------------------------
/rupo/generate/transforms.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | import numpy as np
3 | from collections import defaultdict
4 |
5 | from rulm.transform import Transform
6 |
7 | from rupo.rhymes.rhymes import Rhymes
8 | from rupo.main.vocabulary import StressVocabulary
9 | from rupo.stress.word import StressedWord
10 |
11 |
class PoemTransform(Transform):
    """
    Transform that masks language-model probabilities so that generated
    words fit a metre pattern and a rhyme scheme.
    """
    def __init__(self,
                 stress_vocabulary: StressVocabulary,
                 metre_pattern: str,
                 rhyme_pattern: str,
                 n_syllables: int,
                 eos_index: int,
                 letters_to_rhymes: dict=None,
                 score_border=4):
        """
        :param stress_vocabulary: index -> StressedWord mapping aligned with
            the model's token indices.
        :param metre_pattern: base feet pattern, e.g. "+-".
        :param rhyme_pattern: rhyme scheme, e.g. "abab".
        :param n_syllables: number of syllables per line.
        :param eos_index: index of the end-of-sequence token.
        :param letters_to_rhymes: optional pre-seeded rhyme words per letter.
        :param score_border: rhyme strictness threshold (see Rhymes.is_rhyme).
        """
        self.stress_vocabulary = stress_vocabulary

        self.n_syllables = n_syllables

        # Repeat the base pattern enough times to cover one whole line.
        mul = n_syllables // len(metre_pattern)
        if n_syllables % len(metre_pattern) != 0:
            mul += 1

        self.metre_pattern = metre_pattern * mul
        # Positions count DOWN: generation proceeds from the end of the line
        # (and from the last line of the poem) backwards.
        self.stress_position = len(self.metre_pattern) - 1
        self.eos_index = eos_index

        self.rhyme_pattern = rhyme_pattern
        self.rhyme_position = len(self.rhyme_pattern) - 1
        self.score_border = score_border

        # Words already fixed for each rhyme letter.
        self.letters_to_rhymes = defaultdict(set)
        if letters_to_rhymes is not None:
            for letter, words in letters_to_rhymes.items():
                for word in words:
                    self.letters_to_rhymes[letter].add(word)

    def __call__(self, probabilities: np.array) -> np.array:
        # All lines consumed (rhyme pattern exhausted and the syllable cursor
        # is back at a line boundary): force the end-of-sequence token.
        if self.rhyme_position < 0 and self.stress_position == len(self.metre_pattern) - 1:
            probabilities = np.zeros(probabilities.shape, dtype="float")
            probabilities[self.eos_index] = 1.
            return probabilities

        # Zero out every word that breaks the metre, and — at a line boundary —
        # every word that breaks the rhyme constraint.
        for index in range(probabilities.shape[0]):
            word = self.stress_vocabulary.get_word(index)
            is_good_by_stress = self._filter_word_by_stress(word)
            is_good_by_rhyme = True
            if self.stress_position == len(self.metre_pattern) - 1:
                is_good_by_rhyme = self._filter_word_by_rhyme(word)
            if not is_good_by_stress or not is_good_by_rhyme:
                probabilities[index] = 0.

        assert np.sum(probabilities > 0) != 0, "Poem transform filtered out all words"
        return probabilities

    def advance(self, index: int):
        # Consume the chosen word: update the syllable cursor and, at a line
        # boundary, remember the word as the rhyme for the current letter.
        word = self.stress_vocabulary.get_word(index)
        syllables_count = len(word.syllables)

        if self.stress_position == len(self.metre_pattern) - 1:
            letter = self.rhyme_pattern[self.rhyme_position]
            self.letters_to_rhymes[letter].add(word)
            self.rhyme_position -= 1

        self.stress_position -= syllables_count

        # Line finished: reset the cursor to the end of the pattern.
        if self.stress_position < 0:
            self.stress_position = len(self.metre_pattern) - 1

    def _filter_word_by_stress(self, word: StressedWord) -> bool:
        """
        Check that the word's syllables can occupy the current pattern slots.

        :param word: candidate word.
        :return: True if the word fits the metre at the current position.
        """
        syllables = word.syllables
        syllables_count = len(syllables)
        if syllables_count == 0:
            return False
        # The word would overflow past the beginning of the line.
        if self.stress_position - syllables_count < -1:
            return False
        for i in range(syllables_count):
            syllable = syllables[i]
            syllable_number = self.stress_position - syllables_count + i + 1
            # An unstressed syllable on a strong ("+") slot is rejected only if
            # some other syllable of the same word is stressed on a weak slot.
            if syllables_count >= 2 and syllable.stress == -1 and self.metre_pattern[syllable_number] == "+":
                for j in range(syllables_count):
                    other_syllable = syllables[j]
                    other_syllable_number = other_syllable.number - syllable.number + syllable_number
                    if i != j and other_syllable.stress != -1 and self.metre_pattern[other_syllable_number] == "-":
                        return False
        return True

    def _filter_word_by_rhyme(self, word: StressedWord) -> bool:
        """
        Check the word against the rhyme fixed for the current line's letter.

        :param word: candidate word.
        :return: True if no rhyme is fixed yet, or the word rhymes with the
            fixed one (and is not the same word).
        """
        # Monosyllabic words are not allowed in a rhyme position.
        if len(word.syllables) <= 1:
            return False
        rhyming_words = self.letters_to_rhymes[self.rhyme_pattern[self.rhyme_position]]
        if len(rhyming_words) == 0:
            return True
        first_word = list(rhyming_words)[0]

        is_rhyme = Rhymes.is_rhyme(first_word, word,
                                   score_border=self.score_border,
                                   syllable_number_border=2) and first_word.text != word.text
        return is_rhyme

    def __copy__(self):
        # Duplicate positional state and (deep-copied) rhyme sets — presumably
        # used when the decoder forks hypotheses; confirm against rulm.
        obj = type(self)(self.stress_vocabulary, self.metre_pattern, self.rhyme_pattern, self.n_syllables,
                         self.eos_index, self.letters_to_rhymes, self.score_border)
        obj.stress_position = self.stress_position
        obj.rhyme_position = self.rhyme_position
        obj.letters_to_rhymes = deepcopy(self.letters_to_rhymes)
        return obj
116 |
--------------------------------------------------------------------------------
/rupo/stress/dict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Класс для удобной работы со словарём ударений.
4 |
5 | import pygtrie
6 | import os
7 | import pickle
8 | from typing import List, Dict, ItemsView, Set
9 |
10 | from rupo.dict.cmu import CMUDict
11 | from rupo.settings import RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH, \
12 | EN_PHONEME_STRESS_PATH, EN_PHONEME_STRESS_TRIE_PATH, ZALYZNYAK_DICT, CMU_DICT
13 |
14 | from rupo.stress.word import Stress
15 |
16 |
class StressDict:
    """
    Stress dictionary stored as a prefix trie so it can be serialized once
    and loaded back into memory quickly.
    """

    class Mode:
        # NOTE(review): GRAPHEMES and PHONEMES are both 0, so the two modes
        # compare equal; the branches in __init__ are effectively
        # distinguished by `language` alone. PHONEMES was presumably meant to
        # be 1 — confirm before changing, since callers may rely on the
        # current defaults (e.g. language="en" with the default mode).
        GRAPHEMES = 0
        PHONEMES = 0

    def __init__(self, language: str="ru", mode: Mode=Mode.GRAPHEMES, raw_dict_path=None, trie_path=None,
                 zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT) -> None:
        """
        :param language: dictionary language ("ru" or "en").
        :param mode: grapheme- or phoneme-based dictionary.
        :param raw_dict_path: path to the raw dictionary file (defaults depend on language/mode).
        :param trie_path: path to the serialized trie dump (defaults depend on language/mode).
        :param zalyzniak_dict: path to the Zaliznyak source dictionary.
        :param cmu_dict: path to the CMU source dictionary.
        :raises FileNotFoundError: if the raw dictionary file cannot be produced.
        """
        self.data = pygtrie.Trie()  # type: Dict[str, Set[Stress]]
        self.raw_dict_path = raw_dict_path
        self.trie_path = trie_path
        if language == "ru" and mode == self.Mode.GRAPHEMES:
            self.__init_defaults(RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH)
            if not os.path.exists(self.raw_dict_path):
                # Imported here rather than at module level — presumably to
                # avoid a circular import; confirm.
                from rupo.dict.zaliznyak import ZalyzniakDict
                ZalyzniakDict.convert_to_accent_only(zalyzniak_dict, self.raw_dict_path)
        elif mode == self.Mode.PHONEMES and language == "en":
            self.__init_defaults(EN_PHONEME_STRESS_PATH, EN_PHONEME_STRESS_TRIE_PATH)
            if not os.path.exists(self.raw_dict_path):
                CMUDict.convert_to_phoneme_stress(cmu_dict, self.raw_dict_path)
        else:
            assert False
        if not os.path.isfile(self.raw_dict_path):
            raise FileNotFoundError("Dictionary raw file not found.")
        # Prefer the pre-built trie dump; otherwise build it from the raw file.
        if os.path.isfile(self.trie_path):
            self.load(self.trie_path)
        else:
            self.create(self.raw_dict_path, self.trie_path)

    def __init_defaults(self, raw_dict_path, trie_path):
        # Fill in default paths only where the caller did not supply one.
        if self.raw_dict_path is None:
            self.raw_dict_path = raw_dict_path
        if self.trie_path is None:
            self.trie_path = trie_path

    def create(self, src_filename: str, dst_filename: str) -> None:
        """
        Build the trie from the raw dictionary file, then save a dump.

        :param src_filename: file with the original dictionary
            (word<TAB>primary positions<TAB>secondary positions).
        :param dst_filename: file the dump will be saved to.
        """
        with open(src_filename, 'r', encoding='utf-8') as f:
            for line in f:
                word, primary, secondary = line.split("\t")
                stresses = [Stress(int(a), Stress.Type.PRIMARY) for a in primary.strip().split(",")]
                if secondary.strip() != "":
                    stresses += [Stress(int(a), Stress.Type.SECONDARY) for a in secondary.strip().split(",")]
                self.update(word, stresses)
        self.save(dst_filename)

    def save(self, dst_filename: str) -> None:
        """
        Save the trie dump.

        :param dst_filename: file the dump is written to.
        """
        with open(dst_filename, "wb") as f:
            pickle.dump(self.data, f, pickle.HIGHEST_PROTOCOL)

    def load(self, dump_filename: str) -> None:
        """
        Load a previously saved trie dump.

        :param dump_filename: file to load the dump from.
        """
        with open(dump_filename, "rb") as f:
            self.data = pickle.load(f)

    def get_stresses(self, word: str, stress_type: Stress.Type=Stress.Type.ANY) -> List[int]:
        """
        Get stresses of the requested type for a word.

        :param word: word to look up in the dictionary.
        :param stress_type: type of stress (primary/secondary/any).
        :return forms: list of stress positions; empty if the word is unknown.
        """
        if word in self.data:
            if stress_type == Stress.Type.ANY:
                return [stress.position for stress in self.data[word]]
            else:
                return [stress.position for stress in self.data[word] if stress.type == stress_type]
        return []

    def get_all(self) -> ItemsView[str, Set[Stress]]:
        """
        :return items: all keys and stresses of the dictionary.
        """
        return self.data.items()

    def update(self, word: str, stresses: List[Stress]) -> None:
        """
        Add stresses for a word, merging with any already stored.

        :param word: the word.
        :param stresses: stresses to add.
        """
        if word not in self.data:
            self.data[word] = set(stresses)
        else:
            self.data[word].update(stresses)

    def update_primary_only(self, word: str, stresses: List[int]) -> None:
        # Convenience wrapper: positions are interpreted as primary stresses.
        self.update(word, [Stress(stress, Stress.Type.PRIMARY) for stress in stresses])
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# rupo documentation build configuration file, created by
# sphinx-quickstart on Sat Mar 18 02:33:48 2017.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
# Make the repository root (and thus the rupo package) importable when
# building docs from docs/source.
sys.path.insert(0, os.path.abspath("../.."))


# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.autodoc',
              'sphinx.ext.doctest',
              'sphinx.ext.viewcode',
              'sphinx.ext.githubpages']

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'rupo'
copyright = '2017, Ilya Gusev'
author = 'Ilya Gusev'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '0.2.4'
# The full version, including alpha/beta/rc tags.
release = '0.2.4'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = 'ru'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = []

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'default'

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']


# -- Options for HTMLHelp output ------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'rupodoc'


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'rupo.tex', 'rupo Documentation',
     'Ilya Gusev', 'manual'),
]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'rupo', 'rupo Documentation',
     [author], 1)
]


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'rupo', 'rupo Documentation',
     author, 'rupo', 'One line description of project.',
     'Miscellaneous'),
]
--------------------------------------------------------------------------------
/rupo/metre/test_pattern_analyzer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты к компилятору выражений.
4 |
5 | import unittest
6 |
7 | from rupo.metre.pattern_analyzer import PatternAnalyzer
8 |
9 |
class TestPatternAnalyzer(unittest.TestCase):
    def test_pattern_analyzer(self):
        # Each case is (pattern, metre string, expected count_errors result).
        # Cases are checked in order, failing fast like the original
        # sequence of assertEqual calls.
        cases = [
            ("(s)*", "uuu", ('sss', 0, 3, False)),
            ("(s)*", "uus", ('sss', 0, 2, False)),
            ("(s)*", "usu", ('sss', 0, 2, False)),
            ("(s)*", "uss", ('sss', 0, 1, False)),
            ("(s)*", "suu", ('sss', 0, 2, False)),
            ("(s)*", "sus", ('sss', 0, 1, False)),
            ("(s)*", "ssu", ('sss', 0, 1, False)),
            ("(s)*", "sss", ('sss', 0, 0, False)),

            ("(sus)*(u)?", "suu", ('sus', 0, 1, False)),

            ("((sus)*u)*s", "susss", ('susus', 0, 1, False)),

            ("(s((s)*u)*)*", "susss", ('susss', 0, 0, False)),
            ("(s((s)*u)*)*", "usss", ('ssss', 0, 1, False)),
            ("(s((s)*u)*)*", "suuu", ('suuu', 0, 0, False)),
            ("(s((s)*u)*)*", "suuusuuus", ('suuusuuus', 0, 0, False)),

            ("(sss((sus)*uss)*)*", "ssssussususs", ('ssssussususs', 0, 0, False)),
            ("(sss((sus)*uss)*)*", "ssssuuuss", ('ssssususs', 0, 1, False)),

            ("((s)(u)?)*", "uuuu", ('susu', 0, 2, False)),
            ("((s)(u)?)*", "uuus", ('ssus', 0, 2, False)),
            ("((s)(u)?)*", "uusu", ('susu', 0, 1, False)),
            ("((s)(u)?)*", "uuss", ('suss', 0, 1, False)),
            ("((s)(u)?)*", "usuu", ('sssu', 0, 2, False)),
            ("((s)(u)?)*", "usus", ('ssus', 0, 1, False)),
            ("((s)(u)?)*", "ussu", ('sssu', 0, 1, False)),
            ("((s)(u)?)*", "usss", ('ssss', 0, 1, False)),
            ("((s)(u)?)*", "suuu", ('susu', 0, 1, False)),
            ("((s)(u)?)*", "suus", ('ssus', 0, 1, False)),
            ("((s)(u)?)*", "susu", ('susu', 0, 0, False)),
            ("((s)(u)?)*", "suss", ('suss', 0, 0, False)),
            ("((s)(u)?)*", "ssuu", ('sssu', 0, 1, False)),
            ("((s)(u)?)*", "ssus", ('ssus', 0, 0, False)),
            ("((s)(u)?)*", "sssu", ('sssu', 0, 0, False)),
            ("((s)(u)?)*", "ssss", ('ssss', 0, 0, False)),

            ("(s)?(u)?(S)?", "su", ('su', 0, 0, False)),
            ("(s)?(u)?(S)?", "ss", ('su', 0, 1, False)),
            ("(s)?(u)?(S)?", "uS", ('uS', 0, 0, False)),
            ("(s)?(u)?(S)?", "sS", ('sS', 0, 0, False)),

            ("(s)?(u)(s)*", "u", ('u', 0, 0, False)),
            ("(s)?(u)(s)*", "su", ('su', 0, 0, False)),
            ("(s)?(u)(s)*", "us", ('us', 0, 0, False)),
            ("(s)?(u)(s)*", "sus", ('sus', 0, 0, False)),
            ("(s)?(u)(s)*", "uss", ('uss', 0, 0, False)),

            ("(us)*(uS)(U)?(U)?", "usuS", ('usuS', 0, 0, False)),
            ("(us)*(uS)(U)?(U)?", "uSUU", ('uSUU', 0, 0, False)),

            ("(su(u)?)*", "su", ('su', 0, 0, False)),
            ("(su(u)?)*", "suu", ('suu', 0, 0, False)),
            ("(su(u)?)*", "susu", ('susu', 0, 0, False)),
            ("(su(u)?)*", "suusuu", ('suusuu', 0, 0, False)),
            ("(su(u)?)*", "ssussu", ('suusuu', 0, 2, False)),

            ("(u)?(u)?((s)(u)?(u)?)*(S)(U)?(U)?", "sssuSU", ('sssuSU', 0, 0, False)),
            ("(u)?(u)?((s)(u)?(u)?)*(S)(U)?(U)?", "ussuSU", ('ussuSU', 0, 0, False)),
            ("(u)?(u)?((s)(u)?(u)?)*(S)(U)?(U)?", "susuuSU", ('susuuSU', 0, 0, False)),
            ("(u)?(u)?((s)(u)?(u)?)*(S)(U)?(U)?", "uusuuSU", ('uusuuSU', 0, 0, False)),
        ]
        for pattern, string, expected in cases:
            self.assertEqual(PatternAnalyzer.count_errors(pattern, string), expected)
74 |
--------------------------------------------------------------------------------
/rupo/main/tokenizer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Модуль токенизации.
4 |
5 | import re
6 | from typing import List
7 | from enum import Enum, unique
8 |
9 | from rupo.settings import HYPHEN_TOKENS
10 |
11 |
class Token:
    @unique
    class TokenType(Enum):
        """
        Kind of a token produced by the tokenizer.
        """
        UNKNOWN = -1
        WORD = 0
        PUNCTUATION = 1
        SPACE = 2
        ENDLINE = 3
        NUMBER = 4

        def __str__(self):
            return str(self.name)

        def __repr__(self):
            return self.__str__()

    def __init__(self, text: str, token_type: TokenType, begin: int, end: int):
        """
        :param text: the token's text.
        :param token_type: kind of the token.
        :param begin: start position of the token in the source text.
        :param end: end position of the token in the source text.
        """
        self.text = text
        self.token_type = token_type
        self.begin = begin
        self.end = end

    def __str__(self):
        # e.g. 'word'|WORD (0, 4)
        return "'%s'|%s (%s, %s)" % (self.text, self.token_type, self.begin, self.end)

    def __repr__(self):
        return self.__str__()

    def __eq__(self, other):
        # Positions are deliberately ignored: only content and kind matter.
        return (self.text, self.token_type) == (other.text, other.token_type)
51 |
52 |
class Tokenizer(object):
    """
    Tokenizer for Russian texts, aware of punctuation and hyphenated words.
    """
    @staticmethod
    def tokenize(text: str, remove_punct=False, remove_unknown=False, replace_numbers=False) -> List[Token]:
        """
        Tokenize a Russian text, keeping punctuation and hyphenated words.

        :param text: source text.
        :param remove_punct: drop punctuation tokens from the result.
        :param remove_unknown: drop tokens of unknown type from the result.
        :param replace_numbers: rewrite number tokens as the word "ЧИСЛО".
        :return: list of tokens.
        """
        punctuation = ".,?:;!—"
        tokens = []
        word_begin = -1
        for pos, symbol in enumerate(text):
            # Letters and hyphens accumulate into the current word span.
            if symbol.isalpha() or symbol == "-":
                if word_begin == -1:
                    word_begin = pos
                continue
            # Any other symbol closes the pending word span, if any.
            if word_begin != -1:
                tokens.append(Tokenizer.__form_token(text, word_begin, pos))
                word_begin = -1
            if symbol in punctuation:
                kind = Token.TokenType.PUNCTUATION
            elif symbol == "\n":
                kind = Token.TokenType.ENDLINE
            elif symbol == " ":
                kind = Token.TokenType.SPACE
            elif symbol.isdigit():
                kind = Token.TokenType.NUMBER
            else:
                kind = Token.TokenType.UNKNOWN
            # Runs of same-typed symbols are merged into a single token.
            if tokens and tokens[-1].token_type == kind:
                tokens[-1].text += symbol
                tokens[-1].end += 1
            else:
                tokens.append(Token(symbol, kind, pos, pos + 1))
        if word_begin != -1:
            tokens.append(Tokenizer.__form_token(text, word_begin, len(text)))
        tokens = Tokenizer.__hyphen_map(tokens)
        if remove_punct:
            tokens = [token for token in tokens if token.token_type != Token.TokenType.PUNCTUATION]
        if remove_unknown:
            tokens = [token for token in tokens if token.token_type != Token.TokenType.UNKNOWN]
        if replace_numbers:
            for token in tokens:
                if token.token_type == Token.TokenType.NUMBER:
                    token.text = "ЧИСЛО"
                    token.token_type = Token.TokenType.WORD
        return tokens

    @staticmethod
    def __form_token(text, begin, end):
        """Build a WORD token for text[begin:end]; a lone hyphen becomes punctuation."""
        word = text[begin:end]
        if word == "-":
            return Token("-", Token.TokenType.PUNCTUATION, begin, begin + 1)
        return Token(word, Token.TokenType.WORD, begin, end)

    @staticmethod
    def __hyphen_map(tokens: List[Token]) -> List[Token]:
        """
        Keep dictionary words with their hyphen, split all other hyphenated words.

        :param tokens: input tokens.
        :return: tokens after hyphen processing.
        """
        result = []
        hyphen_tokens = Tokenizer.__get_hyphen_tokens()
        for token in tokens:
            if token.token_type != Token.TokenType.WORD or "-" not in token.text:
                result.append(token)
                continue
            keep_whole = any(known in token.text or token.text in known for known in hyphen_tokens)
            if keep_whole:
                result.append(token)
                continue
            offset = token.begin
            for part in token.text.split("-"):
                result.append(Token(part, Token.TokenType.WORD, offset, offset + len(part)))
                offset += len(part) + 1
        return result

    @staticmethod
    def __get_hyphen_tokens():
        """
        :return: contents of the dictionary of hyphenated words.
        """
        with open(HYPHEN_TOKENS, "r", encoding="utf-8") as file:
            return [line.strip() for line in file]
151 |
152 |
class SentenceTokenizer(object):
    """
    Splits text into sentences at runs of spaces that follow a sentence
    terminator and precede a capitalized Cyrillic letter.
    """
    # Compiled once at class creation; the lookbehind excludes terminators
    # right after capitals (e.g. initials) from acting as sentence borders.
    SENTENCE_BORDER = re.compile(r'(?<=[^А-ЯЁ].[^А-ЯЁ][.?!;]) +(?=[А-ЯЁ])')

    @staticmethod
    def tokenize(text: str) -> List[str]:
        return SentenceTokenizer.SENTENCE_BORDER.split(text)
158 |
--------------------------------------------------------------------------------
/rupo/files/reader.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Считыватель файлов разных расширений.
4 |
5 | import os
6 | import xml.etree.ElementTree as etree
7 | import json
8 | from enum import Enum
9 | from typing import Iterator
10 |
11 | from rupo.main.markup import Markup
12 | from rupo.metre.metre_classifier import MetreClassifier
13 | from rupo.stress.predictor import StressPredictor
14 |
15 |
# Separator between entries in RAW (.txt) sources; read_texts splits on it.
RAW_SEPARATOR = "\n\n\n"
17 |
18 |
class FileType(Enum):
    """
    Supported source file type; the value is the file extension used to
    match files in Reader.get_paths.
    """
    RAW = ".txt"    # plain text, entries separated by RAW_SEPARATOR
    XML = ".xml"    # <markup>/<item> elements, parsed lazily
    JSON = ".json"  # {"items": [...]} documents
    VOCAB = ".voc"  # tab-separated "word<TAB>index" vocabulary lines
27 |
28 |
class Reader(object):
    """
    Reading markups, texts and vocabularies from files.
    """
    @staticmethod
    def read_markups(path: str, source_type: FileType, is_processed: bool,
                     stress_predictor: StressPredictor=None) -> Iterator[Markup]:
        """
        Read markups (including markup built on the fly from raw texts).

        :param path: path to a file or a directory.
        :param source_type: type of the files.
        :param is_processed: are the texts already marked up?
        :param stress_predictor: stress predictor (required for unprocessed texts).
        """
        paths = Reader.get_paths(path, source_type.value)
        for filename in paths:
            with open(filename, "r", encoding="utf-8") as file:
                if is_processed:
                    if source_type == FileType.XML:
                        for elem in Reader.__xml_iter(file, 'markup'):
                            yield Markup().from_xml(etree.tostring(elem, encoding='utf-8', method='xml'))
                    elif source_type == FileType.JSON:
                        j = json.load(file)
                        for item in j['items']:
                            yield Markup().from_dict(item)
                    elif source_type == FileType.RAW:
                        # Entries are separated by three blank lines (RAW_SEPARATOR).
                        separator_count = 0
                        text = ""
                        for line in file:
                            if line == "\n":
                                separator_count += 1
                            else:
                                text += line
                            if separator_count == 3:
                                separator_count = 0
                                if text:
                                    yield Markup().from_raw(text)
                                # BUGFIX: the accumulated text must be reset after
                                # yielding; previously every subsequent markup
                                # contained all preceding texts as well.
                                text = ""
                        if text != "":
                            yield Markup().from_raw(text)
                else:
                    assert stress_predictor is not None
                    for text in Reader.read_texts(filename, source_type):
                        yield Reader.__markup_text(text, stress_predictor)

    @staticmethod
    def read_vocabulary(path: str):
        """
        Read a vocabulary.

        :param path: path to the vocabulary file(s).
        :return: pairs of (marked-up word, its index).
        """
        paths = Reader.get_paths(path, FileType.VOCAB.value)
        for filename in paths:
            with open(filename, "r", encoding="utf-8") as file:
                for line in file:
                    fields = line.strip().split('\t')
                    yield Markup().from_raw(fields[0]).lines[0].words[0], int(fields[1])

    @staticmethod
    def read_texts(path: str, source_type: FileType) -> Iterator[str]:
        """
        Read plain texts.

        :param path: path to a file or a directory.
        :param source_type: type of the files.
        """
        paths = Reader.get_paths(path, source_type.value)
        for filename in paths:
            with open(filename, "r", encoding="utf-8") as file:
                if source_type == FileType.XML:
                    for elem in Reader.__xml_iter(file, 'item'):
                        yield elem.find(".//text").text
                elif source_type == FileType.JSON:
                    # TODO: lazy parsing
                    j = json.load(file)
                    for item in j['items']:
                        yield item['text']
                elif source_type == FileType.RAW:
                    text = file.read()
                    for t in text.split(RAW_SEPARATOR):
                        yield t

    @staticmethod
    def get_paths(path: str, ext: str) -> Iterator[str]:
        """
        Find all files with the given extension under the given path.

        :param path: path to a file or a directory.
        :param ext: required extension (including the dot).
        """
        if os.path.isfile(path):
            if ext == os.path.splitext(path)[1]:
                yield path
        else:
            # os.walk already descends into subdirectories, so no manual
            # recursion is needed. (The previous version attempted it with
            # `return Reader.get_paths(folder, ext)` inside this generator,
            # which silently terminated iteration instead of yielding, and
            # used folder names not joined with their root.)
            for root, _, files in os.walk(path):
                for file in files:
                    if ext == os.path.splitext(file)[1]:
                        yield os.path.join(root, file)

    @staticmethod
    def __markup_text(text: str, stress_predictor: StressPredictor) -> Markup:
        """
        Mark up a single text and improve the markup with the metre classifier.

        :param text: the text.
        :return: the markup.
        """
        markup = Markup.process_text(text, stress_predictor)
        markup = MetreClassifier.improve_markup(markup)[0]
        return markup

    @staticmethod
    def __xml_iter(file, tag):
        """
        :param file: an xml file.
        :param tag: the tag of interest.
        :return: all elements with the given tag, streamed via iterparse.
        """
        return (elem for event, elem in etree.iterparse(file, events=['end']) if event == 'end' and elem.tag == tag)
151 |
--------------------------------------------------------------------------------
/rupo/metre/test_metre_classifier.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты к классификатору метра.
4 |
5 | import unittest
6 | import jsonpickle
7 | import copy
8 | import logging
9 | import sys
10 |
11 | from rupo.main.markup import Markup
12 | from rupo.stress.predictor import CombinedStressPredictor
13 | from rupo.metre.metre_classifier import MetreClassifier, ClassificationResult, StressCorrection
14 | from rupo.settings import RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT, CMU_DICT, RU_GRAPHEME_STRESS_PATH, \
15 | RU_GRAPHEME_STRESS_TRIE_PATH
16 |
17 |
class TestMetreClassifier(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.stress_predictor = CombinedStressPredictor(
            stress_model_path=RU_STRESS_DEFAULT_MODEL,
            zalyzniak_dict=ZALYZNYAK_DICT,
            cmu_dict=CMU_DICT,
            raw_stress_dict_path=RU_GRAPHEME_STRESS_PATH,
            stress_trie_path=RU_GRAPHEME_STRESS_TRIE_PATH
        )
        logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    @classmethod
    def tearDownClass(cls):
        del cls.stress_predictor

    def _improve(self, text):
        """Mark up the text and run metre-based improvement over the markup."""
        return MetreClassifier.improve_markup(Markup.process_text(text, self.stress_predictor))

    def test_classification_result(self):
        result = ClassificationResult(5)
        result.additions["iambos"].append(StressCorrection(0, 0, 0, "", 0))
        self.assertEqual(result, jsonpickle.decode(result.to_json()))

    def test_metre_classifier1(self):
        text = ("Горит восток зарёю новой.\n"
                "Уж на равнине, по холмам\n"
                "Грохочут пушки. Дым багровый\n"
                "Кругами всходит к небесам.")
        markup, result = self._improve(text)
        self.assertIsInstance(markup, Markup)
        self.assertIsInstance(result, ClassificationResult)
        self.assertEqual(result.metre, "iambos")

    def test_metre_classifier2(self):
        text = ("Буря мглою небо кроет,\n"
                "Вихри снежные крутя;\n"
                "То, как зверь, она завоет,\n"
                "То заплачет, как дитя...")
        markup, result = self._improve(text)
        self.assertEqual(result.metre, "choreios")

    def test_metre_classifier3(self):
        text = ("На стеклах нарастает лед,\n"
                "Часы твердят: «Не трусь!»\n"
                "Услышать, что ко мне идет,\n"
                "И мертвой я боюсь.\n"
                "Как идола, молю я дверь;\n"
                "«Не пропускай беду!»\n"
                "Кто воет за стеной, как зверь,\n"
                "Кто прячется в саду?")
        markup, result = self._improve(text)
        self.assertEqual(result.metre, "iambos")

    def test_metre_classifier4(self):
        text = ("Вот уж вечер. Роса\n"
                "Блестит на крапиве.\n"
                "Я стою у дороги,\n"
                "Прислонившись к иве.\n"
                "От луны свет большой\n"
                "Прямо на нашу крышу.\n"
                "Где-то песнь соловья\n"
                "Хорошо и тепло,\n"
                "Как зимой у печки.\n"
                "И березы стоят,\n"
                "Как большие свечки.\n"
                "И вдали за рекой,\n"
                "Видно, за опушкой,\n"
                "Сонный сторож стучит\n"
                "Мертвой колотушкой.")
        markup, result = self._improve(text)
        self.assertIn(result.metre, ("dolnik3", "dolnik2"))

    def test_metre_classifier5(self):
        text = ("Глыбу кварца разбили молотом,\n"
                "И, веселым огнем горя,\n"
                "Заблестели крупинки золота\n"
                "В свете тусклого фонаря.\n"
                "И вокруг собрались откатчики:\n"
                "Редкий случай, чтоб так, в руде!\n"
                "И от ламп заплясали зайчики,\n"
                "Отражаясь в черной воде...\n"
                "Прислонившись к мокрой стене,\n"
                "Мы стояли вокруг.\n"
                "Курили,\n"
                "Прислонившись к мокрой стене,\n"
                "И мечтательно говорили\n"
                "Не о золоте — о весне.\n"
                "И о том, что скоро, наверно,\n"
                "На заливе вспотеет лед\n"
                "И, снега огласив сиреной,\n"
                "Наконец придет пароход...\n"
                "Покурили еще немного,\n"
                "Золотинки в кисет смели\n"
                "И опять — по своим дорогам,\n"
                "К вагонеткам своим пошли.\n"
                "Что нам золото? В дни тяжелые\n"
                "Я от жадности злой не слеп.\n"
                "Самородки большие, желтые\n"
                "Отдавал за табак и хлеб.\n"
                "Не о золоте были мысли...\n"
                "В ночь таежную у костра\n"
                "Есть над чем поразмыслить в жизни,\n"
                "Кроме\n"
                "Золота-серебра.")
        markup, result = self._improve(text)
        self.assertIn(result.metre, ("dolnik3", "dolnik2"))

    def test_metre_classifier6(self):
        text = ("Лючинь печальная читала вечером ручьисто-вкрадчиво,\n"
                "Так чутко чувствуя журчащий вычурно чужой ей плач,\n"
                "И, в человечестве чтя нечто вечное, чем чушь Бокаччио,\n"
                "От чар отчаянья кручинно-скучная, чла час удач.")
        markup, result = self._improve(text)
        self.assertEqual(result.metre, "iambos")

    def test_improve(self):
        text = ("Буря мглою небо кроет,\n"
                "Вихри снежные крутя;\n"
                "То, как зверь, она завоет,\n"
                "То заплачет, как дитя...")
        initial_markup = Markup.process_text(text, self.stress_predictor)
        markup, result = MetreClassifier.improve_markup(copy.deepcopy(initial_markup))
        self.assertNotEqual(markup.lines[0].words[0].syllables[0].stress, -1)
        self.assertEqual(markup.lines[0].words[0].syllables[1].stress, -1)
140 |
141 |
142 |
143 |
144 |
--------------------------------------------------------------------------------
/rupo/data/examples/markup.xml:
--------------------------------------------------------------------------------
1 | 2- 0Забывши волнения жизни мятежной,33
- 0-10За2
- 132бы4
- 2-14вши7
0Забывши7- 0-10во2
- 142лне5
- 2-15ни7
- 3-17я8
8волнения16- 010жи2
- 1-12зни5
17жизни22- 0-10мя2
- 132те4
- 2-14жной8
23мятежной31 - 33Один жил в пустыне рыбак молодой.67
- 0-10О1
- 121дин4
33Один37- 010жил3
38жил41- 42в43
- 0-10пу2
- 142сты5
- 2-15не7
44пустыне51- 0-10ры2
- 132бак5
52рыбак57- 0-10мо2
- 1-12ло4
- 254дой7
58молодой65 - 67Однажды на скале прибрежной,96
- 0-10О1
- 131дна4
- 2-14жды7
67Однажды74- 010на2
75на77- 020ска3
- 1-13ле5
78скале83- 0-10при3
- 153бре6
- 2-16жной10
84прибрежной94 - 96Над тихой прозрачной рекой123
- 010Над3
96Над99- 010ти2
- 1-12хой5
100тихой105- 0-10про3
- 153зра6
- 2-16чной10
106прозрачной116- 0-10ре2
- 132кой5
117рекой122 - 123Он с удой беспечно142
- 000Он2
123Он125- 126с127
- 000у1
- 1-11дой4
128удой132- 0-10бе2
- 142спе5
- 2-15чно8
133беспечно141 - 142Сидел148
- 0-10Си2
- 132дел5
142Сидел147 - 148И думой сердечной166
- 000И1
148И149- 010ду2
- 1-12мой5
150думой155- 0-10сер3
- 143де5
- 2-15чной9
156сердечной165 - 166К прошедшему счастью летел.193
- 166К167
- 0-10про3
- 143ше5
- 2-15дше8
- 3-18му10
168прошедшему178- 020сча3
- 1-13стью7
179счастью186- 0-10ле2
- 132тел5
187летел192 Забывши волнения жизни мятежной,\nОдин жил в пустыне рыбак молодой.\nОднажды на скале прибрежной,\nНад тихой прозрачной рекой\nОн с удой беспечно\nСидел\nИ думой сердечной\nК прошедшему счастью летел.
--------------------------------------------------------------------------------
/rupo/api.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Набор внешних методов для работы с библиотекой.
4 |
5 | import os
6 | from typing import List, Tuple, Dict
7 |
8 | from rulm.language_model import LanguageModel
9 |
10 | from rupo.files.reader import FileType, Reader
11 | from rupo.files.writer import Writer
12 | from rupo.main.markup import Markup
13 | from rupo.metre.metre_classifier import MetreClassifier, ClassificationResult
14 | from rupo.rhymes.rhymes import Rhymes
15 | from rupo.settings import ZALYZNYAK_DICT, CMU_DICT, DATA_DIR, DICT_DIR
16 | from rupo.stress.predictor import StressPredictor, CombinedStressPredictor
17 | from rupo.main.vocabulary import StressVocabulary, inflate_stress_vocabulary
18 | from rupo.generate.generator import Generator
19 |
20 | from allennlp.data.vocabulary import Vocabulary, DEFAULT_OOV_TOKEN
21 | from allennlp.common.util import END_SYMBOL
22 | from rulm.transform import ExcludeTransform
23 | from russ.syllables import get_syllables
24 |
25 |
class Engine:
    """
    Facade over the library: stress prediction, syllable splitting, markup,
    metre classification, rhyme detection and poem generation.

    The instance is configured with one language; every internal call now
    consistently uses ``self.language`` instead of silently falling back to
    the "ru" defaults (previously ``get_generator``, ``generate_markups``,
    ``is_rhyme`` and ``get_word_rhymes`` ignored the configured language).
    """
    def __init__(self, language="ru"):
        self.language = language  # type: str
        self.vocabulary = None  # type: StressVocabulary
        self.generator = None  # type: Generator
        self.stress_predictors = dict()  # type: Dict[str, StressPredictor]

    def load(self, stress_model_path: str, zalyzniak_dict: str, raw_stress_dict_path=None,
             stress_trie_path=None):
        """
        Prepare the engine: create the data directories if needed and warm up
        the stress predictor for the engine's language.
        """
        self.stress_predictors = dict()
        if not os.path.isdir(DATA_DIR):
            os.makedirs(DATA_DIR)
        if not os.path.isdir(DICT_DIR):
            os.makedirs(DICT_DIR)
        self.get_stress_predictor(self.language, stress_model_path, raw_stress_dict_path,
                                  stress_trie_path, zalyzniak_dict)

    def get_vocabulary(self, dump_path: str, markup_path: str) -> StressVocabulary:
        """
        Lazily build the stress vocabulary, preferring a saved dump over
        re-parsing the markups.

        :param dump_path: path to a saved vocabulary dump.
        :param markup_path: path to markups (used when no dump exists).
        """
        if self.vocabulary is None:
            self.vocabulary = StressVocabulary()
            if os.path.isfile(dump_path):
                self.vocabulary.load(dump_path)
            elif markup_path is not None:
                self.vocabulary.parse(markup_path)
        return self.vocabulary

    def get_generator(self,
                      model_path: str,
                      token_vocab_path: str,
                      stress_vocab_dump_path: str) -> Generator:
        """
        Lazily build the poem generator from a trained language model,
        inflating (and caching) the stress vocabulary if needed.
        """
        if self.generator is None:
            assert os.path.isdir(model_path) and os.path.isdir(token_vocab_path)
            vocabulary = Vocabulary.from_files(token_vocab_path)
            stress_vocabulary = StressVocabulary()
            if not os.path.isfile(stress_vocab_dump_path):
                # Fixed: use the engine's configured language, not the "ru" default.
                stress_vocabulary = inflate_stress_vocabulary(vocabulary, self.get_stress_predictor(self.language))
                stress_vocabulary.save(stress_vocab_dump_path)
            else:
                stress_vocabulary.load(stress_vocab_dump_path)

            # Never sample the OOV or end-of-sequence token while generating.
            eos_index = vocabulary.get_token_index(END_SYMBOL)
            unk_index = vocabulary.get_token_index(DEFAULT_OOV_TOKEN)
            exclude_transform = ExcludeTransform((unk_index, eos_index))

            model = LanguageModel.load(model_path, vocabulary_dir=token_vocab_path,
                                       transforms=[exclude_transform, ])
            self.generator = Generator(model, vocabulary, stress_vocabulary, eos_index)
        return self.generator

    def get_stress_predictor(self, language="ru", stress_model_path: str=None, raw_stress_dict_path=None,
                             stress_trie_path=None, zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT):
        """
        Lazily build and cache a stress predictor per language.
        """
        if self.stress_predictors.get(language) is None:
            self.stress_predictors[language] = CombinedStressPredictor(language, stress_model_path,
                                                                       raw_stress_dict_path, stress_trie_path,
                                                                       zalyzniak_dict, cmu_dict)
        return self.stress_predictors[language]

    def get_stresses(self, word: str, language: str="ru") -> List[int]:
        """
        :param word: the word.
        :param language: the language.
        :return: stress positions of the word.
        """
        return self.get_stress_predictor(language).predict(word)

    @staticmethod
    def get_word_syllables(word: str) -> List[str]:
        """
        :param word: the word.
        :return: its syllables.
        """
        return [syllable.text for syllable in get_syllables(word)]

    @staticmethod
    def count_syllables(word: str) -> int:
        """
        :param word: the word.
        :return: number of syllables in it.
        """
        return len(get_syllables(word))

    def get_markup(self, text: str, language: str="ru") -> Markup:
        """
        :param text: the text.
        :param language: the language.
        :return: its dictionary-based markup.
        """
        return Markup.process_text(text, self.get_stress_predictor(language))

    def get_improved_markup(self, text: str, language: str="ru") -> Tuple[Markup, ClassificationResult]:
        """
        :param text: the text.
        :param language: the language.
        :return: its markup improved by the metre classifier.
        """
        markup = Markup.process_text(text, self.get_stress_predictor(language))
        return MetreClassifier.improve_markup(markup)

    def classify_metre(self, text: str, language: str="ru") -> str:
        """
        :param text: the text.
        :param language: the language.
        :return: its metre.
        """
        return MetreClassifier.classify_metre(Markup.process_text(text, self.get_stress_predictor(language))).metre

    def generate_markups(self, input_path: str, input_type: FileType, output_path: str, output_type: FileType) -> None:
        """
        Generate markups from texts.

        :param input_path: path to a folder/file with the texts.
        :param input_type: type of the input files.
        :param output_path: path to the resulting markup file.
        :param output_type: type of the resulting file.
        """
        # Fixed: use the engine's configured language, not the "ru" default.
        markups = Reader.read_markups(input_path, input_type, False, self.get_stress_predictor(self.language))
        writer = Writer(output_type, output_path)
        writer.open()
        for markup in markups:
            writer.write_markup(markup)
        writer.close()

    def is_rhyme(self, word1: str, word2: str) -> bool:
        """
        :param word1: first word.
        :param word2: second word.
        :return: whether the words rhyme.
        """
        # Fixed: mark up both words in the engine's configured language.
        markup_word1 = self.get_markup(word1, self.language).lines[0].words[0]
        markup_word1.set_stresses(self.get_stresses(word1, self.language))
        markup_word2 = self.get_markup(word2, self.language).lines[0].words[0]
        markup_word2.set_stresses(self.get_stresses(word2, self.language))
        return Rhymes.is_rhyme(markup_word1, markup_word2)

    def generate_poem(self,
                      model_path: str,
                      token_vocab_path: str=None,
                      stress_vocab_path: str=None,
                      metre_schema: str="-+",
                      rhyme_pattern: str="abab",
                      n_syllables: int=8,
                      sampling_k: int=None,
                      beam_width: int=None,
                      seed: int=1337,
                      temperature: float=1.0,
                      last_text: str="") -> str:
        """
        Generate a poem. Either sampling_k or beam_width must be set.

        :param model_path: path to the model.
        :param token_vocab_path: path to the token vocabulary.
        :param stress_vocab_path: path to the stress vocabulary.
        :param metre_schema: metre schema.
        :param rhyme_pattern: rhyme pattern.
        :param n_syllables: number of syllables per line.
        :param sampling_k: top-k for sampling.
        :param beam_width: beam search width.
        :param seed: random seed.
        :param temperature: generation temperature.
        :param last_text: the final line.
        :return: the poem, or None if generation failed.
        """
        token_vocab_path = token_vocab_path or os.path.join(model_path, "vocabulary")
        stress_vocab_path = stress_vocab_path or os.path.join(model_path, "stress.pickle")
        generator = self.get_generator(model_path, token_vocab_path, stress_vocab_path)
        poem = generator.generate_poem(
            metre_schema=metre_schema,
            rhyme_pattern=rhyme_pattern,
            n_syllables=n_syllables,
            sampling_k=sampling_k,
            beam_width=beam_width,
            temperature=temperature,
            seed=seed,
            last_text=last_text
        )
        return poem

    def get_word_rhymes(self, word: str, vocab_dump_path: str, markup_path: str=None) -> List[str]:
        """
        Find rhymes for the given word.

        :param word: the word.
        :param vocab_dump_path: path where the vocabulary dump is stored.
        :param markup_path: path to the markups.
        :return: list of rhymes.
        """
        # Fixed: mark up the query word in the engine's configured language.
        markup_word = self.get_markup(word, self.language).lines[0].words[0]
        markup_word.set_stresses(self.get_stresses(word, self.language))
        rhymes = []
        vocabulary = self.get_vocabulary(vocab_dump_path, markup_path)
        for i in range(vocabulary.size()):
            candidate = vocabulary.get_word(i)  # fetch once per index
            if Rhymes.is_rhyme(markup_word, candidate):
                rhymes.append(candidate.text.lower())
        return rhymes
220 |
--------------------------------------------------------------------------------
/rupo/main/markup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Модуль для описания разметки по ударениям и слогам.
4 |
5 | import json
6 | from typing import List, Set
7 | import xml.etree.ElementTree as etree
8 |
9 | from dicttoxml import dicttoxml
10 |
11 | from rupo.util.preprocess import get_first_vowel_position
12 | from rupo.util.mixins import CommonMixin
13 | from rupo.main.tokenizer import Tokenizer, Token
14 | from rupo.util.timeit import timeit
15 | from russ.syllables import get_syllables
16 |
17 |
class Annotation(CommonMixin):
    """
    Base annotation: a span in the source text (begin/end positions)
    together with the annotated text itself.
    """
    def __init__(self, begin: int, end: int, text: str) -> None:
        self.text = text
        self.begin = begin
        self.end = end
27 |
28 |
class Syllable(Annotation):
    """
    Syllable markup: the annotation plus the syllable index and its stress.
    ``stress`` is -1 when the stress falls on another syllable.
    """
    def __init__(self, begin: int, end: int, number: int, text: str, stress: int=-1) -> None:
        super().__init__(begin, end, text)
        self.number = number
        self.stress = stress

    def vowel(self) -> int:
        """
        :return: position of this syllable's vowel within the word (0-based).
        """
        return self.begin + get_first_vowel_position(self.text)

    def from_dict(self, d: dict) -> 'Syllable':
        self.__dict__.update(d)
        # Older dumps stored the stress under the key "accent".
        if "accent" in d:
            self.stress = d["accent"]
        return self
50 |
51 |
class Word(Annotation):
    """
    Word markup: the word's annotation and its syllables.
    """
    def __init__(self, begin: int, end: int, text: str, syllables: List[Syllable]) -> None:
        super().__init__(begin, end, text)
        self.syllables = syllables

    def count_stresses(self) -> int:
        """
        :return: number of stressed syllables in the word.
        """
        return sum(1 for syllable in self.syllables if syllable.stress != -1)

    def stress(self) -> int:
        """
        :return: the last stress in the word, or -1 if there is none.
        """
        for syllable in reversed(self.syllables):
            if syllable.stress != -1:
                return syllable.stress
        return -1

    def get_stressed_syllables_numbers(self) -> List[int]:
        """
        :return: indices of the syllables carrying a stress.
        """
        return [s.number for s in self.syllables if s.stress != -1]

    def get_stresses(self) -> Set[int]:
        """
        :return: all stress positions.
        """
        return {s.stress for s in self.syllables if s.stress != -1}

    def set_stresses(self, stresses: List[int]) -> None:
        """
        Set the given stresses; every other stress is cleared.

        :param stresses: stress positions within the word.
        """
        for syllable in self.syllables:
            vowel_pos = syllable.vowel()
            syllable.stress = vowel_pos if vowel_pos in stresses else -1

    def get_short(self) -> str:
        """
        :return: the word in the form "text" + "last stress".
        """
        return "{}{}".format(self.text.lower(), self.stress())

    def from_dict(self, d: dict) -> 'Word':
        self.__dict__.update(d)
        raw_syllables = d["syllables"]  # type: List[dict]
        self.syllables = [Syllable(0, 0, 0, "").from_dict(s) for s in raw_syllables]
        return self

    def to_stressed_word(self):
        from rupo.stress.word import StressedWord, Stress
        return StressedWord(self.text, {Stress(pos, Stress.Type.PRIMARY) for pos in self.get_stresses()})

    def __hash__(self) -> int:
        """
        :return: hash of the word markup.
        """
        return hash(self.get_short())
125 |
126 |
class Line(Annotation):
    """
    Line markup: the line's annotation and its words.
    """
    def __init__(self, begin: int, end: int, text: str, words: List[Word]) -> None:
        super().__init__(begin, end, text)
        self.words = words

    def from_dict(self, d) -> 'Line':
        self.__dict__.update(d)
        raw_words = d["words"]  # type: List[dict]
        self.words = [Word(0, 0, "", []).from_dict(w) for w in raw_words]
        return self

    def count_vowels(self):
        # A syllable is counted iff its text actually contains a vowel.
        return sum(
            1
            for word in self.words
            for syllable in word.syllables
            if get_first_vowel_position(syllable.text) != -1
        )
148 |
149 |
class Markup(CommonMixin):
    """
    Data class for a full markup, with XML and JSON import/export.
    """
    def __init__(self, text: str=None, lines: List[Line]=None) -> None:
        # Raw source text and its per-line markup; version guards dump compatibility.
        self.text = text
        self.lines = lines
        self.version = 2

    def to_json(self) -> str:
        """Serialize to a JSON string (non-ASCII characters kept as-is)."""
        return json.dumps(self.to_dict(), ensure_ascii=False)

    def from_json(self, st) -> 'Markup':
        """Load the markup from a JSON string."""
        d = json.loads(st)
        return self.from_dict(d)

    def from_dict(self, d) -> 'Markup':
        """Load the markup from a plain dict (as produced by to_dict)."""
        self.__dict__.update(d)
        lines = d["lines"]  # type: List[dict]
        self.lines = [Line(0, 0, "", []).from_dict(line) for line in lines]
        return self

    def to_xml(self) -> str:
        """
        Export to XML.

        :return self: XML string
        """
        # Newlines are escaped to "\\n"; from_xml reverses this.
        return dicttoxml(self.to_dict(), custom_root='markup', attr_type=False).decode('utf-8').replace("\n", "\\n")

    def from_xml(self, xml: str) -> 'Markup':
        """
        Import from XML.

        :param xml: XML markup
        :return self: the resulting Markup object
        """
        root = etree.fromstring(xml)
        # Refuse dumps produced by a different markup version.
        if root.find("version") is None or int(root.find("version").text) != self.version:
            raise TypeError("Другая версия разметки")
        lines_node = root.find("lines")
        lines = []
        for line_node in lines_node.findall("item"):
            words_node = line_node.find("words")
            words = []
            for word_node in words_node.findall("item"):
                syllables_node = word_node.find("syllables")
                syllables = []
                for syllable_node in syllables_node.findall("item"):
                    # Older dumps store the stress under "accent" instead of "stress".
                    stress_node = syllable_node.find("accent") \
                        if syllable_node.find("accent") is not None \
                        else syllable_node.find("stress")
                    stress = int(stress_node.text)
                    syllables.append(Syllable(int(syllable_node.find("begin").text),
                                              int(syllable_node.find("end").text),
                                              int(syllable_node.find("number").text),
                                              syllable_node.find("text").text,
                                              stress))
                words.append(Word(int(word_node.find("begin").text), int(word_node.find("end").text),
                                  word_node.find("text").text, syllables))
            lines.append(Line(int(line_node.find("begin").text), int(line_node.find("end").text),
                              line_node.find("text").text, words))
        # Undo the newline escaping applied by to_xml.
        self.text = root.find("text").text.replace("\\n", "\n")
        self.lines = lines
        return self

    def from_raw(self, text: str) -> 'Markup':
        """
        Import from raw text in which every word carries its stress position
        appended as a trailing integer (possibly -1).

        :param text: the text.
        :return: the markup.
        """

        pos = 0
        lines = []
        for line in text.split("\n"):
            if line == "":
                continue
            line_tokens = []
            for word in line.split(" "):
                # Peel the trailing integer off the word, scanning characters
                # from the end; digits are collected in reverse order.
                i = -1
                ch = word[i]
                stress = ""
                while ch.isdigit() or ch == "-":
                    stress += ch
                    i -= 1
                    ch = word[i]
                line_tokens.append((word[:i+1], int(stress[::-1])))
            words = []
            line_begin = pos
            for pair in line_tokens:
                token = pair[0]
                stress = pair[1]
                syllables = get_syllables(token)
                # Shift syllable offsets from word-local to text-global positions.
                for j in range(len(syllables)):
                    syllables[j].begin += pos
                    syllables[j].end += pos
                word = Word(pos, pos + len(token), token, syllables)
                word.set_stresses([stress])
                words.append(word)
                pos += len(token) + 1  # +1 for the separator after the token
            lines.append(Line(line_begin, pos, " ".join([pair[0] for pair in line_tokens]), words))
        self.text = "\n".join([line.text for line in lines])
        self.lines = lines
        return self

    @staticmethod
    @timeit
    def process_text(text: str, stress_predictor) -> 'Markup':
        """
        Build the initial syllable-and-stress markup for a text.

        :param text: the text to mark up
        :param stress_predictor: stress predictor.
        :return markup: markup with syllables and stresses
        """
        begin_line = 0
        lines = []
        words = []
        text_lines = text.split("\n")
        for text_line in text_lines:
            tokens = [token for token in Tokenizer.tokenize(text_line) if token.token_type == Token.TokenType.WORD]
            for token in tokens:
                word = Word(begin_line + token.begin, begin_line + token.end, token.text, get_syllables(token.text))
                # Predict the stresses.
                stresses = stress_predictor.predict(token.text.lower())
                # Map stresses onto syllables; single-syllable words are left unstressed here.
                if len(word.syllables) > 1:
                    word.set_stresses(stresses)
                words.append(word)
            end_line = begin_line + len(text_line)
            lines.append(Line(begin_line, end_line, text_line, words))
            words = []
            begin_line = end_line + 1  # +1 accounts for the "\n" separator
        return Markup(text, lines)
286 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/rupo/metre/pattern_analyzer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Сопоставление шаблону.
4 |
5 | from typing import List, Set, Tuple
6 |
7 |
class TreeNode:
    """
    Node of the pattern parse tree.
    """
    leaf_chars = "usUS"
    non_leaf_chars = "*?w"

    def __init__(self, parent: 'TreeNode', children: List['TreeNode'], text: str, pattern_pos: int):
        """
        :param parent: parent node.
        :param children: child nodes.
        :param text: the character this node stands for.
        :param pattern_pos: position of the character within the pattern
        """
        self.parent = parent  # type: TreeNode
        self.children = children  # type: List[TreeNode]
        self.text = text  # type: str
        self.pattern_pos = pattern_pos  # type: int

    def get_level(self) -> int:
        """
        :return: depth of this node in the tree.
        """
        level = 0
        ancestor = self.parent
        while ancestor is not None:
            level += 1
            ancestor = ancestor.parent
        return level

    def get_next_sibling(self) -> 'TreeNode':
        """
        :return: the sibling immediately to the right, or None.
        """
        siblings = self.parent.children
        position = siblings.index(self) + 1
        return siblings[position] if position < len(siblings) else None

    def get_last_child_leaf(self) -> 'TreeNode':
        """
        :return: the last child that is a leaf, or None.
        """
        return next((child for child in reversed(self.children) if child.is_leaf()), None)

    def is_first_leaf(self) -> bool:
        if not self.is_leaf():
            return False
        leaves = [child for child in self.parent.children if child.is_leaf()]
        return leaves[0] == self

    def is_last_leaf(self) -> bool:
        if not self.is_leaf():
            return False
        leaves = [child for child in self.parent.children if child.is_leaf()]
        return leaves[-1] == self

    def get_most_left_leaf(self) -> 'TreeNode':
        """
        :return: the leftmost descendant leaf.
        """
        node = self
        while node.children:
            node = node.children[0]
        assert node.is_leaf()
        return node

    def print_tree(self) -> None:
        """
        Print the subtree rooted at this node.
        """
        stack = [self]
        while stack:
            node = stack.pop()
            print("\t" * node.get_level(), node)
            stack.extend(node.children)

    def is_leaf(self) -> bool:
        """
        :return: whether this node is a leaf of the tree.
        """
        return self.text in TreeNode.leaf_chars

    def __str__(self) -> str:
        return "{} {}".format(self.text, self.pattern_pos)

    def __repr__(self) -> str:
        return self.__str__()

    def __hash__(self):
        return hash(self.pattern_pos)

    def __eq__(self, other):
        return self.pattern_pos == other.pattern_pos
105 |
106 |
class State:
    """
    A parsing state.
    """
    def __init__(self, node: TreeNode, string_pos: int, strong_errors: int, weak_errors: int, pattern: str):
        """
        :param node: tree node this state corresponds to.
        :param string_pos: position within the matched string.
        :param strong_errors: number of errors on U and S.
        :param weak_errors: number of errors on u and s.
        :param pattern: the pattern path leading to this state.
        """
        self.node = node  # type: TreeNode
        self.string_pos = string_pos  # type: int
        self.strong_errors = strong_errors  # type: int
        self.weak_errors = weak_errors  # type: int
        self.pattern = pattern  # type: str

    def __str__(self) -> str:
        parts = (self.node, self.string_pos, self.strong_errors, self.weak_errors)
        return " ".join(str(part) for part in parts)

    def __repr__(self) -> str:
        return self.__str__()
130 |
131 |
class PatternAnalyzer:
    """
    Matcher of a pattern against a string.
    """
    def __init__(self, pattern: str, error_border: int=8):
        """
        :param error_border: error threshold; candidate states above it are pruned.
        :param pattern: the pattern.
        """
        self.pattern = pattern  # type: str
        self.tree = self.__build_tree(pattern)  # type: TreeNode
        self.error_border = error_border

    @staticmethod
    def count_errors(pattern: str, string: str, error_border: int=8) -> Tuple[str, int, int, bool]:
        """
        :param pattern: the pattern.
        :param string: the string.
        :param error_border: error threshold.
        :return: best pattern, strong error count, weak error count, and
            whether matching ended early because of errors.
        """
        analyzer = PatternAnalyzer(pattern, error_border)
        return analyzer.__accept(string)

    @staticmethod
    def __build_tree(pattern: str) -> TreeNode:
        """
        Build the pattern parse tree.

        :param pattern: the pattern.
        :return: the root of the tree.
        """
        root_node = TreeNode(None, list(), "R", -1)
        current_node = root_node
        for i, ch in enumerate(pattern):
            if ch == "(":
                node = TreeNode(current_node, list(), "()", i)
                current_node.children.append(node)
                current_node = node
            if ch == ")":
                node = current_node
                current_node = current_node.parent
                # Flatten meaningless parentheses (those not followed by * or ?).
                if i + 1 < len(pattern) and pattern[i + 1] not in "*?":
                    current_node.children = current_node.children[:-1] + node.children
                    for child in node.children:
                        child.parent = current_node
            if ch in TreeNode.leaf_chars:
                current_node.children.append(TreeNode(current_node, list(), ch, i))
            # Turn the bracket node into the corresponding non-terminal.
            if ch in TreeNode.non_leaf_chars:
                current_node.children[-1].text = ch
                current_node.children[-1].pattern_pos = i
        return root_node

    def __accept(self, string: str) -> Tuple[str, int, int, bool]:
        """
        :param string: the string.
        :return: best pattern, strong error count, weak error count, whether matching failed early.
        """
        current_states = [State(None, -1, 0, 0, "")]
        current_node = self.tree.get_most_left_leaf()
        for i, ch in enumerate(string):
            new_states = []
            for state in current_states:
                if state.node is not None:
                    current_node = self.__get_next_leaf(state.node)
                variants = self.__get_variants(current_node)

                # Every variant becomes a new state.
                for variant in variants:
                    assert variant.is_leaf()
                    strong_errors = state.strong_errors + int(variant.text.isupper() and variant.text != ch)
                    weak_errors = state.weak_errors + int(variant.text.islower() and variant.text != ch.lower())
                    new_state = State(variant, i, strong_errors, weak_errors, state.pattern+variant.text)
                    if new_state.strong_errors + new_state.weak_errors > self.error_border:
                        continue
                    new_states.append(new_state)

            if len(new_states) == 0:
                # We can stop early: either all branches were pruned by the error
                # threshold, or the pattern is shorter than the string.
                current_states = PatternAnalyzer.__filter_states(current_states, self.tree)
                pattern, strong_errors, weak_errors = self.__get_min_errors_from_states(current_states)
                diff = (len(string) - i)
                return pattern, strong_errors + diff, weak_errors + diff, True

            current_states = new_states
        current_states = PatternAnalyzer.__filter_states(current_states, self.tree)
        return self.__get_min_errors_from_states(current_states) + (False,)

    @staticmethod
    def __get_variants(current_node: TreeNode) -> Set[TreeNode]:
        """
        :param current_node: the current node.
        :return: variant nodes for the same string position, arising from * and ? in the pattern.
        """
        variants = set()
        current_variant = current_node
        while current_variant is not None:
            if current_variant not in variants:
                variants.add(current_variant)
            else:
                current_variant = current_variant.parent
            current_variant = PatternAnalyzer.__get_next_variant(current_variant)
        return variants

    @staticmethod
    def __get_next_variant(node: TreeNode) -> TreeNode:
        """
        Get the next variant among the variants of the current node.

        :param node: the current variant.
        :return: the next variant.
        """
        assert node.is_leaf()
        while node.parent is not None:
            parent = node.parent
            grandfather = parent.parent
            uncle = parent.get_next_sibling() if grandfather is not None else None
            is_variable = node.is_first_leaf() or not node.is_leaf()
            if is_variable and uncle is not None:
                return uncle.get_most_left_leaf()
            elif grandfather is not None and grandfather.text == "*" and grandfather.children[-1] == parent:
                return grandfather.get_most_left_leaf()
            if is_variable:
                node = parent
            else:
                break
        return None

    @staticmethod
    def __get_next_leaf(node: TreeNode) -> TreeNode:
        """
        Get the next leaf node.

        :param node: the current node.
        :return: the next node.
        """
        assert node.is_leaf()
        while node.parent is not None:
            sibling = node.get_next_sibling()
            if sibling is not None:
                return sibling.get_most_left_leaf()
            elif node.parent.text == "*":
                return node.parent.get_most_left_leaf()
            node = node.parent
        return None

    @staticmethod
    def __filter_states(states: List[State], root: TreeNode) -> List[State]:
        """
        Filter states by the presence of mandatory terminals.

        :param states: the states.
        :param root: the root of the tree.
        :return: the filtered states.
        """
        return [state for state in states if root.get_last_child_leaf() is None or
                state.node.pattern_pos >= root.get_last_child_leaf().pattern_pos]

    @staticmethod
    def __get_min_errors_from_states(states: List[State]) -> Tuple[str, int, int]:
        """
        :param states: the states.
        :return: best pattern, strong error count, weak error count.
        """
        if len(states) == 0:
            return "", 0, 0
        return min([(state.pattern, state.strong_errors, state.weak_errors) for i, state in enumerate(states)],
                   key=lambda x: (x[1], x[2], x[0]))
302 |
--------------------------------------------------------------------------------
/rupo/metre/metre_classifier.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Классификатор метра.
4 |
5 | from collections import OrderedDict
6 | from typing import List, Dict, Tuple
7 | import jsonpickle
8 | import logging
9 |
10 | from rupo.main.markup import Line, Markup
11 | from rupo.util.mixins import CommonMixin
12 | from rupo.metre.pattern_analyzer import PatternAnalyzer
13 | from rupo.util.preprocess import get_first_vowel_position
14 | from rupo.util.timeit import timeit
15 |
16 |
class StressCorrection(CommonMixin):
    """
    A single stress correction.
    """
    def __init__(self, line_number: int, word_number: int, syllable_number: int,
                 word_text: str, stress: int) -> None:
        """
        :param line_number: line index.
        :param word_number: word index.
        :param syllable_number: syllable index.
        :param word_text: text of the word.
        :param stress: stress position (0-based).
        """
        self.line_number, self.word_number = line_number, word_number
        self.syllable_number = syllable_number
        self.word_text, self.stress = word_text, stress
35 |
36 |
class ClassificationResult(CommonMixin):
    """
    Result of classifying a poem by metre.
    """
    def __init__(self, count_lines: int=0) -> None:
        """
        :param count_lines: number of lines in the poem.
        """
        # Hoist the metre-name iteration: it was repeated four times,
        # and iterating the dict directly makes `.keys()` redundant.
        metre_names = list(MetreClassifier.metres)
        self.metre = None
        self.count_lines = count_lines
        self.errors_count = {k: 0 for k in metre_names}  # type: Dict[str, int]
        self.corrections = {k: [] for k in metre_names}  # type: Dict[str, List[StressCorrection]]
        self.resolutions = {k: [] for k in metre_names}  # type: Dict[str, List[StressCorrection]]
        self.additions = {k: [] for k in metre_names}  # type: Dict[str, List[StressCorrection]]

    def get_metre_errors_count(self):
        """
        :return: error count for the chosen metre.
        """
        return self.errors_count[self.metre]

    def to_json(self):
        """
        :return: JSON serialization.
        """
        return jsonpickle.encode(self)

    @staticmethod
    def str_corrections(collection: List[StressCorrection]) -> str:
        """
        :param collection: list of corrections.
        :return: its string representation.
        """
        return "\n".join([str((item.word_text, item.syllable_number)) for item in collection])

    def __str__(self):
        st = "Метр: " + str(self.metre) + "\n"
        st += "Снятая омография: \n" + ClassificationResult.str_corrections(self.resolutions[self.metre]) + "\n"
        st += "Неправильные ударения: \n" + ClassificationResult.str_corrections(self.corrections[self.metre]) + "\n"
        st += "Новые ударения: \n" + ClassificationResult.str_corrections(self.additions[self.metre]) + "\n"
        return st
78 |
79 |
class ErrorsTableRecord:
    """
    One cell of the errors table: errors of a single line against a single metre.
    """
    def __init__(self, strong_errors, weak_errors, pattern, failed=False):
        """
        :param strong_errors: number of strong errors.
        :param weak_errors: number of weak errors.
        :param pattern: matched stress pattern.
        :param failed: whether the pattern analysis failed for this line.
        """
        self.strong_errors = strong_errors
        self.weak_errors = weak_errors
        self.pattern = pattern
        self.failed = failed

    def __str__(self):
        return "{0} {1} {2}".format(self.pattern, self.strong_errors, self.weak_errors)

    def __repr__(self):
        return str(self)
92 |
93 |
class ErrorsTable:
    """
    Per-line error table: one ErrorsTableRecord per line for every candidate metre.
    """
    def __init__(self, num_lines):
        """
        :param num_lines: number of lines in the poem.
        """
        metre_order = ("iambos", "choreios", "daktylos", "amphibrachys", "anapaistos",
                       "dolnik3", "dolnik2", "taktovik3", "taktovik2")
        # Per-metre weight applied to the normalized error sum.
        self.coef = OrderedDict(zip(metre_order,
                                    (0.3, 0.3, 0.4, 0.4, 0.4, 0.5, 0.5, 6.0, 6.0)))
        # Per-metre baseline penalty added to the final score.
        self.sum_coef = OrderedDict(zip(metre_order,
                                        (0.0, 0.0, 0.0, 0.0, 0.0, 0.035, 0.035, 0.10, 0.10)))
        self.num_lines = num_lines
        self.data = {}
        for metre_name in MetreClassifier.metres.keys():
            self.data[metre_name] = [ErrorsTableRecord(0, 0, "") for _ in range(num_lines)]

    def add_record(self, metre_name, line_num, strong_errors, weak_errors, pattern, failed=False):
        """
        Store the error record of one line for one metre.
        """
        self.data[metre_name][line_num] = ErrorsTableRecord(strong_errors, weak_errors, pattern, failed)

    def get_best_metre(self):
        """
        Pick the metre with the lowest weighted error score.

        Each line's errors are first normalized by the totals across all metres
        (in place), then summed per metre and combined with the weights.

        :return: name of the best-scoring metre.
        """
        # Normalize each line's errors by the totals over all metres.
        for row in range(self.num_lines):
            total_strong = sum(column[row].strong_errors for column in self.data.values())
            total_weak = sum(column[row].weak_errors for column in self.data.values())
            for column in self.data.values():
                if total_strong != 0:
                    column[row].strong_errors = column[row].strong_errors / float(total_strong)
                if total_weak != 0:
                    column[row].weak_errors = column[row].weak_errors / float(total_weak)
        # Aggregate the normalized errors per metre and apply the weights.
        scores = dict()
        for metre_name, column in self.data.items():
            strong_total = 0
            weak_total = 0
            for row in range(self.num_lines):
                strong_total += column[row].strong_errors
                weak_total += column[row].weak_errors
            scores[metre_name] = self.sum_coef[metre_name] + \
                (strong_total + weak_total / 2.0) * self.coef[metre_name] / self.num_lines
        logging.debug(scores)
        return min(scores, key=scores.get)
152 |
153 |
class MetreClassifier(object):
    """
    Metre classifier: counts deviations of a poem from the standard
    rhythm (metre) templates and picks the best-fitting metre.
    """
    # Metre templates over syllable patterns, regex-like:
    # s/S mark a stressed position (ict), u/U an unstressed one.
    metres = OrderedDict(
        [("iambos", '(us)*(uS)(U)?(U)?'),
         ("choreios", '(su)*(S)(U)?(U)?'),
         ("daktylos", '(suu)*(S)(U)?(U)?'),
         ("amphibrachys", '(usu)*(uS)(U)?(U)?'),
         ("anapaistos", '(uus)*(uuS)(U)?(U)?'),
         ("dolnik3", '(u)?(u)?((su)(u)?)*(S)(U)?(U)?'),
         ("dolnik2", '(u)?(u)?((s)(u)?)*(S)(U)?(U)?'),
         ("taktovik3", '(u)?(u)?((su)(u)?(u)?)*(S)(U)?(U)?'),
         ("taktovik2", '(u)?(u)?((s)(u)?(u)?)*(S)(U)?(U)?')
         ])

    # Lines longer than this many syllables are not analyzed.
    border_syllables_count = 20

    @staticmethod
    @timeit
    def classify_metre(markup):
        """
        Classify the poem's metre.

        :param markup: the markup of the poem.
        :return: classification result (ClassificationResult).
        """
        result = ClassificationResult(len(markup.lines))
        num_lines = len(markup.lines)
        errors_table = ErrorsTable(num_lines)
        for l, line in enumerate(markup.lines):
            for metre_name, metre_pattern in MetreClassifier.metres.items():
                line_syllables_count = sum([len(word.syllables) for word in line.words])

                # Skip empty lines and lines longer than border_syllables_count syllables.
                if line_syllables_count > MetreClassifier.border_syllables_count or line_syllables_count == 0:
                    continue
                # Looser metres get a tighter error budget.
                error_border = 7
                if metre_name == "dolnik2" or metre_name == "dolnik3":
                    error_border = 3
                if metre_name == "taktovik2" or metre_name == "taktovik3":
                    error_border = 2
                pattern, strong_errors, weak_errors, analysis_errored = \
                    PatternAnalyzer.count_errors(MetreClassifier.metres[metre_name],
                                                 MetreClassifier.__get_line_pattern(line),
                                                 error_border)
                if analysis_errored or len(pattern) == 0:
                    errors_table.add_record(metre_name, l, strong_errors, weak_errors, pattern, True)
                    continue
                # Stress corrections needed to fit the pattern count as strong errors.
                corrections = MetreClassifier.__get_line_pattern_matching_corrections(line, l, pattern)[0]
                accentuation_errors = len(corrections)
                strong_errors += accentuation_errors
                errors_table.add_record(metre_name, l, strong_errors, weak_errors, pattern)
        result.metre = errors_table.get_best_metre()

        # Remember all the corrections for the chosen metre.
        for l, line in enumerate(markup.lines):
            pattern = errors_table.data[result.metre][l].pattern
            failed = errors_table.data[result.metre][l].failed
            if failed or len(pattern) == 0:
                continue
            corrections, resolutions, additions =\
                MetreClassifier.__get_line_pattern_matching_corrections(line, l, pattern)
            result.corrections[result.metre] += corrections
            result.resolutions[result.metre] += resolutions
            result.additions[result.metre] += additions
            result.errors_count[result.metre] += len(corrections)
        return result

    @staticmethod
    def __get_line_pattern(line: Line) -> str:
        """
        Build the stress pattern of a line: "S" for every stressed syllable,
        "U" otherwise; a word without syllables contributes a single "U".

        :param line: the line.
        :return: the pattern string.
        """
        pattern = ""
        for w, word in enumerate(line.words):
            if len(word.syllables) == 0:
                pattern += "U"
            else:
                for syllable in word.syllables:
                    if syllable.stress != -1:
                        pattern += "S"
                    else:
                        pattern += "U"
        return pattern

    @staticmethod
    def __get_line_pattern_matching_corrections(line: Line, line_number: int, pattern: str) \
            -> Tuple[List[StressCorrection], List[StressCorrection], List[StressCorrection]]:
        """
        A stress may fall on a weak position if no unstressed syllable of the
        same word lands on an ict (strong position). Otherwise it is an error.

        :param line: the line.
        :param line_number: line index.
        :param pattern: the metre pattern the line is matched against.
        :return: corrections, resolutions (resolved homography) and additions.
        """
        corrections = []
        resolutions = []
        additions = []
        number_in_pattern = 0
        for w, word in enumerate(line.words):
            # Words without syllables are skipped entirely.
            if len(word.syllables) == 0:
                continue
            # One-syllable words: only a missing stress on an ict is recorded.
            if len(word.syllables) == 1:
                if pattern[number_in_pattern].lower() == "s" and word.syllables[0].stress == -1:
                    additions.append(StressCorrection(line_number, w, 0, word.text, word.syllables[0].vowel()))
                number_in_pattern += len(word.syllables)
                continue
            stress_count = word.count_stresses()
            for syllable in word.syllables:
                if stress_count == 0 and pattern[number_in_pattern].lower() == "s":
                    # The word has no stresses: add whatever fits the metre. There may be several.
                    additions.append(StressCorrection(line_number, w, syllable.number, word.text, syllable.vowel()))
                elif pattern[number_in_pattern].lower() == "u" and syllable.stress != -1:
                    # A stress falls on this syllable, but the pattern expects an unstressed position.
                    # Find a syllable that sits on a stressed position in the pattern - that is our correction.
                    for other_syllable in word.syllables:
                        other_number_in_pattern = other_syllable.number - syllable.number + number_in_pattern
                        if syllable.number == other_syllable.number or pattern[other_number_in_pattern].lower() != "s":
                            continue
                        ac = StressCorrection(line_number, w, other_syllable.number, word.text, other_syllable.vowel())
                        if stress_count == 1 and other_syllable.stress == -1:
                            corrections.append(ac)
                        else:
                            resolutions.append(ac)
                number_in_pattern += 1
        return corrections, resolutions, additions

    @staticmethod
    def get_improved_markup(markup: Markup, result: ClassificationResult) -> Markup:
        """
        Improve the markup after metre classification by applying the
        collected stress corrections, resolutions and additions in place.

        :param markup: initial markup.
        :param result: classification result.
        :return: improved markup.
        """
        for pos in result.corrections[result.metre] + result.resolutions[result.metre]:
            syllables = markup.lines[pos.line_number].words[pos.word_number].syllables
            for i, syllable in enumerate(syllables):
                # Clear all stresses in the word; keep only the corrected one.
                syllable.stress = -1
                if syllable.number == pos.syllable_number:
                    syllable.stress = syllable.begin + get_first_vowel_position(syllable.text)
        for pos in result.additions[result.metre]:
            syllable = markup.lines[pos.line_number].words[pos.word_number].syllables[pos.syllable_number]
            syllable.stress = syllable.begin + get_first_vowel_position(syllable.text)

        return markup

    @staticmethod
    def improve_markup(markup: Markup) -> \
            Tuple[Markup, ClassificationResult]:
        """
        Improve the markup with the metre classifier.

        :param markup: initial markup.
        :return: improved markup and the classification result.
        """
        result = MetreClassifier.classify_metre(markup)
        improved_markup = MetreClassifier.get_improved_markup(markup, result)
        return improved_markup, result
320 |
--------------------------------------------------------------------------------
/rupo/data/examples/markup.json:
--------------------------------------------------------------------------------
1 | {
2 | "items": [
3 | {
4 | "version": 2,
5 | "text": "Забывши волнения жизни мятежной,\nОдин жил в пустыне рыбак молодой.\nОднажды на скале прибрежной,\nНад тихой прозрачной рекой\nОн с удой беспечно\nСидел\nИ думой сердечной\nК прошедшему счастью летел.",
6 | "lines": [
7 | {
8 | "words": [
9 | {
10 | "syllables": [
11 | {
12 | "begin": 0,
13 | "end": 2,
14 | "text": "За",
15 | "accent": -1,
16 | "number": 0
17 | },
18 | {
19 | "begin": 2,
20 | "end": 4,
21 | "text": "бы",
22 | "accent": -1,
23 | "number": 1
24 | },
25 | {
26 | "begin": 4,
27 | "end": 7,
28 | "text": "вши",
29 | "accent": 6,
30 | "number": 2
31 | }
32 | ],
33 | "end": 7,
34 | "text": "Забывши",
35 | "begin": 0
36 | },
37 | {
38 | "syllables": [
39 | {
40 | "begin": 0,
41 | "end": 2,
42 | "text": "во",
43 | "accent": -1,
44 | "number": 0
45 | },
46 | {
47 | "begin": 2,
48 | "end": 5,
49 | "text": "лне",
50 | "accent": 4,
51 | "number": 1
52 | },
53 | {
54 | "begin": 5,
55 | "end": 7,
56 | "text": "ни",
57 | "accent": -1,
58 | "number": 2
59 | },
60 | {
61 | "begin": 7,
62 | "end": 8,
63 | "text": "я",
64 | "accent": -1,
65 | "number": 3
66 | }
67 | ],
68 | "end": 16,
69 | "text": "волнения",
70 | "begin": 8
71 | },
72 | {
73 | "syllables": [
74 | {
75 | "begin": 0,
76 | "end": 2,
77 | "text": "жи",
78 | "accent": 1,
79 | "number": 0
80 | },
81 | {
82 | "begin": 2,
83 | "end": 5,
84 | "text": "зни",
85 | "accent": -1,
86 | "number": 1
87 | }
88 | ],
89 | "end": 22,
90 | "text": "жизни",
91 | "begin": 17
92 | },
93 | {
94 | "syllables": [
95 | {
96 | "begin": 0,
97 | "end": 2,
98 | "text": "мя",
99 | "accent": -1,
100 | "number": 0
101 | },
102 | {
103 | "begin": 2,
104 | "end": 4,
105 | "text": "те",
106 | "accent": 3,
107 | "number": 1
108 | },
109 | {
110 | "begin": 4,
111 | "end": 8,
112 | "text": "жной",
113 | "accent": -1,
114 | "number": 2
115 | }
116 | ],
117 | "end": 31,
118 | "text": "мятежной",
119 | "begin": 23
120 | }
121 | ],
122 | "end": 33,
123 | "text": "Забывши волнения жизни мятежной,",
124 | "begin": 0
125 | },
126 | {
127 | "words": [
128 | {
129 | "syllables": [
130 | {
131 | "begin": 0,
132 | "end": 1,
133 | "text": "О",
134 | "accent": -1,
135 | "number": 0
136 | },
137 | {
138 | "begin": 1,
139 | "end": 4,
140 | "text": "дин",
141 | "accent": 2,
142 | "number": 1
143 | }
144 | ],
145 | "end": 37,
146 | "text": "Один",
147 | "begin": 33
148 | },
149 | {
150 | "syllables": [
151 | {
152 | "begin": 0,
153 | "end": 3,
154 | "text": "жил",
155 | "accent": 1,
156 | "number": 0
157 | }
158 | ],
159 | "end": 41,
160 | "text": "жил",
161 | "begin": 38
162 | },
163 | {
164 | "syllables": [],
165 | "end": 43,
166 | "text": "в",
167 | "begin": 42
168 | },
169 | {
170 | "syllables": [
171 | {
172 | "begin": 0,
173 | "end": 2,
174 | "text": "пу",
175 | "accent": -1,
176 | "number": 0
177 | },
178 | {
179 | "begin": 2,
180 | "end": 5,
181 | "text": "сты",
182 | "accent": 4,
183 | "number": 1
184 | },
185 | {
186 | "begin": 5,
187 | "end": 7,
188 | "text": "не",
189 | "accent": -1,
190 | "number": 2
191 | }
192 | ],
193 | "end": 51,
194 | "text": "пустыне",
195 | "begin": 44
196 | },
197 | {
198 | "syllables": [
199 | {
200 | "begin": 0,
201 | "end": 2,
202 | "text": "ры",
203 | "accent": -1,
204 | "number": 0
205 | },
206 | {
207 | "begin": 2,
208 | "end": 5,
209 | "text": "бак",
210 | "accent": 3,
211 | "number": 1
212 | }
213 | ],
214 | "end": 57,
215 | "text": "рыбак",
216 | "begin": 52
217 | },
218 | {
219 | "syllables": [
220 | {
221 | "begin": 0,
222 | "end": 2,
223 | "text": "мо",
224 | "accent": -1,
225 | "number": 0
226 | },
227 | {
228 | "begin": 2,
229 | "end": 4,
230 | "text": "ло",
231 | "accent": -1,
232 | "number": 1
233 | },
234 | {
235 | "begin": 4,
236 | "end": 7,
237 | "text": "дой",
238 | "accent": 5,
239 | "number": 2
240 | }
241 | ],
242 | "end": 65,
243 | "text": "молодой",
244 | "begin": 58
245 | }
246 | ],
247 | "end": 67,
248 | "text": "Один жил в пустыне рыбак молодой.",
249 | "begin": 33
250 | },
251 | {
252 | "words": [
253 | {
254 | "syllables": [
255 | {
256 | "begin": 0,
257 | "end": 1,
258 | "text": "О",
259 | "accent": -1,
260 | "number": 0
261 | },
262 | {
263 | "begin": 1,
264 | "end": 4,
265 | "text": "дна",
266 | "accent": -1,
267 | "number": 1
268 | },
269 | {
270 | "begin": 4,
271 | "end": 7,
272 | "text": "жды",
273 | "accent": 6,
274 | "number": 2
275 | }
276 | ],
277 | "end": 74,
278 | "text": "Однажды",
279 | "begin": 67
280 | },
281 | {
282 | "syllables": [
283 | {
284 | "begin": 0,
285 | "end": 2,
286 | "text": "на",
287 | "accent": 1,
288 | "number": 0
289 | }
290 | ],
291 | "end": 77,
292 | "text": "на",
293 | "begin": 75
294 | },
295 | {
296 | "syllables": [
297 | {
298 | "begin": 0,
299 | "end": 3,
300 | "text": "ска",
301 | "accent": 2,
302 | "number": 0
303 | },
304 | {
305 | "begin": 3,
306 | "end": 5,
307 | "text": "ле",
308 | "accent": -1,
309 | "number": 1
310 | }
311 | ],
312 | "end": 83,
313 | "text": "скале",
314 | "begin": 78
315 | },
316 | {
317 | "syllables": [
318 | {
319 | "begin": 0,
320 | "end": 3,
321 | "text": "при",
322 | "accent": -1,
323 | "number": 0
324 | },
325 | {
326 | "begin": 3,
327 | "end": 6,
328 | "text": "бре",
329 | "accent": 5,
330 | "number": 1
331 | },
332 | {
333 | "begin": 6,
334 | "end": 10,
335 | "text": "жной",
336 | "accent": -1,
337 | "number": 2
338 | }
339 | ],
340 | "end": 94,
341 | "text": "прибрежной",
342 | "begin": 84
343 | }
344 | ],
345 | "end": 96,
346 | "text": "Однажды на скале прибрежной,",
347 | "begin": 67
348 | },
349 | {
350 | "words": [
351 | {
352 | "syllables": [
353 | {
354 | "begin": 0,
355 | "end": 3,
356 | "text": "Над",
357 | "accent": 1,
358 | "number": 0
359 | }
360 | ],
361 | "end": 99,
362 | "text": "Над",
363 | "begin": 96
364 | },
365 | {
366 | "syllables": [
367 | {
368 | "begin": 0,
369 | "end": 2,
370 | "text": "ти",
371 | "accent": 1,
372 | "number": 0
373 | },
374 | {
375 | "begin": 2,
376 | "end": 5,
377 | "text": "хой",
378 | "accent": -1,
379 | "number": 1
380 | }
381 | ],
382 | "end": 105,
383 | "text": "тихой",
384 | "begin": 100
385 | },
386 | {
387 | "syllables": [
388 | {
389 | "begin": 0,
390 | "end": 3,
391 | "text": "про",
392 | "accent": -1,
393 | "number": 0
394 | },
395 | {
396 | "begin": 3,
397 | "end": 6,
398 | "text": "зра",
399 | "accent": 5,
400 | "number": 1
401 | },
402 | {
403 | "begin": 6,
404 | "end": 10,
405 | "text": "чной",
406 | "accent": -1,
407 | "number": 2
408 | }
409 | ],
410 | "end": 116,
411 | "text": "прозрачной",
412 | "begin": 106
413 | },
414 | {
415 | "syllables": [
416 | {
417 | "begin": 0,
418 | "end": 2,
419 | "text": "ре",
420 | "accent": -1,
421 | "number": 0
422 | },
423 | {
424 | "begin": 2,
425 | "end": 5,
426 | "text": "кой",
427 | "accent": 3,
428 | "number": 1
429 | }
430 | ],
431 | "end": 122,
432 | "text": "рекой",
433 | "begin": 117
434 | }
435 | ],
436 | "end": 123,
437 | "text": "Над тихой прозрачной рекой",
438 | "begin": 96
439 | },
440 | {
441 | "words": [
442 | {
443 | "syllables": [
444 | {
445 | "begin": 0,
446 | "end": 2,
447 | "text": "Он",
448 | "accent": 0,
449 | "number": 0
450 | }
451 | ],
452 | "end": 125,
453 | "text": "Он",
454 | "begin": 123
455 | },
456 | {
457 | "syllables": [],
458 | "end": 127,
459 | "text": "с",
460 | "begin": 126
461 | },
462 | {
463 | "syllables": [
464 | {
465 | "begin": 0,
466 | "end": 1,
467 | "text": "у",
468 | "accent": -1,
469 | "number": 0
470 | },
471 | {
472 | "begin": 1,
473 | "end": 4,
474 | "text": "дой",
475 | "accent": 2,
476 | "number": 1
477 | }
478 | ],
479 | "end": 132,
480 | "text": "удой",
481 | "begin": 128
482 | },
483 | {
484 | "syllables": [
485 | {
486 | "begin": 0,
487 | "end": 2,
488 | "text": "бе",
489 | "accent": -1,
490 | "number": 0
491 | },
492 | {
493 | "begin": 2,
494 | "end": 5,
495 | "text": "спе",
496 | "accent": 4,
497 | "number": 1
498 | },
499 | {
500 | "begin": 5,
501 | "end": 8,
502 | "text": "чно",
503 | "accent": -1,
504 | "number": 2
505 | }
506 | ],
507 | "end": 141,
508 | "text": "беспечно",
509 | "begin": 133
510 | }
511 | ],
512 | "end": 142,
513 | "text": "Он с удой беспечно",
514 | "begin": 123
515 | },
516 | {
517 | "words": [
518 | {
519 | "syllables": [
520 | {
521 | "begin": 0,
522 | "end": 2,
523 | "text": "Си",
524 | "accent": -1,
525 | "number": 0
526 | },
527 | {
528 | "begin": 2,
529 | "end": 5,
530 | "text": "дел",
531 | "accent": 3,
532 | "number": 1
533 | }
534 | ],
535 | "end": 147,
536 | "text": "Сидел",
537 | "begin": 142
538 | }
539 | ],
540 | "end": 148,
541 | "text": "Сидел",
542 | "begin": 142
543 | },
544 | {
545 | "words": [
546 | {
547 | "syllables": [
548 | {
549 | "begin": 0,
550 | "end": 1,
551 | "text": "И",
552 | "accent": 0,
553 | "number": 0
554 | }
555 | ],
556 | "end": 149,
557 | "text": "И",
558 | "begin": 148
559 | },
560 | {
561 | "syllables": [
562 | {
563 | "begin": 0,
564 | "end": 2,
565 | "text": "ду",
566 | "accent": 1,
567 | "number": 0
568 | },
569 | {
570 | "begin": 2,
571 | "end": 5,
572 | "text": "мой",
573 | "accent": -1,
574 | "number": 1
575 | }
576 | ],
577 | "end": 155,
578 | "text": "думой",
579 | "begin": 150
580 | },
581 | {
582 | "syllables": [
583 | {
584 | "begin": 0,
585 | "end": 3,
586 | "text": "сер",
587 | "accent": -1,
588 | "number": 0
589 | },
590 | {
591 | "begin": 3,
592 | "end": 5,
593 | "text": "де",
594 | "accent": 4,
595 | "number": 1
596 | },
597 | {
598 | "begin": 5,
599 | "end": 9,
600 | "text": "чной",
601 | "accent": -1,
602 | "number": 2
603 | }
604 | ],
605 | "end": 165,
606 | "text": "сердечной",
607 | "begin": 156
608 | }
609 | ],
610 | "end": 166,
611 | "text": "И думой сердечной",
612 | "begin": 148
613 | },
614 | {
615 | "words": [
616 | {
617 | "syllables": [],
618 | "end": 167,
619 | "text": "К",
620 | "begin": 166
621 | },
622 | {
623 | "syllables": [
624 | {
625 | "begin": 0,
626 | "end": 3,
627 | "text": "про",
628 | "accent": -1,
629 | "number": 0
630 | },
631 | {
632 | "begin": 3,
633 | "end": 5,
634 | "text": "ше",
635 | "accent": 4,
636 | "number": 1
637 | },
638 | {
639 | "begin": 5,
640 | "end": 8,
641 | "text": "дше",
642 | "accent": -1,
643 | "number": 2
644 | },
645 | {
646 | "begin": 8,
647 | "end": 10,
648 | "text": "му",
649 | "accent": -1,
650 | "number": 3
651 | }
652 | ],
653 | "end": 178,
654 | "text": "прошедшему",
655 | "begin": 168
656 | },
657 | {
658 | "syllables": [
659 | {
660 | "begin": 0,
661 | "end": 3,
662 | "text": "сча",
663 | "accent": 2,
664 | "number": 0
665 | },
666 | {
667 | "begin": 3,
668 | "end": 7,
669 | "text": "стью",
670 | "accent": -1,
671 | "number": 1
672 | }
673 | ],
674 | "end": 186,
675 | "text": "счастью",
676 | "begin": 179
677 | },
678 | {
679 | "syllables": [
680 | {
681 | "begin": 0,
682 | "end": 2,
683 | "text": "ле",
684 | "accent": -1,
685 | "number": 0
686 | },
687 | {
688 | "begin": 2,
689 | "end": 5,
690 | "text": "тел",
691 | "accent": 3,
692 | "number": 1
693 | }
694 | ],
695 | "end": 192,
696 | "text": "летел",
697 | "begin": 187
698 | }
699 | ],
700 | "end": 193,
701 | "text": "К прошедшему счастью летел.",
702 | "begin": 166
703 | }
704 | ]
705 | }
706 | ]
707 | }
--------------------------------------------------------------------------------