├── rupo
├── __init__.py
├── dict
│ ├── __init__.py
│ ├── zaliznyak.py
│ ├── wiki.py
│ └── cmu.py
├── files
│ ├── __init__.py
│ ├── test_writer.py
│ ├── test_reader.py
│ ├── writer.py
│ └── reader.py
├── main
│ ├── __init__.py
│ ├── test_vocabulary.py
│ ├── test_markup.py
│ ├── morph.py
│ ├── test_tokenizer.py
│ ├── vocabulary.py
│ ├── tokenizer.py
│ └── markup.py
├── metre
│ ├── __init__.py
│ ├── test_pattern_analyzer.py
│ ├── test_metre_classifier.py
│ ├── pattern_analyzer.py
│ └── metre_classifier.py
├── rhymes
│ ├── __init__.py
│ ├── test_rhymes.py
│ └── rhymes.py
├── stress
│ ├── __init__.py
│ ├── test_dict.py
│ ├── test_predictor.py
│ ├── word.py
│ ├── predictor.py
│ └── dict.py
├── util
│ ├── __init__.py
│ ├── timeit.py
│ ├── tqdm_open.py
│ ├── data.py
│ ├── mixins.py
│ └── preprocess.py
├── generate
│ ├── __init__.py
│ ├── generator.py
│ └── transforms.py
├── data
│ └── examples
│ │ ├── text.txt
│ │ ├── text.xml
│ │ ├── morph_markup.txt
│ │ ├── markup.xml
│ │ └── markup.json
├── settings.py
├── test_api.py
└── api.py
├── setup.cfg
├── docs
├── source
│ ├── modules.rst
│ ├── index.rst
│ ├── rupo.rhymes.rst
│ ├── rupo.util.rst
│ ├── rupo.rst
│ ├── rupo.files.rst
│ ├── rupo.metre.rst
│ ├── rupo.main.rst
│ ├── rupo.generate.rst
│ └── conf.py
└── Makefile
├── .gitignore
├── .gitattributes
├── .codeclimate.yml
├── requirements.txt
├── download.sh
├── .travis.yml
├── generate_poem.py
├── setup.py
├── README.md
└── LICENSE
/rupo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/dict/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/files/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/main/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/metre/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/rhymes/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/stress/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/util/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rupo/generate/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
--------------------------------------------------------------------------------
/docs/source/modules.rst:
--------------------------------------------------------------------------------
1 | rupo
2 | ====
3 |
4 | .. toctree::
5 | :maxdepth: 4
6 |
7 | rupo
8 | setup
9 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.pyc
4 | *~
5 | .idea
6 | *.trie
7 | *.pickle
8 | *.h5
9 | dist/
10 | rupo-*
11 | rupo.*
12 | rupo/data/generator_models/
--------------------------------------------------------------------------------
/rupo/data/examples/text.txt:
--------------------------------------------------------------------------------
1 | Забывши волнения жизни мятежной,
2 | Один жил в пустыне рыбак молодой.
3 | Однажды на скале прибрежной,
4 | Над тихой прозрачной рекой
5 | Он с удой беспечно
6 | Сидел
7 | И думой сердечной
8 | К прошедшему счастью летел.
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 |
2 | rupo/data/stress_models/stress_ru_word30_LSTM256_dropout0.4_acc99_wer3.h5 filter=lfs diff=lfs merge=lfs -text
3 | rupo/data/g2p_models/g2p_ru_maxlen40_BLSTM256_BLSTM256_dropout0.2_acc992_wer140.h5 filter=lfs diff=lfs merge=lfs -text
4 |
--------------------------------------------------------------------------------
/rupo/util/timeit.py:
--------------------------------------------------------------------------------
import functools
import logging
import time
3 |
4 |
def timeit(method):
    """Decorator that logs the wall-clock running time of *method*.

    :param method: callable to wrap.
    :return: wrapper that forwards all arguments and the return value,
        logging the elapsed time at DEBUG level.
    """
    @functools.wraps(method)  # preserve __name__/__doc__ of the wrapped callable
    def timed(*args, **kw):
        start = time.time()
        result = method(*args, **kw)
        elapsed = time.time() - start
        # Lazy %-args: message is only formatted if DEBUG logging is enabled.
        logging.debug('%s %2.2f sec', method.__name__, elapsed)
        return result
    return timed
--------------------------------------------------------------------------------
/.codeclimate.yml:
--------------------------------------------------------------------------------
1 | ---
2 | engines:
3 | csslint:
4 | enabled: false
5 | duplication:
6 | enabled: true
7 | config:
8 | languages:
9 | - python
10 | eslint:
11 | enabled: false
12 | fixme:
13 | enabled: true
14 | radon:
15 | enabled: true
16 | ratings:
17 | paths:
18 | - "**.py"
19 | exclude_paths: []
20 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | dicttoxml >= 1.7.4
2 | pygtrie >= 2.2
3 | numpy >= 1.12.1
4 | scipy >= 0.19.0
5 | scikit-learn >= 0.18.1
6 | jsonpickle >= 0.9.4
7 | pymorphy2 >= 0.8
8 | h5py >= 2.7.0
9 | russian-tagsets == 0.6
10 | tqdm >= 4.14.0
12 | rnnmorph >= 0.2.3
13 | sentence_splitter >= 1.2
14 | allennlp == 0.9.0
15 | overrides == 3.0.0
16 | git+https://github.com/IlyaGusev/rulm.git@4e78a49
17 | git+https://github.com/IlyaGusev/russ.git@288fe6a
18 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. rupo documentation master file, created by
2 | sphinx-quickstart on Mon Jul 24 20:49:37 2017.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to rupo's documentation!
7 | ================================
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 | :caption: Contents:
12 |
13 |
14 |
15 | Indices and tables
16 | ==================
17 |
18 | * :ref:`genindex`
19 | * :ref:`modindex`
20 | * :ref:`search`
21 |
--------------------------------------------------------------------------------
/rupo/rhymes/test_rhymes.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты для модуля рифм.
4 |
5 | import unittest
6 |
7 | from rupo.stress.word import StressedWord, Stress
8 | from rupo.rhymes.rhymes import Rhymes
9 |
10 |
class TestRhymes(unittest.TestCase):
    """Tests for the rhyme detection module."""

    def test_rhyme(self):
        rhyming_pair = (StressedWord("братишь", {Stress(4)}),
                        StressedWord("грустишь", {Stress(5)}))
        self.assertTrue(Rhymes.is_rhyme(*rhyming_pair))
        non_rhyming_pair = (StressedWord("наизусть", {Stress(4)}),
                            StressedWord("сестра", {Stress(5)}))
        self.assertFalse(Rhymes.is_rhyme(*non_rhyming_pair))
--------------------------------------------------------------------------------
/rupo/data/examples/text.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | -
4 |
5 | Михаил Лермонтов
6 | 1829
7 | 1829
8 | Забывши волнения жизни мятежной,
9 | Один жил в пустыне рыбак молодой.
10 | Однажды на скале прибрежной,
11 | Над тихой прозрачной рекой
12 | Он с удой беспечно
13 | Сидел
14 | И думой сердечной
15 | К прошедшему счастью летел.
16 | Забывши волнения жизни мятежной...
17 | 1829
18 |
19 |
--------------------------------------------------------------------------------
/docs/source/rupo.rhymes.rst:
--------------------------------------------------------------------------------
1 | rupo.rhymes package
2 | ===================
3 |
4 | Submodules
5 | ----------
6 |
7 | rupo.rhymes.rhymes module
8 | -------------------------
9 |
10 | .. automodule:: rupo.rhymes.rhymes
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | rupo.rhymes.test_rhymes module
16 | ------------------------------
17 |
18 | .. automodule:: rupo.rhymes.test_rhymes
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 |
24 | Module contents
25 | ---------------
26 |
27 | .. automodule:: rupo.rhymes
28 | :members:
29 | :undoc-members:
30 | :show-inheritance:
31 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = rupo
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/download.sh:
--------------------------------------------------------------------------------
1 | wget https://www.dropbox.com/s/dwkui2xqivzsyw5/generator_model.zip
2 | mkdir -p ./rupo/data/generator_models
3 | unzip generator_model.zip -d ./rupo/data/generator_models
4 | rm generator_model.zip
5 |
6 | wget https://www.dropbox.com/s/ajd8b7lpqaao7xt/stress_ru_main.tar.gz
7 | mkdir -p ./rupo/data/stress_models/ru_main
8 | tar -xzvf stress_ru_main.tar.gz --directory ./rupo/data/stress_models/ru_main
9 | rm stress_ru_main.tar.gz
10 |
11 | wget https://www.dropbox.com/s/7rk135fzd3i8kfw/g2p_models.zip
12 | mkdir -p ./rupo/data/g2p_models
13 | unzip g2p_models.zip -d ./rupo/data/g2p_models
14 | rm g2p_models.zip
15 |
16 | wget https://www.dropbox.com/s/znqlrb1xblh3amo/dict.zip
17 | mkdir -p ./rupo/data/dict
18 | unzip dict.zip -d ./rupo/data/dict
19 | rm dict.zip
20 |
--------------------------------------------------------------------------------
/docs/source/rupo.util.rst:
--------------------------------------------------------------------------------
1 | rupo.util package
2 | =================
3 |
4 | Submodules
5 | ----------
6 |
7 | rupo.util.data module
8 | ---------------------
9 |
10 | .. automodule:: rupo.util.data
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | rupo.util.mixins module
16 | -----------------------
17 |
18 | .. automodule:: rupo.util.mixins
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | rupo.util.preprocess module
24 | ---------------------------
25 |
26 | .. automodule:: rupo.util.preprocess
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 |
32 | Module contents
33 | ---------------
34 |
35 | .. automodule:: rupo.util
36 | :members:
37 | :undoc-members:
38 | :show-inheritance:
39 |
--------------------------------------------------------------------------------
/rupo/main/test_vocabulary.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты словаря.
4 |
5 | import os
6 | import unittest
7 |
8 | from rupo.main.vocabulary import StressVocabulary
9 | from rupo.settings import EXAMPLES_DIR, MARKUP_XML_EXAMPLE
10 |
11 |
class TestVocabulary(unittest.TestCase):
    """Tests for StressVocabulary parsing and serialization."""

    def test_vocabulary(self):
        dump_file = os.path.join(EXAMPLES_DIR, "temp.pickle")
        vocabulary = StressVocabulary()
        vocabulary.parse(MARKUP_XML_EXAMPLE)
        vocabulary.save(dump_file)
        self.assertTrue(os.path.exists(dump_file))
        os.remove(dump_file)
        try:
            # assertIsNotNone instead of assertTrue(... is not None): clearer failure message.
            self.assertIsNotNone(vocabulary.get_word(0))
        except IndexError:
            # self.fail instead of assertTrue(False): states intent explicitly.
            self.fail("vocabulary parsed from the example markup should contain at least one word")
24 |
--------------------------------------------------------------------------------
/docs/source/rupo.rst:
--------------------------------------------------------------------------------
1 | rupo package
2 | ============
3 |
4 | Subpackages
5 | -----------
6 |
7 | .. toctree::
8 |
9 | rupo.dict
10 | rupo.files
11 | rupo.g2p
12 | rupo.generate
13 | rupo.main
14 | rupo.metre
15 | rupo.morph
16 | rupo.rhymes
17 | rupo.stress
18 | rupo.util
19 |
20 | Submodules
21 | ----------
22 |
23 | rupo.api module
24 | ---------------
25 |
26 | .. automodule:: rupo.api
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | rupo.settings module
32 | --------------------
33 |
34 | .. automodule:: rupo.settings
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 | rupo.test_api module
40 | --------------------
41 |
42 | .. automodule:: rupo.test_api
43 | :members:
44 | :undoc-members:
45 | :show-inheritance:
46 |
47 |
48 | Module contents
49 | ---------------
50 |
51 | .. automodule:: rupo
52 | :members:
53 | :undoc-members:
54 | :show-inheritance:
55 |
--------------------------------------------------------------------------------
/docs/source/rupo.files.rst:
--------------------------------------------------------------------------------
1 | rupo.files package
2 | ==================
3 |
4 | Submodules
5 | ----------
6 |
7 | rupo.files.reader module
8 | ------------------------
9 |
10 | .. automodule:: rupo.files.reader
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | rupo.files.test_reader module
16 | -----------------------------
17 |
18 | .. automodule:: rupo.files.test_reader
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | rupo.files.test_writer module
24 | -----------------------------
25 |
26 | .. automodule:: rupo.files.test_writer
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | rupo.files.writer module
32 | ------------------------
33 |
34 | .. automodule:: rupo.files.writer
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 |
40 | Module contents
41 | ---------------
42 |
43 | .. automodule:: rupo.files
44 | :members:
45 | :undoc-members:
46 | :show-inheritance:
47 |
--------------------------------------------------------------------------------
/rupo/util/tqdm_open.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Авторы: Анастасьев Даниил
3 | # Описание: Обертка открытия больших файлов в счетчик tqdm
4 |
5 | from contextlib import contextmanager
6 | from os.path import getsize, basename
7 | from tqdm import tqdm
8 |
9 |
@contextmanager
def tqdm_open(filename: str, encoding: str = 'utf8'):
    """
    Open a text file with its line iterator wrapped in a tqdm progress bar.

    Yields an iterator over the file's lines; the bar's total is the file
    size in bytes and it is updated at most once per ~1 MiB of consumed
    input to keep per-line overhead low.

    :param filename: path of the file to open.
    :param encoding: text encoding passed to ``open``.
    """
    # File size in bytes defines the progress bar's 100% mark.
    total = getsize(filename)

    def wrapped_line_iterator(fd):
        with tqdm(total=total, unit="B", unit_scale=True, desc=basename(filename), miniters=1) as pb:
            processed_bytes = 0
            for line in fd:
                # NOTE(review): len(line) counts characters, not encoded bytes,
                # so the bar may undercount for multi-byte encodings — confirm intent.
                processed_bytes += len(line)
                # Flush accumulated progress roughly once per mebibyte.
                if processed_bytes >= 1024 * 1024:
                    pb.update(processed_bytes)
                    processed_bytes = 0
                yield line
            # Account for the remainder after the final line.
            pb.update(processed_bytes)

    with open(filename, encoding=encoding) as fd:
        yield wrapped_line_iterator(fd)
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | dist: xenial
2 | sudo: required
3 | language: python
4 | python: 3.6
5 | before_script:
6 | - git lfs pull
7 | install:
8 | - pip install --upgrade pip setuptools wheel
9 | - pip install -r requirements.txt
10 | - sh download.sh
11 | script:
12 | - pytest
13 | deploy:
14 | provider: pypi
15 | user: Phoenix120
16 | password:
17 | secure: ueaFMBiVlNSPmwivJ0uMGJw1ntj6nTuCqIuxYw0IXQDUebPkU6QZLH3o8k59BgD1o5+7yXmpWQHXb7B5UYr0pBC/4wEutatoGLzDmcTv1DaYn7kzrv5PTBSQVEvCQNzA8jNog0j6Ljg9Z7CN3H/vGIIdPRt1Gxmu0dPCrX3rGMlwKZLH5/gRaZlbgxtov/UGfIUEOgJmM1eJvZYS8Y5InmxlUBJmT0U1QDe1cBooax43KlspQzCJSJ6NciMGXSZUi5nPSb9sKbqvbOjRnCydcazeQwoRf14qIwFS3b7nL4TLb+rRSHFKuOJ9cmnAF+f5qo0ytJuYZqo3dNS8LqwuJH0tXyO4fo5T7Xe2k7eIfla4mg1T+uss5zIM0ttfW/ApKQanAr2kZ/tMl6ywWkWLJ1crSYM9RjUewZw8Z1qwYbEDJrcWIBZxkyPfEkzilgjAvlf4rmEUR3eJtm2YBgoz5XiNR2sdTeRFUgAcZUyC7nx+N15FJgw1HTtZeqbedPGgq84sMk31OxGfpGDJQ9iHqHavvTdxiRjA8YLNlxAeZ+Upop6zLznUM7iE742tNAjqjaXhGR6128Viggn4hL2PZuFYlmRx5Rt7LhCr1OgKViodNJwyoWZTFDl3p+b6GoJai9FPIewX6nmfQTAeYweFM8yz38akC8v21P3/kNeJ2/w=
18 | on:
19 | tags: true
20 |
--------------------------------------------------------------------------------
/docs/source/rupo.metre.rst:
--------------------------------------------------------------------------------
1 | rupo.metre package
2 | ==================
3 |
4 | Submodules
5 | ----------
6 |
7 | rupo.metre.metre_classifier module
8 | ----------------------------------
9 |
10 | .. automodule:: rupo.metre.metre_classifier
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | rupo.metre.pattern_analyzer module
16 | ----------------------------------
17 |
18 | .. automodule:: rupo.metre.pattern_analyzer
19 |     :members:
20 |     :undoc-members:
21 |     :show-inheritance:
22 |
23 | rupo.metre.test_metre_classifier module
24 | ---------------------------------------
25 |
26 | .. automodule:: rupo.metre.test_metre_classifier
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | rupo.metre.test_pattern_analyzer module
32 | ----------------------------------------
33 |
34 | .. automodule:: rupo.metre.test_pattern_analyzer
35 |     :members:
36 |     :undoc-members:
37 |     :show-inheritance:
38 |
39 |
40 | Module contents
41 | ---------------
42 |
43 | .. automodule:: rupo.metre
44 | :members:
45 | :undoc-members:
46 | :show-inheritance:
47 |
--------------------------------------------------------------------------------
/rupo/files/test_writer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты записи разметок.
4 |
5 | import unittest
6 | import os
7 |
8 | from rupo.main.markup import Markup
9 | from rupo.files.writer import Writer
10 | from rupo.files.reader import Reader, FileType
11 | from rupo.util.data import MARKUP_EXAMPLE
12 | from rupo.settings import EXAMPLES_DIR
13 |
14 |
class TestWriter(unittest.TestCase):
    """Round-trip tests for markup writing."""

    def test_write(self):
        markup = MARKUP_EXAMPLE

        xml_path = os.path.join(EXAMPLES_DIR, "temp.xml")
        Writer.write_markups(FileType.XML, [markup], xml_path)
        xml_markups = Reader.read_markups(xml_path, FileType.XML, is_processed=True)
        self.assertEqual(next(xml_markups), markup)
        xml_markups.close()
        os.remove(xml_path)

        raw_path = os.path.join(EXAMPLES_DIR, "temp.txt")
        Writer.write_markups(FileType.RAW, [markup], raw_path)
        raw_markups = Reader.read_markups(raw_path, FileType.RAW, is_processed=True)
        self.assertIsInstance(next(raw_markups), Markup)
        raw_markups.close()
        os.remove(raw_path)
--------------------------------------------------------------------------------
/rupo/util/data.py:
--------------------------------------------------------------------------------
1 | from rupo.main.markup import Markup, Line, Word, Syllable
2 |
# Hand-built markup fixture shared by the reader/writer and markup tests.
# Line/Word arguments look like (begin, end, text, children) with offsets into
# the full text; Syllable arguments look like (begin, end, number, text[, stress])
# with offsets relative to the word — TODO(review): confirm against
# rupo.main.markup's class definitions.
MARKUP_EXAMPLE = Markup("Соломка король себя.\n Пора виться майкой в.", [
    Line(0, 20, "Соломка король себя.", [
        Word(0, 7, "Соломка",
             [Syllable(0, 2, 0, "Со"),
              Syllable(2, 5, 1, "лом", 3),
              Syllable(5, 7, 2, "ка")]),
        Word(8, 14, "король",
             [Syllable(0, 2, 0, "ко"),
              Syllable(2, 6, 1, "роль", 3)]),
        Word(15, 19, "себя",
             [Syllable(0, 2, 0, "се"),
              Syllable(2, 4, 1, "бя", 3)])]),
    Line(21, 43, " Пора виться майкой в.",[
        Word(22, 26, "Пора",
             [Syllable(0, 2, 0, "По", 1),
              Syllable(2, 4, 1, "ра", 3)]),
        Word(27, 33, "виться",
             [Syllable(0, 2, 0, "ви", 1),
              Syllable(2, 6, 1, "ться")]),
        Word(34, 40, "майкой",
             [Syllable(0, 3, 0, "май", 1),
              Syllable(3, 6, 1, "кой")]),
        Word(41, 42, "в", [])
    ])])
--------------------------------------------------------------------------------
/rupo/dict/zaliznyak.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
class ZalyzniakDict:
    """Conversion helpers for a Zaliznyak-style dictionary file."""

    @staticmethod
    def convert_to_accent_only(dict_file, accent_file):
        """
        Convert a raw dictionary to a tab-separated stress file.

        Each input line has word forms after a "#", comma-separated; "'" marks
        a primary stress and "`" a secondary stress on the preceding letter,
        and "ё" is always treated as primary-stressed. Forms without any
        primary stress are skipped.
        """
        with open(dict_file, 'r', encoding='utf-8') as reader:
            lines = reader.readlines()
        with open(accent_file, 'w', encoding='utf-8') as writer:
            for line in lines:
                for raw_form in line.split("#")[1].split(","):
                    raw_form = raw_form.strip()
                    clean_word = ""
                    primary, secondary = [], []
                    pos = -1
                    for ch in raw_form:
                        if ch in ("'", "`"):
                            # Stress mark applies to the letter just consumed.
                            (secondary if ch == "`" else primary).append(pos)
                            continue
                        clean_word += ch
                        pos += 1
                        if ch == "ё":
                            primary.append(pos)
                    if primary:
                        record = "\t".join((clean_word,
                                            ",".join(str(p) for p in primary),
                                            ",".join(str(s) for s in secondary)))
                        writer.write(record + "\n")
--------------------------------------------------------------------------------
/docs/source/rupo.main.rst:
--------------------------------------------------------------------------------
1 | rupo.main package
2 | =================
3 |
4 | Submodules
5 | ----------
6 |
7 | rupo.main.markup module
8 | -----------------------
9 |
10 | .. automodule:: rupo.main.markup
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | rupo.main.test_markup module
16 | ----------------------------
17 |
18 | .. automodule:: rupo.main.test_markup
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | rupo.main.test_tokenizer module
24 | -------------------------------
25 |
26 | .. automodule:: rupo.main.test_tokenizer
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | rupo.main.test_vocabulary module
32 | --------------------------------
33 |
34 | .. automodule:: rupo.main.test_vocabulary
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 | rupo.main.tokenizer module
40 | --------------------------
41 |
42 | .. automodule:: rupo.main.tokenizer
43 | :members:
44 | :undoc-members:
45 | :show-inheritance:
46 |
47 | rupo.main.vocabulary module
48 | ---------------------------
49 |
50 | .. automodule:: rupo.main.vocabulary
51 | :members:
52 | :undoc-members:
53 | :show-inheritance:
54 |
55 |
56 | Module contents
57 | ---------------
58 |
59 | .. automodule:: rupo.main
60 | :members:
61 | :undoc-members:
62 | :show-inheritance:
63 |
--------------------------------------------------------------------------------
/generate_poem.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from rupo.api import Engine
4 | from rupo.settings import RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT, GENERATOR_MODEL_DIR
5 |
6 |
if __name__ == "__main__":
    # CLI for batch poem generation; each argument maps 1:1 onto an
    # Engine.generate_poem keyword, except --count (number of attempts).
    parser = argparse.ArgumentParser()
    for flag, arg_type, default in (
            ('--model-path', str, GENERATOR_MODEL_DIR),
            ('--token-vocab-path', str, None),
            ('--stress-vocab-path', str, None),
            ('--metre-schema', str, '+-'),
            ('--rhyme-pattern', str, 'abab'),
            ('--n-syllables', int, 8),
            ('--sampling-k', int, 50000),
            ('--beam-width', int, None),
            ('--temperature', float, 1.0),
            ('--last-text', str, ""),
            ('--count', int, 100)):
        parser.add_argument(flag, type=arg_type, default=default)

    params = vars(parser.parse_args())
    poem_count = params.pop('count')

    engine = Engine()
    engine.load(RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT)
    for seed in range(poem_count):
        print(seed)
        try:
            # Generation can fail for some seeds; report and move on.
            print(engine.generate_poem(seed=seed, **params))
        except AssertionError as error:
            print("Error: ", error)
34 |
--------------------------------------------------------------------------------
/rupo/stress/test_dict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты для словаря ударений.
4 |
5 | import unittest
6 |
7 | from rupo.stress.dict import StressDict
8 | from rupo.stress.word import Stress, StressedWord
9 | from rupo.util.preprocess import VOWELS
10 | from rupo.settings import RU_GRAPHEME_STRESS_PATH, ZALYZNYAK_DICT, RU_GRAPHEME_STRESS_TRIE_PATH
11 |
12 |
class TestStressDict(unittest.TestCase):
    """Tests for the stress dictionary."""

    @classmethod
    def setUpClass(cls):
        cls.dict = StressDict(
            language="ru",
            zalyzniak_dict=ZALYZNYAK_DICT,
            raw_dict_path=RU_GRAPHEME_STRESS_PATH,
            trie_path=RU_GRAPHEME_STRESS_TRIE_PATH)

    @classmethod
    def tearDownClass(cls):
        del cls.dict

    def test_get_stresses(self):
        cases = (
            ("данный", Stress.Type.PRIMARY, [1]),
            ("союза", Stress.Type.PRIMARY, [2]),
            ("англосакс", Stress.Type.SECONDARY, [0]),
            ("англосакс", Stress.Type.ANY, [0, 6]),
            ("пора", Stress.Type.PRIMARY, [1, 3]),
        )
        for word, stress_type, positions in cases:
            self.assertCountEqual(self.dict.get_stresses(word, stress_type), positions)

    def test_stress_only_in_vowels(self):
        # Every recorded stress position must land on a vowel letter.
        for word, stresses in self.dict.get_all():
            for stress in stresses:
                self.assertIn(word[stress.position], VOWELS)
35 |
--------------------------------------------------------------------------------
/rupo/main/test_markup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты для разметки.
4 |
5 | import unittest
6 |
7 | from rupo.util.data import MARKUP_EXAMPLE
8 | from rupo.main.markup import Markup
9 | from rupo.stress.predictor import CombinedStressPredictor
10 | from rupo.settings import RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT, CMU_DICT, \
11 | RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH
12 |
13 |
class TestMarkup(unittest.TestCase):
    """Round-trip and end-to-end tests for Markup."""

    @classmethod
    def setUpClass(cls):
        cls.stress_predictor = CombinedStressPredictor(
            stress_model_path=RU_STRESS_DEFAULT_MODEL,
            zalyzniak_dict=ZALYZNYAK_DICT,
            cmu_dict=CMU_DICT,
            raw_stress_dict_path=RU_GRAPHEME_STRESS_PATH,
            stress_trie_path=RU_GRAPHEME_STRESS_TRIE_PATH
        )

    @classmethod
    def tearDownClass(cls):
        del cls.stress_predictor

    def test_from_to(self):
        # XML and JSON serialization must round-trip losslessly.
        xml_dump = MARKUP_EXAMPLE.to_xml()
        self.assertEqual(MARKUP_EXAMPLE, Markup().from_xml(xml_dump))
        json_dump = MARKUP_EXAMPLE.to_json()
        self.assertEqual(MARKUP_EXAMPLE, Markup().from_json(json_dump))

    def test_process_text(self):
        source_text = "Соломка король себя.\n Пора виться майкой в."
        processed = Markup.process_text(source_text, self.stress_predictor)
        self.assertEqual(processed.to_json(), MARKUP_EXAMPLE.to_json())
39 |
40 |
--------------------------------------------------------------------------------
/rupo/files/test_reader.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты считывателя разметок.
4 |
5 | import unittest
6 |
7 | from rupo.files.reader import Reader, FileType
8 | from rupo.stress.predictor import CombinedStressPredictor
9 | from rupo.main.markup import Markup, Line, Word
10 | from rupo.settings import MARKUP_XML_EXAMPLE, TEXT_XML_EXAMPLE, MARKUP_JSON_EXAMPLE
11 |
12 |
class TestReader(unittest.TestCase):
    """Tests for the markup reader."""

    @classmethod
    def setUpClass(cls):
        cls.stress_predictor = CombinedStressPredictor()

    def test_read(self):
        # Generators are lazy, so reading still happens in this order.
        markup_generators = (
            Reader.read_markups(MARKUP_XML_EXAMPLE, FileType.XML, is_processed=True),
            Reader.read_markups(TEXT_XML_EXAMPLE, FileType.XML, is_processed=False,
                                stress_predictor=self.stress_predictor),
            Reader.read_markups(MARKUP_JSON_EXAMPLE, FileType.JSON, is_processed=True),
        )
        for markup_generator in markup_generators:
            self.__check_markup(next(markup_generator))

    def __check_markup(self, markup):
        # A well-formed markup has text and at least one line of words.
        self.assertIsInstance(markup, Markup)
        self.assertIsNotNone(markup.text)
        self.assertNotEqual(markup.text, "")
        self.assertNotEqual(markup.lines, [])
        self.assertIsInstance(markup.lines[0], Line)
        self.assertIsInstance(markup.lines[0].words[0], Word)
36 |
--------------------------------------------------------------------------------
/rupo/stress/test_predictor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты предсказателя ударений.
4 |
5 | import unittest
6 |
7 | from rupo.stress.predictor import CombinedStressPredictor
8 | from rupo.settings import RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT, CMU_DICT, \
9 | RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH
10 |
11 |
class TestStressPredictor(unittest.TestCase):
    """Tests for the combined stress predictor."""

    @classmethod
    def setUpClass(cls):
        cls.stress_predictor = CombinedStressPredictor(
            stress_model_path=RU_STRESS_DEFAULT_MODEL,
            zalyzniak_dict=ZALYZNYAK_DICT,
            cmu_dict=CMU_DICT,
            raw_stress_dict_path=RU_GRAPHEME_STRESS_PATH,
            stress_trie_path=RU_GRAPHEME_STRESS_TRIE_PATH
        )

    def test_stress(self):
        # word -> expected stressed letter positions (order-insensitive).
        checks = {
            'я': [0],
            'в': [],
            'он': [0],
            'майка': [1],
            'соломка': [3],
            'изжить': [3],
            'виться': [1],
            'данный': [1],
            'зорька': [1],
            'банка': [1],
            'оттечь': [3],
            'советского': [3],
            'союза': [2],
            'пора': [3, 1],
            'изжила': [5],
            'меда': [1],
            'автоподъёмник': [8],
            'каракуля': [3],
            'супервайзер': [6],
            'колесом': [5]
        }
        for word, expected_positions in checks.items():
            predicted = sorted(self.stress_predictor.predict(word))
            self.assertEqual(predicted, sorted(expected_positions))
48 |
--------------------------------------------------------------------------------
/rupo/util/mixins.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Служебные миксины для удобства сериализации.
4 |
5 |
def to_dict(obj):
    """
    Recursively convert an object into plain dicts/lists/scalars.

    Dicts keep their keys with converted values; non-string iterables become
    lists; objects with a ``__dict__`` become dicts of their public,
    non-callable attributes; everything else is returned unchanged.

    :param obj: object to convert.
    :return: the plain-data representation of ``obj``.
    """
    if isinstance(obj, dict):
        return {key: to_dict(value) for key, value in obj.items()}
    if hasattr(obj, "__iter__") and not isinstance(obj, str):
        return [to_dict(item) for item in obj]
    if hasattr(obj, "__dict__"):
        # Skip private (underscore) and callable attributes.
        return {key: to_dict(value)
                for key, value in obj.__dict__.items()
                if not callable(value) and not key.startswith('_')}
    return obj
26 |
27 |
class CommonMixin(object):
    """
    Mixin providing attribute-based equality, hashing and dict conversion.
    """
    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return NotImplemented
        return self.__dict__ == other.__dict__

    def __ne__(self, other):
        if not isinstance(other, self.__class__):
            return NotImplemented
        return not self.__eq__(other)

    def __hash__(self):
        # Sort items so hashing does not depend on attribute insertion order.
        return hash(tuple(sorted(self.__dict__.items())))

    def __repr__(self):
        return str(self.to_dict())

    # String form is identical to the debug representation.
    __str__ = __repr__

    def to_dict(self):
        return to_dict(self)
--------------------------------------------------------------------------------
/rupo/data/examples/morph_markup.txt:
--------------------------------------------------------------------------------
1 | забывши забыть VERB Aspect=Perf|Tense=Past|VerbForm=Trans
2 | волнения волнение NOUN Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing
3 | жизни жизнь NOUN Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing
4 | мятежной мятежный ADJ Case=Gen|Gender=Fem|Number=Sing
5 |
6 | один один DET Case=Nom|Gender=Masc|Number=Sing
7 | жил жить VERB Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin
8 | в в ADP _
9 | пустыне пустыня NOUN Animacy=Inan|Case=Loc|Gender=Fem|Number=Sing
10 | рыбак рыбак NOUN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing
11 | молодой молодая NOUN Animacy=Anim|Case=Gen|Gender=Fem|Number=Sing
12 |
13 | однажды однажды ADV _
14 | на на ADP _
15 | скале скал NOUN Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing
16 | прибрежной прибрежный ADJ Case=Gen|Gender=Fem|Number=Sing
17 |
18 | над над ADP _
19 | тихой тихий ADJ Case=Gen|Gender=Fem|Number=Sing
20 | прозрачной прозрачный ADJ Case=Gen|Gender=Fem|Number=Sing
21 | рекой река NOUN Animacy=Inan|Case=Ins|Gender=Fem|Number=Sing
22 |
23 | он он PRON Case=Nom|Gender=Masc|Number=Sing|Person=3
24 | с с ADP _
25 | удой уда PROPN Animacy=Inan|Case=Ins|Gender=Fem|Number=Sing
26 | беспечно беспечно ADV _
27 |
28 | сидел сидеть VERB Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin
29 |
30 | и и CONJ _
31 | думой дума NOUN Animacy=Inan|Case=Ins|Gender=Fem|Number=Sing
32 | сердечной сердечный ADJ Case=Gen|Gender=Fem|Number=Sing
33 |
34 | к к ADP _
35 | прошедшему прошедшее NOUN Animacy=Inan|Case=Dat|Gender=Neut|Number=Sing
36 | счастью счастие NOUN Animacy=Inan|Case=Dat|Gender=Neut|Number=Sing
37 | летел лететь VERB Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin
38 |
39 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 | from setuptools.command.develop import develop
3 | from setuptools.command.install import install
4 |
5 |
class PostDevelopCommand(develop):
    """`setup.py develop` hook; currently just delegates to the default command."""
    def run(self):
        super().run()
9 |
10 |
class PostInstallCommand(install):
    """`setup.py install` hook; currently just delegates to the default command."""
    def run(self):
        super().run()
14 |
15 |
setup(
    name='rupo',
    packages=find_packages(),
    version='0.2.8',
    description='RuPo: library for russian poetry analysis and generation',
    author='Ilya Gusev',
    author_email='phoenixilya@gmail.com',
    url='https://github.com/IlyaGusev/rupo',
    download_url='https://github.com/IlyaGusev/rupo/archive/0.2.8.tar.gz',
    keywords=['poetry', 'nlp', 'russian'],
    # Non-code files shipped inside the installed package.
    package_data={
        'rupo': ['data/examples/*', 'data/hyphen-tokens.txt']
    },
    install_requires=[
        'dicttoxml>=1.7.4',
        'pygtrie>=2.2',
        'numpy>=1.11.3',
        'scipy>=0.18.1',
        'scikit-learn>=0.18.1',
        'jsonpickle>=0.9.4',
        'pymorphy2>=0.8',
        'h5py>=2.7.0',
        'russian-tagsets==0.6',
        'tqdm>=4.14.0',
        'rnnmorph==0.2.3',
        'sentence_splitter>=1.2',
        'rulm==0.0.2',
        'russ==0.0.1'
    ],
    cmdclass={
        'develop': PostDevelopCommand,
        'install': PostInstallCommand,
    },
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',

        'Topic :: Text Processing :: Linguistic',

        'License :: OSI Approved :: Apache Software License',

        'Natural Language :: Russian',

        # Fix: '3.5' was accidentally listed twice.
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
    ],
)
64 |
--------------------------------------------------------------------------------
/rupo/util/preprocess.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Служебные функции и константы.
4 |
5 | import re
6 |
# NOTE(review): "CYRRILIC" is a misspelling of "CYRILLIC"; kept because other
# modules import these names.
CYRRILIC_LOWER_VOWELS = "аоэиуыеёюя"
# Also contains the soft/hard signs "ь"/"ъ" — TODO confirm that is intended.
CYRRILIC_LOWER_CONSONANTS = "йцкнгшщзхъфвпрлджчсмтьб"
# Latin and Cyrillic vowels, both cases.
VOWELS = "aeiouAEIOUаоэиуыеёюяАОЭИУЫЕЁЮЯ"
# Presumably the sonorants that may close a syllable — confirm against the
# syllable-splitting code that consumes this constant.
CLOSED_SYLLABLE_CHARS = "рлймнРЛЙМН"
11 |
12 |
def text_to_wordlist(sentence, cyrillic=False):
    """
    Split a sentence into lowercase words, dropping every non-letter character.

    :param sentence: input text.
    :param cyrillic: when True, keep only Cyrillic letters.
    :return: list of lowercase words.
    """
    pattern = "[^а-яА-Яё]" if cyrillic else "[^а-яА-Яёa-zA-Z]"
    return re.sub(pattern, " ", sentence).lower().split()
20 |
21 |
def text_to_sentences(text):
    """
    Split raw text into sentences with regex heuristics.

    Dots that are unlikely to end a sentence (initials, abbreviations, ".,")
    are temporarily replaced with "$" so they survive the split, then restored.

    :param text: raw text.
    :return: list of stripped sentence strings.
    """
    # Sentence boundary: ./?/! followed by whitespace and a capital letter,
    # or a semicolon / colon-plus-dash combination.
    boundary = r"[\.\?!](?=[\s\n]*[A-ZА-Я])|;|:-|:—|:—|: —|: —|: -"
    # NOTE(review): the [A-z] ranges also match a few ASCII punctuation
    # characters between 'Z' and 'a' — kept as-is to preserve behaviour.
    protected = [r"(?<=[^A-zА-я][A-ZА-Я])\.",
                 r"(?<=[^A-zА-я][A-zА-я])\.[ ]?(?=[A-zА-я][^A-zА-я])",
                 r"\.(?=,)"
                 ]
    for pattern in protected:
        text = "$".join(re.split(pattern, text))

    # Return a list instead of a lazy single-use `map` iterator so callers
    # can iterate (or index) the result more than once.
    return [part.strip().replace("$", ".") for part in re.split(boundary, text)]
34 |
35 |
def to_cyrrilic(text):
    """
    Map Latin lookalike letters onto their Cyrillic twins and fold "ё" into "е".

    :param text: input string.
    :return: converted string.
    """
    # Single-pass equivalent of the chained str.replace calls.
    return text.translate(str.maketrans("xayocё", "хауосе"))
43 |
44 |
def normilize_line(text):
    """
    Normalise a line for comparison: keep letters/digits only, lowercase,
    drop all whitespace and fold Latin lookalikes into Cyrillic.

    (The name keeps its historical misspelling for compatibility.)
    """
    letters_only = re.sub("[^а-яА-Яёa-zA-Z0-9]", " ", text)
    squashed = "".join(letters_only.lower().split())
    return to_cyrrilic(squashed)
50 |
51 |
def count_vowels(string):
    """Return how many characters of the string are vowels (Latin or Cyrillic)."""
    return sum(1 for symbol in string if symbol in VOWELS)
58 |
59 |
def get_first_vowel_position(string):
    """Return the index of the first vowel in the string, or -1 when there is none."""
    return next((position for position, symbol in enumerate(string)
                 if symbol in VOWELS), -1)
65 |
66 |
def etree_to_dict(t):
    """
    Recursively convert an element tree into nested dicts.

    Inner nodes map tag -> list of child dicts; leaves map tag -> text.
    The previous version wrapped the children in a lazy ``map`` object,
    which is always truthy, so the ``or t.text`` fallback for leaf nodes
    could never trigger (and the lazy object was single-use).
    """
    children = [etree_to_dict(child) for child in t.iterchildren()]
    return {t.tag: children or t.text}
--------------------------------------------------------------------------------
/rupo/main/morph.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Разметка по грамматическим значениям
4 |
5 | import os
6 | from typing import List, TextIO
7 |
8 | from sentence_splitter import SentenceSplitter
9 | from rnnmorph.predictor import RNNMorphPredictor
10 |
11 | from rupo.main.tokenizer import Tokenizer, Token
12 |
13 |
class Morph:
    """Morphological (grammatical) markup of plain-text files."""

    @staticmethod
    def get_morph_markup(input_filenames: List[str], output_filename: str):
        """
        Write grammatical markup for every sentence of the input files.

        :param input_filenames: input plain-text files.
        :param output_filename: path the markup is written to.
        """
        if os.path.exists(output_filename):
            os.remove(output_filename)

        sentence_splitter = SentenceSplitter(language='ru')
        morph_predictor = RNNMorphPredictor()

        # Open the output once for all inputs: the previous code reopened it
        # with mode "w+" for every input file, truncating everything written
        # for the earlier files.
        with open(output_filename, "w", encoding="utf-8") as w:
            for filename in input_filenames:
                with open(filename, "r", encoding="utf-8") as r:
                    for line in r:
                        Morph.__process_line(line, w, sentence_splitter, morph_predictor)

    @staticmethod
    def __process_line(line: str, output_file: TextIO, sentence_splitter: SentenceSplitter,
                       morph_predictor: RNNMorphPredictor):
        """Tag one line; write word<TAB>lemma<TAB>POS<TAB>grammemes rows, blank line per sentence."""
        sentences = sentence_splitter.split(line)
        for sentence in sentences:
            words = [token.text for token in Tokenizer.tokenize(sentence)
                     if token.text != '' and token.token_type != Token.TokenType.SPACE]
            if not words:
                continue
            forms = morph_predictor.predict_sentence_tags(words)
            for form in forms:
                # Punctuation carries no grammatical value in this markup.
                if form.pos == "PUNCT":
                    continue
                output_file.write("%s\t%s\t%s\t%s\n" % (form.word, form.normal_form, form.pos, form.tag))
            output_file.write("\n")
48 | output_file.write("\n")
49 |
--------------------------------------------------------------------------------
/rupo/stress/word.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Класс слова с ударением.
4 |
5 | from enum import Enum
6 | from typing import List, Set
7 | from russ.syllables import get_syllables
8 |
9 |
class Stress:
    """
    A stress mark: a character position plus a stress type.
    """

    class Type(Enum):
        ANY = -1
        PRIMARY = 0
        SECONDARY = 1

    def __init__(self, position: int, stress_type: Type=Type.PRIMARY) -> None:
        self.position = position
        self.type = stress_type

    def __eq__(self, other: 'Stress'):
        # Both the position and the type must match.
        return (self.position, self.type) == (other.position, other.type)

    def __hash__(self):
        # Hash only on position, so stresses of different types at the same
        # position land in the same bucket.
        return hash(self.position)

    def __str__(self):
        return "{}\t{}".format(self.position, self.type)

    def __repr__(self):
        return self.__str__()
35 |
36 |
class StressedWord:
    """
    A word together with the set of its stresses and derived syllables.
    """

    def __init__(self, text: str, stresses: Set[Stress]) -> None:
        self.text = text
        self.stresses = stresses
        self.syllables = get_syllables(text)
        self.__accent_syllables()

    def get_primary_stresses(self) -> List[int]:
        return self.__positions_of(Stress.Type.PRIMARY)

    def get_secondary_stresses(self) -> List[int]:
        return self.__positions_of(Stress.Type.SECONDARY)

    def add_stress(self, position: int, stress_type: Stress.Type=Stress.Type.PRIMARY) -> None:
        """Add a single stress and refresh syllable accents."""
        self.stresses.add(Stress(position, stress_type))
        self.__accent_syllables()

    def add_stresses(self, stresses: List[Stress]) -> None:
        """Merge several stresses in and refresh syllable accents."""
        self.stresses = set(self.stresses).union(set(stresses))
        self.__accent_syllables()

    def __positions_of(self, stress_type) -> List[int]:
        # Positions of all stresses of the given type.
        return [stress.position for stress in self.stresses if stress.type == stress_type]

    def __accent_syllables(self):
        # A syllable is accented when a stress sits on its vowel; the probe
        # Stress(...) defaults to Type.PRIMARY, so only primary stresses can
        # mark a syllable (set equality compares type as well as position).
        for syllable in self.syllables:
            vowel_position = syllable.vowel()
            syllable.stress = vowel_position if Stress(vowel_position) in self.stresses else -1

    def __str__(self):
        primary = ",".join(str(position) for position in self.get_primary_stresses())
        secondary = ",".join(str(position) for position in self.get_secondary_stresses())
        return self.text + "\t" + primary + "\t" + secondary

    def __repr__(self):
        return self.__str__()

    def __hash__(self):
        return hash(self.text)

    def __eq__(self, other: 'StressedWord'):
        return self.text == other.text
81 |
--------------------------------------------------------------------------------
/rupo/dict/wiki.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from rupo.settings import RU_GRAPHEME_SET
4 |
5 | from russ.syllables import VOWELS
6 |
7 |
class WikiDict:
    """Converters/cleaners for the Wiktionary-derived pronunciation dictionary."""

    @staticmethod
    def convert_to_g2p_only(source_file, destination_file):
        """
        Copy a word<TAB>transcription file, stripping the stress marks
        (' and ˌ) from the transcription column.

        :param source_file: input dictionary path.
        :param destination_file: output path (may equal source_file).
        """
        # Read everything first so that source and destination may point to
        # the same file; also split each line only once (the previous code
        # split every line twice and built two parallel lists needlessly).
        with open(source_file, 'r', encoding='utf-8') as r:
            lines = r.readlines()
        with open(destination_file, 'w', encoding='utf-8') as w:
            for line in lines:
                columns = line.split("\t")
                word = columns[0].strip()
                phonetic_word = columns[1].replace("'", "").replace("ˌ", "").strip()
                w.write(word + "\t" + phonetic_word + "\n")

    @staticmethod
    def first_clean_up(filename):
        """
        Filter and normalise a raw word#transcription dump in place.

        Keeps only entries that carry a stress mark, consist solely of known
        Russian graphemes and whose vowel counts agree between the word and
        its transcription.
        """
        words = []
        phonetic_words = []
        # Explicit encoding: the dump is UTF-8; the platform default (e.g.
        # cp1251/cp1252 on Windows) would corrupt the Cyrillic text.
        with open(filename, "r", encoding="utf-8") as f:
            lines = f.readlines()
            print(len(lines))
            for line in lines:
                word = line.split("#")[0]
                word = word.lower()
                phonetic_word = line.split("#")[1]
                # Entries without any stress mark are useless as stress data.
                if "'" not in phonetic_word and "ˈ" not in phonetic_word:
                    continue
                # Keep only the first of several transcription variants.
                phonetic_word = phonetic_word.split("/")[0].strip()
                phonetic_word = phonetic_word.split("~")[0].strip()
                phonetic_word = phonetic_word.split(";")[0].strip()
                phonetic_word = phonetic_word.split(",")[0].strip()
                phonetic_word = phonetic_word.replace("ˈ", "'")
                phonetic_word = phonetic_word.replace(":", "ː")
                phonetic_word = re.sub(r"[\s̟̥̻.̞]", "", phonetic_word)
                phonetic_word = re.sub(r"[(⁽][^)⁾]*[)⁾]", "", phonetic_word)
                # NOTE(review): `Phonemes` is not imported anywhere in this
                # module, so this line raises NameError at runtime — the
                # missing import must be restored before first_clean_up works.
                phonetic_word = Phonemes.clean(phonetic_word)
                wrong_chars = [ch for ch in word if ch not in RU_GRAPHEME_SET]
                if len(wrong_chars) != 0:
                    continue
                if len(word) == 0 or len(phonetic_word) == 0:
                    continue
                # Vowel counts must agree between spelling and transcription.
                if sum([1 for ch in word if ch in "еуаоэяиюёы"]) != \
                        sum([1 for ch in phonetic_word if ch in VOWELS]):
                    continue
                words.append(word)
                phonetic_words.append(phonetic_word)
        print(len(words))
        with open(filename, "w", encoding="utf-8") as f:
            for i, word in enumerate(words):
                f.write(word + "\t" + phonetic_words[i] + "\n")
--------------------------------------------------------------------------------
/docs/source/rupo.generate.rst:
--------------------------------------------------------------------------------
1 | rupo.generate package
2 | =====================
3 |
4 | Submodules
5 | ----------
6 |
7 | rupo.generate.corpora_information_loader module
8 | -----------------------------------------------
9 |
10 | .. automodule:: rupo.generate.corpora_information_loader
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | rupo.generate.filters module
16 | ----------------------------
17 |
18 | .. automodule:: rupo.generate.filters
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | rupo.generate.gen module
24 | ------------------------
25 |
26 | .. automodule:: rupo.generate.gen
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | rupo.generate.generator module
32 | ------------------------------
33 |
34 | .. automodule:: rupo.generate.generator
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 | rupo.generate.grammeme_vectorizer module
40 | ----------------------------------------
41 |
42 | .. automodule:: rupo.generate.grammeme_vectorizer
43 | :members:
44 | :undoc-members:
45 | :show-inheritance:
46 |
47 | rupo.generate.lstm module
48 | -------------------------
49 |
50 | .. automodule:: rupo.generate.lstm
51 | :members:
52 | :undoc-members:
53 | :show-inheritance:
54 |
55 | rupo.generate.markov module
56 | ---------------------------
57 |
58 | .. automodule:: rupo.generate.markov
59 | :members:
60 | :undoc-members:
61 | :show-inheritance:
62 |
63 | rupo.generate.model_container module
64 | ------------------------------------
65 |
66 | .. automodule:: rupo.generate.model_container
67 | :members:
68 | :undoc-members:
69 | :show-inheritance:
70 |
71 | rupo.generate.test_generator module
72 | -----------------------------------
73 |
74 | .. automodule:: rupo.generate.test_generator
75 | :members:
76 | :undoc-members:
77 | :show-inheritance:
78 |
79 | rupo.generate.test_markov module
80 | --------------------------------
81 |
82 | .. automodule:: rupo.generate.test_markov
83 | :members:
84 | :undoc-members:
85 | :show-inheritance:
86 |
87 | rupo.generate.tqdm_open module
88 | ------------------------------
89 |
90 | .. automodule:: rupo.generate.tqdm_open
91 | :members:
92 | :undoc-members:
93 | :show-inheritance:
94 |
95 | rupo.generate.word_form module
96 | ------------------------------
97 |
98 | .. automodule:: rupo.generate.word_form
99 | :members:
100 | :undoc-members:
101 | :show-inheritance:
102 |
103 | rupo.generate.word_form_vocabulary module
104 | -----------------------------------------
105 |
106 | .. automodule:: rupo.generate.word_form_vocabulary
107 | :members:
108 | :undoc-members:
109 | :show-inheritance:
110 |
111 |
112 | Module contents
113 | ---------------
114 |
115 | .. automodule:: rupo.generate
116 | :members:
117 | :undoc-members:
118 | :show-inheritance:
119 |
--------------------------------------------------------------------------------
/rupo/settings.py:
--------------------------------------------------------------------------------
# Resource paths for all bundled data, resolved relative to the installed package.
from pkg_resources import resource_filename
# NOTE(review): "foo.conf" does not exist in the package; this looks like a
# leftover from the pkg_resources documentation example — confirm and remove.
foo_config = resource_filename(__name__, 'foo.conf')

CLASSIFIER_DIR = resource_filename(__name__, "data/classifier/")

DATA_DIR = resource_filename(__name__, "data")

# Pronunciation / stress dictionaries.
DICT_DIR = resource_filename(__name__, "data/dict")
CMU_DICT = resource_filename(__name__, "data/dict/cmu.txt")
ZALYZNYAK_DICT = resource_filename(__name__, "data/dict/zaliznyak.txt")
RU_WIKI_DICT = resource_filename(__name__, "data/dict/wiki_ru.txt")

# Grapheme-to-phoneme aligner pickles.
RU_ALIGNER_DEFAULT_PATH = resource_filename(__name__, "data/g2p_models/ru_aligner.pickle")
EN_ALIGNER_DEFAULT_PATH = resource_filename(__name__, "data/g2p_models/en_aligner.pickle")

# Russian stress/G2P resources and models.
RU_GRAPHEME_STRESS_PATH = resource_filename(__name__, "data/dict/ru_grapheme_stress.txt")
RU_GRAPHEME_STRESS_TRIE_PATH = resource_filename(__name__, "data/dict/ru_grapheme_stress.trie")
RU_G2P_DICT_PATH = resource_filename(__name__, "data/dict/ru_g2p.txt")
RU_PHONEME_STRESS_PATH = resource_filename(__name__, "data/dict/ru_phoneme_stress.txt")
RU_PHONEME_STRESS_BIG_PATH = resource_filename(__name__, "data/dict/ru_phoneme_stress_big.txt")
RU_PHONEME_STRESS_TRIE_PATH = resource_filename(__name__, "data/dict/ru_phoneme_stress.trie")
RU_G2P_DEFAULT_MODEL = resource_filename(__name__, "data/g2p_models/g2p_ru_maxlen40_BLSTM256_BLSTM256_dropout0.2_acc992_wer140.h5")
RU_STRESS_DEFAULT_MODEL = resource_filename(__name__, "data/stress_models/ru_main")

# English stress/G2P resources and models.
EN_G2P_DICT_PATH = resource_filename(__name__, "data/dict/en_g2p.txt")
EN_PHONEME_STRESS_PATH = resource_filename(__name__, "data/dict/en_phoneme_stress.txt")
EN_PHONEME_STRESS_TRIE_PATH = resource_filename(__name__, "data/dict/en_phoneme_stress.trie")
EN_G2P_DEFAULT_MODEL = resource_filename(__name__, "data/g2p_models/g2p_en_maxlen40_BLSTM256+LSTM256_LSTM128_dropout0.4_acc977_wer379.h5")
EN_STRESS_DEFAULT_MODEL = resource_filename(__name__, "data/stress_models/stress_en_LSTM128_dropout0.2_acc99_wer10.h5")

# Example files used by the tests.
EXAMPLES_DIR = resource_filename(__name__, "data/examples/")
MARKUP_XML_EXAMPLE = resource_filename(__name__, "data/examples/markup.xml")
MARKUP_JSON_EXAMPLE = resource_filename(__name__, "data/examples/markup.json")
TEXT_XML_EXAMPLE = resource_filename(__name__, "data/examples/text.xml")
TEXT_TXT_EXAMPLE = resource_filename(__name__, "data/examples/text.txt")
HYPHEN_TOKENS = resource_filename(__name__, "data/hyphen-tokens.txt")

G2P_CURRENT_MODEL_DIR = resource_filename(__name__, "data/g2p_models/")
ACCENT_CURRENT_MODEL_DIR = resource_filename(__name__, "data/stress_models/")

# Poem-generator model artifacts.
GENERATOR_MODEL_DIR = resource_filename(__name__, "data/generator_models/")
GENERATOR_WORD_FORM_VOCAB_PATH = resource_filename(__name__, "data/generator_models/vocabulary")
GENERATOR_VOCAB_PATH = resource_filename(__name__, "data/generator_models/stress.pickle")

TEMP_PATH = resource_filename(__name__, "data/temp.txt")

# Alphabets accepted by the stress/G2P models.
RU_GRAPHEME_SET = " абвгдеёжзийклмнопрстуфхцчшщьыъэюя-"
EN_GRAPHEME_SET = " abcdefghijklmnopqrstuvwxyz.'-"
49 |
--------------------------------------------------------------------------------
/rupo/main/test_tokenizer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты для токенизатора.
4 |
5 | import unittest
6 |
7 | from rupo.main.tokenizer import Tokenizer, SentenceTokenizer, Token
8 |
9 |
class TestTokenizer(unittest.TestCase):
    """Tests of the word tokenizer: token text, type and [begin, end) offsets."""

    def test_tokenizer(self):
        """Words, punctuation and spaces are split apart; hyphenated words stay whole."""
        text = "О, когда-нибудь, когда?"
        self.assertEqual(Tokenizer.tokenize(text), [
            Token('О', Token.TokenType.WORD, 0, 1),
            Token(',', Token.TokenType.PUNCTUATION, 1, 2),
            Token(' ', Token.TokenType.SPACE, 2, 3),
            Token('когда-нибудь', Token.TokenType.WORD, 3, 15),
            Token(',', Token.TokenType.PUNCTUATION, 15, 16),
            Token(' ', Token.TokenType.SPACE, 16, 17),
            Token('когда', Token.TokenType.WORD, 17, 22),
            Token('?', Token.TokenType.PUNCTUATION, 22, 23)])

        text = " Пора"
        self.assertEqual(Tokenizer.tokenize(text), [
            Token(' ', Token.TokenType.SPACE, 0, 1),
            Token('Пора', Token.TokenType.WORD, 1, 5)])

    def test_numbers(self):
        """Numbers become NUMBER tokens; with replace_numbers=True they become WORD 'ЧИСЛО'."""
        text = "Очевидно, 1 января 1970 года..."
        self.assertEqual(Tokenizer.tokenize(text), [
            Token('Очевидно', Token.TokenType.WORD, 0, 8),
            Token(',', Token.TokenType.PUNCTUATION, 8, 9),
            Token(' ', Token.TokenType.SPACE, 9, 10),
            Token('1', Token.TokenType.NUMBER, 10, 11),
            Token(' ', Token.TokenType.SPACE, 11, 12),
            Token('января', Token.TokenType.WORD, 12, 18),
            Token(' ', Token.TokenType.SPACE, 18, 19),
            Token('1970', Token.TokenType.NUMBER, 19, 23),
            Token(' ', Token.TokenType.SPACE, 23, 24),
            Token('года', Token.TokenType.WORD, 24, 28),
            Token('...', Token.TokenType.PUNCTUATION, 28, 31)])

        # Replaced numbers keep their original character offsets.
        self.assertEqual(Tokenizer.tokenize(text, replace_numbers=True), [
            Token('Очевидно', Token.TokenType.WORD, 0, 8),
            Token(',', Token.TokenType.PUNCTUATION, 8, 9),
            Token(' ', Token.TokenType.SPACE, 9, 10),
            Token('ЧИСЛО', Token.TokenType.WORD, 10, 11),
            Token(' ', Token.TokenType.SPACE, 11, 12),
            Token('января', Token.TokenType.WORD, 12, 18),
            Token(' ', Token.TokenType.SPACE, 18, 19),
            Token('ЧИСЛО', Token.TokenType.WORD, 19, 23),
            Token(' ', Token.TokenType.SPACE, 23, 24),
            Token('года', Token.TokenType.WORD, 24, 28),
            Token('...', Token.TokenType.PUNCTUATION, 28, 31)])
55 |
56 |
class TestSentenceTokenizer(unittest.TestCase):
    """Tests of sentence-level tokenization (abbreviations must not split sentences)."""

    def test_tokenizer(self):
        text = "Конкурс учреждён в 2005 году!!! Официальный партнёр конкурса – Президентский центр Б.Н. Ельцина."
        expected = [
            'Конкурс учреждён в 2005 году!!!',
            'Официальный партнёр конкурса – Президентский центр Б.Н. Ельцина.',
        ]
        self.assertEqual(SentenceTokenizer.tokenize(text), expected)
63 |
--------------------------------------------------------------------------------
/rupo/test_api.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты для API библиотеки.
4 |
5 | import unittest
6 | import os
7 | import random
8 |
9 | from rupo.settings import MARKUP_XML_EXAMPLE, EXAMPLES_DIR, GENERATOR_MODEL_DIR, \
10 | GENERATOR_WORD_FORM_VOCAB_PATH, GENERATOR_VOCAB_PATH, RU_STRESS_DEFAULT_MODEL,\
11 | ZALYZNYAK_DICT
12 | from rupo.main.markup import Markup
13 | from rupo.api import Engine
14 |
15 |
class TestApi(unittest.TestCase):
    """End-to-end tests of the Engine facade: stress, syllables, rhyme, metre, generation."""

    @classmethod
    def setUpClass(cls):
        # One shared engine for the whole class: loading the stress model is expensive.
        cls.engine = Engine(language="ru")
        cls.engine.load(
            stress_model_path=RU_STRESS_DEFAULT_MODEL,
            zalyzniak_dict=ZALYZNYAK_DICT
        )

    @classmethod
    def tearDownClass(cls):
        del cls.engine

    def test_stress(self):
        """Stress positions are 0-based letter indices; ambiguous words return several."""
        self.assertEqual(self.engine.get_stresses("корова"), [3])
        self.assertEqual(self.engine.get_stresses("триплекс"), [2])
        self.assertEqual(self.engine.get_stresses("горит"), [3])
        self.assertEqual(self.engine.get_stresses("восток"), [4])
        self.assertEqual(self.engine.get_stresses("зарёю"), [3])
        self.assertEqual(self.engine.get_stresses("новой"), [1])
        self.assertEqual(self.engine.get_stresses("равнине"), [4])
        self.assertEqual(self.engine.get_stresses("холмам"), [4])
        self.assertEqual(self.engine.get_stresses("грохочут"), [4])
        # Order-insensitive: "пушки" legitimately allows two stresses.
        self.assertCountEqual(self.engine.get_stresses("пушки"), [4, 1])
        self.assertEqual(self.engine.get_stresses("багровый"), [4])
        self.assertEqual(self.engine.get_stresses("кругами"), [4])
        self.assertEqual(self.engine.get_stresses("уж"), [0])
        self.assertEqual(self.engine.get_stresses('колесом'), [5])

    def test_get_word_syllables(self):
        """Syllable split of a simple CV-CV-CV word."""
        self.assertEqual(self.engine.get_word_syllables("корова"), ["ко", "ро", "ва"])

    def test_count_syllables(self):
        self.assertEqual(self.engine.count_syllables("корова"), 3)

    def test_is_rhyme(self):
        self.assertTrue(self.engine.is_rhyme("корова", "здорова"))

    def test_get_markup(self):
        self.assertIsInstance(self.engine.get_markup("корова"), Markup)

    def test_get_improved_markup(self):
        self.assertIsInstance(self.engine.get_improved_markup("корова")[0], Markup)

    def test_classify_metre(self):
        """A classic iambic quatrain must be classified as iambos."""
        text = "Горит восток зарёю новой.\n" \
               "Уж на равнине, по холмам\n" \
               "Грохочут пушки. Дым багровый\n" \
               "Кругами всходит к небесам."
        self.assertEqual(self.engine.classify_metre(text), "iambos")

    def test_generate_poem(self):
        # Seeded so the sampling-based generator is deterministic.
        random.seed(42)
        model_path = GENERATOR_MODEL_DIR
        vocab_path = GENERATOR_WORD_FORM_VOCAB_PATH
        stress_path = GENERATOR_VOCAB_PATH
        poem = self.engine.generate_poem(
            model_path,
            vocab_path,
            stress_path,
            sampling_k=10000,
            n_syllables=8,
            rhyme_pattern="abab",
            metre_schema="-+")
        self.assertIsNotNone(poem)

    def test_get_word_rhymes(self):
        vocab_dump_file = os.path.join(EXAMPLES_DIR, "vocab_rhymes.pickle")
        self.assertEqual(self.engine.get_word_rhymes("глядел", vocab_dump_file, MARKUP_XML_EXAMPLE), ["сидел", "летел"])
85 |
86 |
--------------------------------------------------------------------------------
/rupo/stress/predictor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Класс для определения ударения.
4 |
5 | from typing import List
6 |
7 | from rupo.stress.dict import StressDict
8 | from rupo.util.preprocess import count_vowels, get_first_vowel_position
9 | from rupo.settings import CMU_DICT, ZALYZNYAK_DICT, RU_STRESS_DEFAULT_MODEL
10 | from rupo.stress.word import Stress
11 |
12 | from russ.stress.model import StressModel
13 |
14 |
class StressPredictor:
    """Interface of a stress predictor: maps a word to stressed-letter positions."""
    def predict(self, word: str) -> List[int]:
        """
        Return the 0-based positions of stressed letters in the word.

        :param word: word to analyse.
        :return: list of stress positions.
        """
        raise NotImplementedError()
18 |
19 |
class DictStressPredictor(StressPredictor):
    """Stress predictor backed by a pre-built stress dictionary."""
    def __init__(self, language="ru", raw_dict_path=None, trie_path=None,
                 zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT):
        self.stress_dict = StressDict(language, raw_dict_path=raw_dict_path, trie_path=trie_path,
                                      zalyzniak_dict=zalyzniak_dict, cmu_dict=cmu_dict)

    def predict(self, word: str) -> List[int]:
        """
        Dictionary-based stress detection. Several stress variants are possible.

        :param word: word to put stresses in.
        :return stresses: positions of the letters the stress falls on.
        """
        stresses = []
        if count_vowels(word) == 0:
            # No vowels — no stresses either.
            pass
        elif count_vowels(word) == 1:
            # A single vowel always carries the stress.
            stresses.append(get_first_vowel_position(word))
        elif word.find("ё") != -1:
            # If the word contains "ё", only it can be stressed.
            stresses.append(word.find("ё"))
        else:
            # Look up stressed forms of the word in the dictionary.
            stresses = self.stress_dict.get_stresses(word, Stress.Type.PRIMARY) +\
                       self.stress_dict.get_stresses(word, Stress.Type.SECONDARY)
            if 'е' not in word:
                return stresses
            # Generate every possible 'е' -> 'ё' substitution (2^k variants
            # for k letters 'е'), since dictionaries often store "ё" forms.
            positions = [i for i in range(len(word)) if word[i] == 'е']
            beam = [word[:positions[0]]]
            for i in range(len(positions)):
                new_beam = []
                for prefix in beam:
                    n = positions[i+1] if i+1 < len(positions) else len(word)
                    new_beam.append(prefix + 'ё' + word[positions[i]+1:n])
                    new_beam.append(prefix + 'е' + word[positions[i]+1:n])
                beam = new_beam
            # ...and check each variant against the dictionary: a matching
            # "ё" form pins the stress onto that "ё" position.
            for permutation in beam:
                if len(self.stress_dict.get_stresses(permutation)) != 0:
                    yo_pos = permutation.find("ё")
                    if yo_pos != -1:
                        stresses.append(yo_pos)
        return stresses
66 |
67 |
class CombinedStressPredictor(StressPredictor):
    """Dictionary lookup first, RNN stress model as the fallback."""

    def __init__(self, language="ru", stress_model_path: str=RU_STRESS_DEFAULT_MODEL, raw_stress_dict_path=None,
                 stress_trie_path=None, zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT):
        self.rnn = StressModel.load(stress_model_path)
        self.dict = DictStressPredictor(language, raw_stress_dict_path, stress_trie_path, zalyzniak_dict, cmu_dict)

    def predict(self, word: str) -> List[int]:
        """Return dictionary stresses when available, otherwise the RNN's prediction."""
        dictionary_stresses = self.dict.predict(word)
        if dictionary_stresses:
            return dictionary_stresses
        return self.rnn.predict(word)
80 |
--------------------------------------------------------------------------------
/rupo/main/vocabulary.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Индексы слов для языковой модели.
4 |
5 | from typing import Dict
6 | import pickle
7 |
8 | from allennlp.data.vocabulary import Vocabulary
9 |
10 | from rupo.main.markup import Markup
11 | from rupo.files.reader import Reader, FileType
12 | from rupo.stress.word import StressedWord, Stress
13 | from rupo.stress.predictor import StressPredictor
14 |
15 |
class StressVocabulary(object):
    """
    Indexed vocabulary: a bidirectional mapping between stressed words and
    integer indexes.
    """
    def __init__(self) -> None:
        self.word_to_index = {}  # type: Dict[StressedWord, int]
        self.index_to_word = {}  # type: Dict[int, StressedWord]

    def save(self, dump_filename: str) -> None:
        """
        Pickle the whole vocabulary to a file.
        """
        with open(dump_filename, "wb") as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    def load(self, dump_filename: str):
        """
        Load a vocabulary previously written by save().
        """
        with open(dump_filename, "rb") as f:
            vocab = pickle.load(f)
            self.__dict__.update(vocab.__dict__)

    def parse(self, markup_path: str, from_voc: bool=False):
        """
        Fill the vocabulary from either a ready vocabulary file or XML markups.

        :param markup_path: path to the vocabulary / markup file.
        :param from_voc: True when the file is a ready-made vocabulary.
        """
        if from_voc:
            word_indexes = Reader.read_vocabulary(markup_path)
            for word, index in word_indexes:
                self.add_word(word.to_stressed_word(), index)
        else:
            markups = Reader.read_markups(markup_path, FileType.XML, is_processed=True)
            for markup in markups:
                self.add_markup(markup)

    def add_markup(self, markup: Markup) -> None:
        """
        Add all words of a markup to the vocabulary.

        :param markup: the markup.
        """
        for line in markup.lines:
            for word in line.words:
                self.add_word(word.to_stressed_word())

    def add_word(self, word: StressedWord, index: int=-1) -> bool:
        """
        Add a word.

        :param word: the word.
        :param index: its index, when predetermined.
        :return: whether the word was new.
        """
        if word in self.word_to_index:
            # NOTE(review): for an already-known word only index_to_word is
            # updated; word_to_index keeps the word's first index — confirm
            # this asymmetry is intended.
            if index != -1:
                self.index_to_word[index] = word
            return False
        # size() is len(index_to_word), which the first assignment below does
        # not change, so both computations yield the same fresh index.
        self.word_to_index[word] = self.size() if index == -1 else index
        self.index_to_word[self.size() if index == -1 else index] = word
        return True

    def get_word_index(self, word: StressedWord) -> int:
        """
        Get the index of a word.

        :param word: the word (StressedWord).
        :return: its index.
        :raises IndexError: when the word is unknown.
        """
        if word in self.word_to_index:
            return self.word_to_index[word]
        raise IndexError("Can't find word: " + word.text)

    def get_word(self, index: int) -> StressedWord:
        """
        Get the word stored at an index.

        :param index: the index.
        :return: the word.
        """
        return self.index_to_word[index]

    def size(self):
        """
        :return: the number of indexed words.
        """
        return len(self.index_to_word)
100 |
101 |
def inflate_stress_vocabulary(vocabulary: Vocabulary, stress_predictor: StressPredictor):
    """
    Build a StressVocabulary from an AllenNLP token vocabulary by predicting
    the stresses of every token, preserving each token's index.
    """
    stress_vocabulary = StressVocabulary()
    index_to_token = vocabulary.get_index_to_token_vocabulary("tokens")
    for index, token in index_to_token.items():
        predicted = {Stress(position, Stress.Type.PRIMARY)
                     for position in stress_predictor.predict(token)}
        stress_vocabulary.add_word(StressedWord(token, predicted), index)
    return stress_vocabulary
109 |
--------------------------------------------------------------------------------
/rupo/files/writer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Запись в файлы разных расширений.
4 |
5 | import os
6 | from typing import List
7 |
8 | from rupo.files.reader import RAW_SEPARATOR
9 | from rupo.main.markup import Markup
10 | from rupo.files.reader import FileType
11 |
12 |
class Writer(object):
    """
    Writing markups to files of different formats (XML / JSON / RAW).
    """
    def __init__(self, destination_type: FileType, path: str) -> None:
        """
        Streaming writer: use it when markups are written one at a time
        (saves memory compared to the static write_markups).

        :param destination_type: type of the destination file.
        :param path: path to the destination file.
        """
        self.type = destination_type
        self.path = path
        self.file = None
        # Start from an empty file; it is fine if it does not exist yet.
        try:
            os.remove(path)
        except OSError:
            pass

    def open(self) -> None:
        """
        Open the file; must be called before any write_markup call.
        """
        self.file = open(self.path, "w", encoding="utf-8")
        if self.type == FileType.XML:
            self.file.write('')

    def write_markup(self, markup: Markup) -> None:
        """
        Write a single markup into the already opened file.

        :param markup: the markup to write.
        """
        assert self.file is not None
        if self.type == FileType.XML:
            # NOTE(review): replace(b'', b'') is a no-op; the original
            # search pattern appears to have been lost — confirm against VCS
            # history before cleaning this up.
            xml = markup.to_xml().encode('utf-8')\
                .replace(b'', b'').decode('utf-8')
            self.file.write(xml)
        elif self.type == FileType.RAW:
            Writer.__write_markup_raw(markup, self.file)

    def close(self) -> None:
        """
        Close the file.
        """
        if self.type == FileType.XML:
            self.file.write('')
        self.file.close()

    @staticmethod
    def write_markups(destination_type: FileType, markups: List[Markup], path: str) -> None:
        """
        Write a collection of markups to a file.

        :param destination_type: type of the destination file.
        :param markups: markups to write.
        :param path: path to the destination file.
        """
        with open(path, "w", encoding="utf-8") as file:
            if destination_type == FileType.XML:
                file.write('')
                for markup in markups:
                    xml = markup.to_xml().encode('utf-8')\
                        .replace(b'', b'').decode('utf-8')
                    file.write(xml)
                    file.write("\n")
                file.write('')
            elif destination_type == FileType.JSON:
                # Emit separators between items instead of writing a trailing
                # comma and truncating it afterwards: the old seek/truncate
                # trick produced invalid output ("]") for an empty sequence
                # and relied on byte offsets in a text-mode file, where tell()
                # returns an opaque cookie rather than a byte count.
                file.write("[")
                for i, markup in enumerate(markups):
                    if i > 0:
                        file.write(",")
                    file.write(markup.to_json())
                file.write("]")
            elif destination_type == FileType.RAW:
                for markup in markups:
                    Writer.__write_markup_raw(markup, file)

    @staticmethod
    def __write_markup_raw(markup: Markup, file) -> None:
        """
        Write a markup in plain-text form (word + stress).

        :param markup: the markup to write.
        :param file: an already opened file object.
        """
        lines = []
        for line in markup.lines:
            lines.append(" ".join([word.get_short() for word in line.words]))
        file.write("\n".join(lines))
        file.write(RAW_SEPARATOR)
105 |
--------------------------------------------------------------------------------
/rupo/generate/generator.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Модуль создания стихотворений.
4 |
5 | from typing import Optional
6 |
7 | from allennlp.data.vocabulary import Vocabulary
8 | from rulm.language_model import LanguageModel
9 |
10 | from rupo.main.vocabulary import StressVocabulary
11 | from rupo.generate.transforms import PoemTransform
12 |
13 |
class Generator(object):
    """
    Poem generator: samples text from a language model while a PoemTransform
    constrains the output to a metre pattern and a rhyme scheme.
    """
    def __init__(self,
                 model: LanguageModel,
                 token_vocabulary: Vocabulary,
                 stress_vocabulary: StressVocabulary,
                 eos_index: int):
        # Language model used for generation; its output is reversed word
        # order (see generate_poem, which un-reverses the result).
        self.model = model  # type: LanguageModel
        self.token_vocabulary = token_vocabulary  # type: Vocabulary
        # Maps the same token indices to stressed words.
        self.stress_vocabulary = stress_vocabulary  # type: StressVocabulary
        # Index of the end-of-sequence token in the model's vocabulary.
        self.eos_index = eos_index

    def generate_poem(self,
                      metre_schema: str="+-",
                      rhyme_pattern: str="aabb",
                      n_syllables: int=8,
                      letters_to_rhymes: dict=None,
                      beam_width: int=None,
                      sampling_k: int=None,
                      rhyme_score_border: int=4,
                      temperature: float=1.0,
                      seed: int=1337,
                      last_text: str="") -> Optional[str]:
        """
        Generate one poem obeying the given metre and rhyme scheme.

        :param metre_schema: feet pattern, e.g. "+-" (trochee) or "-+" (iambos).
        :param rhyme_pattern: rhyme scheme, e.g. "aabb".
        :param n_syllables: number of syllables per line.
        :param letters_to_rhymes: optional mapping from rhyme-pattern letters
            to words that must occupy those rhyme positions.
        :param beam_width: beam search width (beam-search mode).
        :param sampling_k: top-k for sampling (sampling mode); one of
            beam_width / sampling_k must be set.
        :param rhyme_score_border: rhyme strictness threshold.
        :param temperature: softmax temperature for decoding.
        :param seed: random seed for the model.
        :param last_text: optional custom last line of the poem.
        :return: the generated poem text.
        """
        assert beam_width or sampling_k, "Set sampling_k or beam_width"
        self.model.set_seed(seed)

        poem_transform = PoemTransform(
            stress_vocabulary=self.stress_vocabulary,
            metre_pattern=metre_schema,
            rhyme_pattern=rhyme_pattern,
            n_syllables=n_syllables,
            eos_index=self.eos_index,
            letters_to_rhymes=letters_to_rhymes,
            score_border=rhyme_score_border
        )

        if last_text:
            # Generation runs right-to-left, so the custom last line is fed
            # to the model reversed, and the transform's counters must be
            # advanced past the syllables/rhyme slot it already occupies.
            words = last_text.lower().split(" ")
            last_text = " ".join(words[::-1])
            filled_syllables = 0
            for word in last_text.split():
                index = self.token_vocabulary.get_token_index(word)
                word = self.stress_vocabulary.get_word(index)
                syllables_count = len(word.syllables)
                filled_syllables += syllables_count
            poem_transform.stress_position -= filled_syllables
            poem_transform.rhyme_position -= 1
            # The final word of last_text fixes the rhyme for its letter.
            last_index = self.token_vocabulary.get_token_index(words[-1])
            last_word = self.stress_vocabulary.get_word(last_index)
            poem_transform.letters_to_rhymes[rhyme_pattern[-1]].add(last_word)

        self.model.transforms.append(poem_transform)

        try:
            if beam_width:
                poem = self.model.beam_decoding(last_text, beam_width=beam_width, temperature=temperature)
            elif sampling_k:
                poem = self.model.sample_decoding(last_text, k=sampling_k, temperature=temperature)
            else:
                assert False
        except Exception as e:
            # Always restore the model's transform stack, even on failure.
            self.model.transforms.pop()
            raise e

        self.model.transforms.pop()

        # The model produced words in reverse; restore the order and insert a
        # line break after every n_syllables syllables.
        words = poem.split(" ")
        words = words[::-1]
        result_words = []
        current_n_syllables = 0
        for word in words:
            result_words.append(word)
            index = self.token_vocabulary.get_token_index(word)
            word = self.stress_vocabulary.get_word(index)
            syllables_count = len(word.syllables)
            current_n_syllables += syllables_count
            if n_syllables == current_n_syllables:
                current_n_syllables = 0
                result_words.append("\n")
        poem = " ".join(result_words)
        poem = "\n".join([line.strip() for line in poem.split("\n")])
        return poem
98 |
99 |
--------------------------------------------------------------------------------
/rupo/dict/cmu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Конвертер CMU словаря.
4 |
5 |
class CMUDict:
    """
    Converters for the CMU pronouncing dictionary (ARPAbet to IPA).
    """

    aprabet2ipa = {
        "AO": "ɔ",
        "AA": "ɑ",
        "IY": "i",
        "UW": "u",
        "EH": "ɛ",
        "IH": "ɪ",
        "UH": "ʊ",
        "AH": "ʌ",
        "AX": "ə",
        "AE": "æ",
        "EY": "eɪ",
        "AY": "aɪ",
        "OW": "oʊ",
        "AW": "aʊ",
        "OY": "ɔɪ",
        "ER": "ɝ",
        "AXR": "ɚ",
        "P": "p",
        "B": "b",
        "T": "t",
        "D": "d",
        "K": "k",
        "G": "ɡ",
        "CH": "ʦ",
        "JH": "ʤ",
        "F": "f",
        "V": "v",
        "TH": "θ",
        "DH": "ð",
        "S": "s",
        "Z": "z",
        "SH": "ʃ",
        "ZH": "ʒ",
        "HH": "h",
        "M": "m",
        "EM": "m",
        "N": "n",
        "EN": "n",
        "NG": "ŋ",
        "ENG": "ŋ",
        "L": "ɫ",
        "EL": "ɫ",
        "R": "r",
        "DX": "ɾ",
        "NX": "ɾ",
        "Y": "j",
        "W": "w",
        "Q": "ʔ"
    }

    # ARPAbet vowels that transcribe to two IPA characters.
    diphtongs = ["EY", "AY", "OW", "AW", "OY"]

    @staticmethod
    def convert_to_g2p_only(source_file, destination_file):
        """
        Dump grapheme<TAB>IPA pairs, dropping stress digits and variant
        entries (those with parentheses in the grapheme).
        """
        pairs = []
        with open(source_file, 'r', encoding="utf-8", errors="ignore") as reader:
            for line in reader:
                grapheme = line.split(" ")[0].lower()
                # Skip comment/symbol lines and alternate pronunciations.
                if not ("a" <= grapheme[0] <= "z") or "(" in grapheme:
                    continue
                # NOTE(review): split(" ")[1] keeps only the single token right
                # after the word — presumably the input file's phoneme field
                # matches this; confirm against the actual dictionary format.
                phonemes = line.split(" ")[1].strip().split(" ")
                for i, phoneme in enumerate(phonemes):
                    if not ("A" <= phoneme[-1] <= "Z"):
                        # Trailing character is a stress digit: drop it.
                        phonemes[i] = phoneme[:-1]
                ipa = "".join(CMUDict.aprabet2ipa[phoneme] for phoneme in phonemes)
                pairs.append((grapheme, ipa))
        with open(destination_file, 'w', encoding="utf-8") as writer:
            writer.writelines(g + "\t" + p + "\n" for g, p in pairs)

    @staticmethod
    def convert_to_phoneme_stress(source_file, destination_file):
        """
        Dump IPA transcription plus primary/secondary stress indices,
        counting each diphthong as two sound positions.
        """
        records = []
        with open(source_file, 'r', encoding="utf-8", errors="ignore") as reader:
            for line in reader:
                grapheme = line.split(" ")[0].lower()
                if not ("a" <= grapheme[0] <= "z"):
                    continue
                transcription = line.split(" ")[1].strip()
                # Keep the first two alternate pronunciations, drop the rest.
                grapheme = grapheme.replace("(1)", "").replace("(2)", "")
                if "(" in grapheme:
                    continue

                phonemes = transcription.split(" ")
                primary = []
                secondary = []
                shift = 0  # extra offset: every diphthong occupies two sounds
                for i, phoneme in enumerate(phonemes):
                    marker = phoneme[-1]
                    if not ("A" <= marker <= "Z"):
                        # Stress digit found: record position, then strip it.
                        if int(marker) == 1:
                            primary.append(str(i + shift))
                        if int(marker) == 2:
                            secondary.append(str(i + shift))
                        phonemes[i] = phoneme[:-1]
                        if phonemes[i] in CMUDict.diphtongs:
                            shift += 1
                ipa = "".join(CMUDict.aprabet2ipa[phoneme] for phoneme in phonemes)
                records.append((ipa, primary, secondary))
        with open(destination_file, 'w', encoding="utf-8") as writer:
            for ipa, primary, secondary in records:
                writer.write(ipa + "\t" + ",".join(primary) + "\t" + ",".join(secondary) + "\n")
--------------------------------------------------------------------------------
/rupo/rhymes/rhymes.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Класс рифм.
4 |
5 | from rupo.stress.word import StressedWord
6 | from rupo.util.preprocess import VOWELS
7 |
8 |
class RhymeProfile:
    """
    Set of features describing a word's ending, used to compare two words
    for rhyming.
    """
    def __init__(self, syllable_count: int, stressed_syllable_number: int,
                 stressed_syllable_text: str, next_syllable_text: str, next_char: str):
        # Total number of syllables in the word.
        self.syllable_count = syllable_count
        # Position of the stressed syllable, negative, counted from the end.
        self.stressed_syllable_number = stressed_syllable_number
        # Text of the stressed syllable.
        self.stressed_syllable_text = stressed_syllable_text
        # Text of the syllable right after the stressed one ("" if none).
        self.next_syllable_text = next_syllable_text
        # Character right after the stressed vowel ("" if none).
        self.next_char = next_char

    def __str__(self):
        return "Syllable count: {}; Stressed syllable: {}; " \
               "Stressed syllable text: {}; Next syllable: {}; " \
               "Next char: {}".format(self.syllable_count, self.stressed_syllable_number,
                                      self.stressed_syllable_text, self.next_syllable_text, self.next_char)

    def __repr__(self):
        return str(self)
26 |
27 |
class Rhymes(object):
    """
    Rhyme detection.
    """

    @staticmethod
    def is_rhyme(word1: StressedWord,
                 word2: StressedWord,
                 score_border: int=4,
                 syllable_number_border: int=4) -> bool:
        """
        Check whether two (already stressed) words rhyme.

        :param word1: first word to check, already stressed (Word).
        :param word2: second word to check, already stressed (Word).
        :param score_border: rhyme detection threshold; higher means stricter.
        :param syllable_number_border: limit on the stressed syllable's
            position counted from the end of the word.
        :return result: True if the words rhyme.
        """
        profile1 = Rhymes.__get_rhyme_profile(word1)
        profile2 = Rhymes.__get_rhyme_profile(word2)
        score = 0
        # Score character overlap of the stressed syllables; vowels weigh more.
        for i, ch1 in enumerate(profile1.stressed_syllable_text):
            for j, ch2 in enumerate(profile2.stressed_syllable_text[i:]):
                if ch1 != ch2:
                    continue
                if ch1 in VOWELS:
                    score += 3
                else:
                    score += 1
        # Bonus for a matching post-stress syllable, or at least a matching
        # character right after the stressed vowel.
        if profile1.next_syllable_text == profile2.next_syllable_text and profile1.next_syllable_text != '':
            score += 3
        elif profile1.next_char == profile2.next_char and profile1.next_char != '':
            score += 1
        return (profile1.stressed_syllable_number == profile2.stressed_syllable_number and
                profile1.syllable_count == profile2.syllable_count and
                profile1.stressed_syllable_number <= syllable_number_border and
                score >= score_border)

    @staticmethod
    def __get_rhyme_profile(word: StressedWord) -> 'RhymeProfile':
        """
        Build a rhyme profile (feature set used for comparison).

        :param word: an already stressed word (Word).
        :return profile: the rhyme profile.
        """
        # TODO: switch to a phonetic word, more features.

        profile = RhymeProfile(syllable_count=0,
                               stressed_syllable_number=-1,
                               stressed_syllable_text="",
                               next_syllable_text="",
                               next_char="")
        syllables = list(word.syllables)
        profile.syllable_count = len(syllables)
        # Walk from the end of the word to the last stressed syllable.
        for i, syllable in enumerate(reversed(syllables)):
            if syllable.stress == -1:
                continue
            profile.stressed_syllable_text = syllable.text
            profile.stressed_syllable_number = -i-1
            if i != 0:
                # Bug fix: this used to assign to a non-existent attribute
                # "next_syllable", leaving next_syllable_text always empty and
                # disabling the +3 bonus in is_rhyme.
                profile.next_syllable_text = syllables[-i].text
            if syllable.stress + 1 < len(word.text):
                profile.next_char = word.text[syllable.stress + 1]
            break
        return profile
95 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python library for analysis and generation of poems in Russian #
2 |
3 | [](https://pypi.python.org/pypi/rupo)
4 | [](https://pypi.python.org/pypi/rupo)
5 | [](https://travis-ci.org/IlyaGusev/rupo)
6 | [](https://codeclimate.com/github/IlyaGusev/rupo)
7 | [](http://rupo.readthedocs.io/en/latest/?badge=latest)
8 |
9 | ### Install ###
10 | Warning: Python 3.9+ is not supported! Use Python 3.8.
11 |
12 | ```
13 | git clone https://github.com/IlyaGusev/rupo
14 | cd rupo
15 | pip install -r requirements.txt
16 | sh download.sh
17 | ```
18 |
19 | ### Example ###
20 | https://colab.research.google.com/drive/1WBl9erJvC9Oc9PjCD8JyC_40TDUqahCx
21 |
22 | ### Usage manual ###
23 | #### Analysis ####
24 | ```
25 | >>> from rupo.api import Engine
26 | >>> engine = Engine(language="ru")
27 | >>> engine.load(<stress model path>, <zaliznyak dict path>)
28 | >>> engine.get_stresses("корова")
29 | [3]
30 |
31 | >>> engine.get_word_syllables("корова")
32 | ["ко", "ро", "ва"]
33 |
34 | >>> engine.is_rhyme("корова", "здорова")
35 | True
36 |
37 | >>> text = "Горит восток зарёю новой.\nУж на равнине, по холмам\nГрохочут пушки. Дым багровый\nКругами всходит к небесам."
38 | >>> engine.classify_metre(text)
39 | iambos
40 | ```
41 |
42 | #### Generation ####
43 | Script for poem generation. It can work in two different modes: sampling or beam search.
44 |
45 | ```
46 | python generate_poem.py
47 | ```
48 |
49 | | Argument | Default | Description |
50 | |:--------------------|:--------|:-------------------------------------------|
51 | | --metre-schema | +- | feet type: -+ (iambos), +- (trochee), ... |
52 | | --rhyme-pattern | abab | rhyme pattern |
53 | | --n-syllables | 8 | number of syllables in line |
54 | | --sampling-k | 50000 | top-k words to sample from (sampling mode) |
55 | | --beam-width | None | width of beam search (beam search mode) |
56 | | --temperature | 1.0 | sampling softmax temperature |
57 | | --last-text | None | custom last line |
58 | | --count | 100 | count of poems to generate |
59 | | --model-path | None | optional path to generator model directory |
60 | | --token-vocab-path | None | optional path to vocabulary |
61 | | --stress-vocab-path | None | optional path to stress vocabulary |
62 |
63 | ### Models ###
64 | * Generator: https://www.dropbox.com/s/dwkui2xqivzsyw5/generator_model.zip
65 | * Stress predictor: https://www.dropbox.com/s/i9tarc8pum4e40p/stress_models_14_05_17.zip
66 | * G2P: https://www.dropbox.com/s/7rk135fzd3i8kfw/g2p_models.zip
67 | * Dictionaries: https://www.dropbox.com/s/znqlrb1xblh3amo/dict.zip
68 |
69 | ### Литература ###
70 | * Брейдо, 1996, [Автоматический анализ метрики русского стиха](http://search.rsl.ru/ru/record/01000000124)
71 | * Каганов, 1996, [Лингвистическое конструирование в системах искусственного интеллекта](http://lleo.me/soft/text_dip.htm)
72 | * Козьмин, 2006, [Автоматический анализ стиха в системе Starling](http://www.dialog-21.ru/digests/dialog2006/materials/html/Kozmin.htm)
73 | * Гришина, 2008, [Поэтический корпус в рамках НКРЯ: общая структура и перспективы использования](http://ruscorpora.ru/sbornik2008/05.pdf)
74 | * Пильщиков, Старостин, 2012, [Автоматическое распознавание метра: проблемы и решения](http://www.academia.edu/11465228/%D0%90%D0%B2%D1%82%D0%BE%D0%BC%D0%B0%D1%82%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%BE%D0%B5_%D1%80%D0%B0%D1%81%D0%BF%D0%BE%D0%B7%D0%BD%D0%B0%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_%D0%BC%D0%B5%D1%82%D1%80%D0%B0_%D0%BF%D1%80%D0%BE%D0%B1%D0%BB%D0%B5%D0%BC%D1%8B_%D0%B8_%D1%80%D0%B5%D1%88%D0%B5%D0%BD%D0%B8%D1%8F)
75 | * Барахнин, 2015, [Алгоритмы комплексного анализа русских поэтических текстов с целью автоматизации процесса создания метрических справочников и конкордансов](http://ceur-ws.org/Vol-1536/paper21.pdf), [сама система](http://poem.ict.nsc.ru/)
76 |
--------------------------------------------------------------------------------
/rupo/generate/transforms.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | import numpy as np
3 | from collections import defaultdict
4 |
5 | from rulm.transform import Transform
6 |
7 | from rupo.rhymes.rhymes import Rhymes
8 | from rupo.main.vocabulary import StressVocabulary
9 | from rupo.stress.word import StressedWord
10 |
11 |
class PoemTransform(Transform):
    """
    Transform that masks language-model probabilities so that generated
    words fit a metre pattern and a rhyme scheme.
    """
    def __init__(self,
                 stress_vocabulary: StressVocabulary,
                 metre_pattern: str,
                 rhyme_pattern: str,
                 n_syllables: int,
                 eos_index: int,
                 letters_to_rhymes: dict=None,
                 score_border=4):
        """
        :param stress_vocabulary: index -> StressedWord mapping aligned with
            the model's token indices.
        :param metre_pattern: base feet pattern, e.g. "+-".
        :param rhyme_pattern: rhyme scheme, e.g. "abab".
        :param n_syllables: number of syllables per line.
        :param eos_index: index of the end-of-sequence token.
        :param letters_to_rhymes: optional pre-seeded rhyme words per letter.
        :param score_border: rhyme strictness threshold (see Rhymes.is_rhyme).
        """
        self.stress_vocabulary = stress_vocabulary

        self.n_syllables = n_syllables

        # Repeat the base pattern enough times to cover one whole line.
        mul = n_syllables // len(metre_pattern)
        if n_syllables % len(metre_pattern) != 0:
            mul += 1

        self.metre_pattern = metre_pattern * mul
        # Positions count DOWN: generation proceeds from the end of the line
        # (and from the last line of the poem) backwards.
        self.stress_position = len(self.metre_pattern) - 1
        self.eos_index = eos_index

        self.rhyme_pattern = rhyme_pattern
        self.rhyme_position = len(self.rhyme_pattern) - 1
        self.score_border = score_border

        # Words already fixed for each rhyme letter.
        self.letters_to_rhymes = defaultdict(set)
        if letters_to_rhymes is not None:
            for letter, words in letters_to_rhymes.items():
                for word in words:
                    self.letters_to_rhymes[letter].add(word)

    def __call__(self, probabilities: np.array) -> np.array:
        # All lines consumed (rhyme pattern exhausted and the syllable cursor
        # is back at a line boundary): force the end-of-sequence token.
        if self.rhyme_position < 0 and self.stress_position == len(self.metre_pattern) - 1:
            probabilities = np.zeros(probabilities.shape, dtype="float")
            probabilities[self.eos_index] = 1.
            return probabilities

        # Zero out every word that breaks the metre, and — at a line boundary —
        # every word that breaks the rhyme constraint.
        for index in range(probabilities.shape[0]):
            word = self.stress_vocabulary.get_word(index)
            is_good_by_stress = self._filter_word_by_stress(word)
            is_good_by_rhyme = True
            if self.stress_position == len(self.metre_pattern) - 1:
                is_good_by_rhyme = self._filter_word_by_rhyme(word)
            if not is_good_by_stress or not is_good_by_rhyme:
                probabilities[index] = 0.

        assert np.sum(probabilities > 0) != 0, "Poem transform filtered out all words"
        return probabilities

    def advance(self, index: int):
        # Consume the chosen word: update the syllable cursor and, at a line
        # boundary, remember the word as the rhyme for the current letter.
        word = self.stress_vocabulary.get_word(index)
        syllables_count = len(word.syllables)

        if self.stress_position == len(self.metre_pattern) - 1:
            letter = self.rhyme_pattern[self.rhyme_position]
            self.letters_to_rhymes[letter].add(word)
            self.rhyme_position -= 1

        self.stress_position -= syllables_count

        # Line finished: reset the cursor to the end of the pattern.
        if self.stress_position < 0:
            self.stress_position = len(self.metre_pattern) - 1

    def _filter_word_by_stress(self, word: StressedWord) -> bool:
        """
        Check that the word's syllables can occupy the current pattern slots.

        :param word: candidate word.
        :return: True if the word fits the metre at the current position.
        """
        syllables = word.syllables
        syllables_count = len(syllables)
        if syllables_count == 0:
            return False
        # The word would overflow past the beginning of the line.
        if self.stress_position - syllables_count < -1:
            return False
        for i in range(syllables_count):
            syllable = syllables[i]
            syllable_number = self.stress_position - syllables_count + i + 1
            # An unstressed syllable on a strong ("+") slot is rejected only if
            # some other syllable of the same word is stressed on a weak slot.
            if syllables_count >= 2 and syllable.stress == -1 and self.metre_pattern[syllable_number] == "+":
                for j in range(syllables_count):
                    other_syllable = syllables[j]
                    other_syllable_number = other_syllable.number - syllable.number + syllable_number
                    if i != j and other_syllable.stress != -1 and self.metre_pattern[other_syllable_number] == "-":
                        return False
        return True

    def _filter_word_by_rhyme(self, word: StressedWord) -> bool:
        """
        Check the word against the rhyme fixed for the current line's letter.

        :param word: candidate word.
        :return: True if no rhyme is fixed yet, or the word rhymes with the
            fixed one (and is not the same word).
        """
        # Monosyllabic words are not allowed in a rhyme position.
        if len(word.syllables) <= 1:
            return False
        rhyming_words = self.letters_to_rhymes[self.rhyme_pattern[self.rhyme_position]]
        if len(rhyming_words) == 0:
            return True
        first_word = list(rhyming_words)[0]

        is_rhyme = Rhymes.is_rhyme(first_word, word,
                                   score_border=self.score_border,
                                   syllable_number_border=2) and first_word.text != word.text
        return is_rhyme

    def __copy__(self):
        # Duplicate positional state and (deep-copied) rhyme sets — presumably
        # used when the decoder forks hypotheses; confirm against rulm.
        obj = type(self)(self.stress_vocabulary, self.metre_pattern, self.rhyme_pattern, self.n_syllables,
                         self.eos_index, self.letters_to_rhymes, self.score_border)
        obj.stress_position = self.stress_position
        obj.rhyme_position = self.rhyme_position
        obj.letters_to_rhymes = deepcopy(self.letters_to_rhymes)
        return obj
116 |
--------------------------------------------------------------------------------
/rupo/stress/dict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Класс для удобной работы со словарём ударений.
4 |
5 | import pygtrie
6 | import os
7 | import pickle
8 | from typing import List, Dict, ItemsView, Set
9 |
10 | from rupo.dict.cmu import CMUDict
11 | from rupo.settings import RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH, \
12 | EN_PHONEME_STRESS_PATH, EN_PHONEME_STRESS_TRIE_PATH, ZALYZNYAK_DICT, CMU_DICT
13 |
14 | from rupo.stress.word import Stress
15 |
16 |
class StressDict:
    """
    Stress dictionary stored as a prefix trie so it can be serialized once
    and loaded back into memory quickly.
    """

    class Mode:
        # NOTE(review): GRAPHEMES and PHONEMES are both 0, so the two modes
        # compare equal; the branches in __init__ are effectively
        # distinguished by `language` alone. PHONEMES was presumably meant to
        # be 1 — confirm before changing, since callers may rely on the
        # current defaults (e.g. language="en" with the default mode).
        GRAPHEMES = 0
        PHONEMES = 0

    def __init__(self, language: str="ru", mode: Mode=Mode.GRAPHEMES, raw_dict_path=None, trie_path=None,
                 zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT) -> None:
        """
        :param language: dictionary language ("ru" or "en").
        :param mode: grapheme- or phoneme-based dictionary.
        :param raw_dict_path: path to the raw dictionary file (defaults depend on language/mode).
        :param trie_path: path to the serialized trie dump (defaults depend on language/mode).
        :param zalyzniak_dict: path to the Zaliznyak source dictionary.
        :param cmu_dict: path to the CMU source dictionary.
        :raises FileNotFoundError: if the raw dictionary file cannot be produced.
        """
        self.data = pygtrie.Trie()  # type: Dict[str, Set[Stress]]
        self.raw_dict_path = raw_dict_path
        self.trie_path = trie_path
        if language == "ru" and mode == self.Mode.GRAPHEMES:
            self.__init_defaults(RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH)
            if not os.path.exists(self.raw_dict_path):
                # Imported here rather than at module level — presumably to
                # avoid a circular import; confirm.
                from rupo.dict.zaliznyak import ZalyzniakDict
                ZalyzniakDict.convert_to_accent_only(zalyzniak_dict, self.raw_dict_path)
        elif mode == self.Mode.PHONEMES and language == "en":
            self.__init_defaults(EN_PHONEME_STRESS_PATH, EN_PHONEME_STRESS_TRIE_PATH)
            if not os.path.exists(self.raw_dict_path):
                CMUDict.convert_to_phoneme_stress(cmu_dict, self.raw_dict_path)
        else:
            assert False
        if not os.path.isfile(self.raw_dict_path):
            raise FileNotFoundError("Dictionary raw file not found.")
        # Prefer the pre-built trie dump; otherwise build it from the raw file.
        if os.path.isfile(self.trie_path):
            self.load(self.trie_path)
        else:
            self.create(self.raw_dict_path, self.trie_path)

    def __init_defaults(self, raw_dict_path, trie_path):
        # Fill in default paths only where the caller did not supply one.
        if self.raw_dict_path is None:
            self.raw_dict_path = raw_dict_path
        if self.trie_path is None:
            self.trie_path = trie_path

    def create(self, src_filename: str, dst_filename: str) -> None:
        """
        Build the trie from the raw dictionary file, then save a dump.

        :param src_filename: file with the original dictionary
            (word<TAB>primary positions<TAB>secondary positions).
        :param dst_filename: file the dump will be saved to.
        """
        with open(src_filename, 'r', encoding='utf-8') as f:
            for line in f:
                word, primary, secondary = line.split("\t")
                stresses = [Stress(int(a), Stress.Type.PRIMARY) for a in primary.strip().split(",")]
                if secondary.strip() != "":
                    stresses += [Stress(int(a), Stress.Type.SECONDARY) for a in secondary.strip().split(",")]
                self.update(word, stresses)
        self.save(dst_filename)

    def save(self, dst_filename: str) -> None:
        """
        Save the trie dump.

        :param dst_filename: file the dump is written to.
        """
        with open(dst_filename, "wb") as f:
            pickle.dump(self.data, f, pickle.HIGHEST_PROTOCOL)

    def load(self, dump_filename: str) -> None:
        """
        Load a previously saved trie dump.

        :param dump_filename: file to load the dump from.
        """
        with open(dump_filename, "rb") as f:
            self.data = pickle.load(f)

    def get_stresses(self, word: str, stress_type: Stress.Type=Stress.Type.ANY) -> List[int]:
        """
        Get stresses of the requested type for a word.

        :param word: word to look up in the dictionary.
        :param stress_type: type of stress (primary/secondary/any).
        :return forms: list of stress positions; empty if the word is unknown.
        """
        if word in self.data:
            if stress_type == Stress.Type.ANY:
                return [stress.position for stress in self.data[word]]
            else:
                return [stress.position for stress in self.data[word] if stress.type == stress_type]
        return []

    def get_all(self) -> ItemsView[str, Set[Stress]]:
        """
        :return items: all keys and stresses of the dictionary.
        """
        return self.data.items()

    def update(self, word: str, stresses: List[Stress]) -> None:
        """
        Add stresses for a word, merging with any already stored.

        :param word: the word.
        :param stresses: stresses to add.
        """
        if word not in self.data:
            self.data[word] = set(stresses)
        else:
            self.data[word].update(stresses)

    def update_primary_only(self, word: str, stresses: List[int]) -> None:
        # Convenience wrapper: positions are interpreted as primary stresses.
        self.update(word, [Stress(stress, Stress.Type.PRIMARY) for stress in stresses])
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# rupo documentation build configuration file, created by
# sphinx-quickstart on Sat Mar 18 02:33:48 2017.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
# Make the repository root (and thus the rupo package) importable when
# building docs from docs/source.
sys.path.insert(0, os.path.abspath("../.."))


# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.autodoc',
              'sphinx.ext.doctest',
              'sphinx.ext.viewcode',
              'sphinx.ext.githubpages']

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'rupo'
copyright = '2017, Ilya Gusev'
author = 'Ilya Gusev'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '0.2.4'
# The full version, including alpha/beta/rc tags.
release = '0.2.4'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = 'ru'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = []

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'default'

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']


# -- Options for HTMLHelp output ------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'rupodoc'


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'rupo.tex', 'rupo Documentation',
     'Ilya Gusev', 'manual'),
]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'rupo', 'rupo Documentation',
     [author], 1)
]


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'rupo', 'rupo Documentation',
     author, 'rupo', 'One line description of project.',
     'Miscellaneous'),
]
--------------------------------------------------------------------------------
/rupo/metre/test_pattern_analyzer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты к компилятору выражений.
4 |
5 | import unittest
6 |
7 | from rupo.metre.pattern_analyzer import PatternAnalyzer
8 |
9 |
class TestPatternAnalyzer(unittest.TestCase):
    def test_pattern_analyzer(self):
        # Each case is (pattern, metre string, expected count_errors result).
        # Cases are checked in order, failing fast like the original
        # sequence of assertEqual calls.
        cases = [
            ("(s)*", "uuu", ('sss', 0, 3, False)),
            ("(s)*", "uus", ('sss', 0, 2, False)),
            ("(s)*", "usu", ('sss', 0, 2, False)),
            ("(s)*", "uss", ('sss', 0, 1, False)),
            ("(s)*", "suu", ('sss', 0, 2, False)),
            ("(s)*", "sus", ('sss', 0, 1, False)),
            ("(s)*", "ssu", ('sss', 0, 1, False)),
            ("(s)*", "sss", ('sss', 0, 0, False)),

            ("(sus)*(u)?", "suu", ('sus', 0, 1, False)),

            ("((sus)*u)*s", "susss", ('susus', 0, 1, False)),

            ("(s((s)*u)*)*", "susss", ('susss', 0, 0, False)),
            ("(s((s)*u)*)*", "usss", ('ssss', 0, 1, False)),
            ("(s((s)*u)*)*", "suuu", ('suuu', 0, 0, False)),
            ("(s((s)*u)*)*", "suuusuuus", ('suuusuuus', 0, 0, False)),

            ("(sss((sus)*uss)*)*", "ssssussususs", ('ssssussususs', 0, 0, False)),
            ("(sss((sus)*uss)*)*", "ssssuuuss", ('ssssususs', 0, 1, False)),

            ("((s)(u)?)*", "uuuu", ('susu', 0, 2, False)),
            ("((s)(u)?)*", "uuus", ('ssus', 0, 2, False)),
            ("((s)(u)?)*", "uusu", ('susu', 0, 1, False)),
            ("((s)(u)?)*", "uuss", ('suss', 0, 1, False)),
            ("((s)(u)?)*", "usuu", ('sssu', 0, 2, False)),
            ("((s)(u)?)*", "usus", ('ssus', 0, 1, False)),
            ("((s)(u)?)*", "ussu", ('sssu', 0, 1, False)),
            ("((s)(u)?)*", "usss", ('ssss', 0, 1, False)),
            ("((s)(u)?)*", "suuu", ('susu', 0, 1, False)),
            ("((s)(u)?)*", "suus", ('ssus', 0, 1, False)),
            ("((s)(u)?)*", "susu", ('susu', 0, 0, False)),
            ("((s)(u)?)*", "suss", ('suss', 0, 0, False)),
            ("((s)(u)?)*", "ssuu", ('sssu', 0, 1, False)),
            ("((s)(u)?)*", "ssus", ('ssus', 0, 0, False)),
            ("((s)(u)?)*", "sssu", ('sssu', 0, 0, False)),
            ("((s)(u)?)*", "ssss", ('ssss', 0, 0, False)),

            ("(s)?(u)?(S)?", "su", ('su', 0, 0, False)),
            ("(s)?(u)?(S)?", "ss", ('su', 0, 1, False)),
            ("(s)?(u)?(S)?", "uS", ('uS', 0, 0, False)),
            ("(s)?(u)?(S)?", "sS", ('sS', 0, 0, False)),

            ("(s)?(u)(s)*", "u", ('u', 0, 0, False)),
            ("(s)?(u)(s)*", "su", ('su', 0, 0, False)),
            ("(s)?(u)(s)*", "us", ('us', 0, 0, False)),
            ("(s)?(u)(s)*", "sus", ('sus', 0, 0, False)),
            ("(s)?(u)(s)*", "uss", ('uss', 0, 0, False)),

            ("(us)*(uS)(U)?(U)?", "usuS", ('usuS', 0, 0, False)),
            ("(us)*(uS)(U)?(U)?", "uSUU", ('uSUU', 0, 0, False)),

            ("(su(u)?)*", "su", ('su', 0, 0, False)),
            ("(su(u)?)*", "suu", ('suu', 0, 0, False)),
            ("(su(u)?)*", "susu", ('susu', 0, 0, False)),
            ("(su(u)?)*", "suusuu", ('suusuu', 0, 0, False)),
            ("(su(u)?)*", "ssussu", ('suusuu', 0, 2, False)),

            ("(u)?(u)?((s)(u)?(u)?)*(S)(U)?(U)?", "sssuSU", ('sssuSU', 0, 0, False)),
            ("(u)?(u)?((s)(u)?(u)?)*(S)(U)?(U)?", "ussuSU", ('ussuSU', 0, 0, False)),
            ("(u)?(u)?((s)(u)?(u)?)*(S)(U)?(U)?", "susuuSU", ('susuuSU', 0, 0, False)),
            ("(u)?(u)?((s)(u)?(u)?)*(S)(U)?(U)?", "uusuuSU", ('uusuuSU', 0, 0, False)),
        ]
        for pattern, string, expected in cases:
            self.assertEqual(PatternAnalyzer.count_errors(pattern, string), expected)
74 |
--------------------------------------------------------------------------------
/rupo/main/tokenizer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Модуль токенизации.
4 |
5 | import re
6 | from typing import List
7 | from enum import Enum, unique
8 |
9 | from rupo.settings import HYPHEN_TOKENS
10 |
11 |
class Token:
    @unique
    class TokenType(Enum):
        """
        Kind of a token produced by the tokenizer.
        """
        UNKNOWN = -1
        WORD = 0
        PUNCTUATION = 1
        SPACE = 2
        ENDLINE = 3
        NUMBER = 4

        def __str__(self):
            return str(self.name)

        def __repr__(self):
            return self.__str__()

    def __init__(self, text: str, token_type: TokenType, begin: int, end: int):
        """
        :param text: the token's text.
        :param token_type: kind of the token.
        :param begin: start position of the token in the source text.
        :param end: end position of the token in the source text.
        """
        self.text = text
        self.token_type = token_type
        self.begin = begin
        self.end = end

    def __str__(self):
        # e.g. 'word'|WORD (0, 4)
        return "'%s'|%s (%s, %s)" % (self.text, self.token_type, self.begin, self.end)

    def __repr__(self):
        return self.__str__()

    def __eq__(self, other):
        # Positions are deliberately ignored: only content and kind matter.
        return (self.text, self.token_type) == (other.text, other.token_type)
51 |
52 |
class Tokenizer(object):
    """
    Tokenizer for Russian texts, aware of punctuation and hyphenated words.
    """
    @staticmethod
    def tokenize(text: str, remove_punct=False, remove_unknown=False, replace_numbers=False) -> List[Token]:
        """
        Tokenize a Russian text, keeping punctuation and hyphenated words.

        :param text: source text.
        :param remove_punct: drop punctuation tokens from the result.
        :param remove_unknown: drop tokens of unknown type from the result.
        :param replace_numbers: rewrite number tokens as the word "ЧИСЛО".
        :return: list of tokens.
        """
        punctuation = ".,?:;!—"
        tokens = []
        word_begin = -1
        for pos, symbol in enumerate(text):
            # Letters and hyphens accumulate into the current word span.
            if symbol.isalpha() or symbol == "-":
                if word_begin == -1:
                    word_begin = pos
                continue
            # Any other symbol closes the pending word span, if any.
            if word_begin != -1:
                tokens.append(Tokenizer.__form_token(text, word_begin, pos))
                word_begin = -1
            if symbol in punctuation:
                kind = Token.TokenType.PUNCTUATION
            elif symbol == "\n":
                kind = Token.TokenType.ENDLINE
            elif symbol == " ":
                kind = Token.TokenType.SPACE
            elif symbol.isdigit():
                kind = Token.TokenType.NUMBER
            else:
                kind = Token.TokenType.UNKNOWN
            # Runs of same-typed symbols are merged into a single token.
            if tokens and tokens[-1].token_type == kind:
                tokens[-1].text += symbol
                tokens[-1].end += 1
            else:
                tokens.append(Token(symbol, kind, pos, pos + 1))
        if word_begin != -1:
            tokens.append(Tokenizer.__form_token(text, word_begin, len(text)))
        tokens = Tokenizer.__hyphen_map(tokens)
        if remove_punct:
            tokens = [token for token in tokens if token.token_type != Token.TokenType.PUNCTUATION]
        if remove_unknown:
            tokens = [token for token in tokens if token.token_type != Token.TokenType.UNKNOWN]
        if replace_numbers:
            for token in tokens:
                if token.token_type == Token.TokenType.NUMBER:
                    token.text = "ЧИСЛО"
                    token.token_type = Token.TokenType.WORD
        return tokens

    @staticmethod
    def __form_token(text, begin, end):
        """Build a WORD token for text[begin:end]; a lone hyphen becomes punctuation."""
        word = text[begin:end]
        if word == "-":
            return Token("-", Token.TokenType.PUNCTUATION, begin, begin + 1)
        return Token(word, Token.TokenType.WORD, begin, end)

    @staticmethod
    def __hyphen_map(tokens: List[Token]) -> List[Token]:
        """
        Keep dictionary words with their hyphen, split all other hyphenated words.

        :param tokens: input tokens.
        :return: tokens after hyphen processing.
        """
        result = []
        hyphen_tokens = Tokenizer.__get_hyphen_tokens()
        for token in tokens:
            if token.token_type != Token.TokenType.WORD or "-" not in token.text:
                result.append(token)
                continue
            keep_whole = any(known in token.text or token.text in known for known in hyphen_tokens)
            if keep_whole:
                result.append(token)
                continue
            offset = token.begin
            for part in token.text.split("-"):
                result.append(Token(part, Token.TokenType.WORD, offset, offset + len(part)))
                offset += len(part) + 1
        return result

    @staticmethod
    def __get_hyphen_tokens():
        """
        :return: contents of the dictionary of hyphenated words.
        """
        with open(HYPHEN_TOKENS, "r", encoding="utf-8") as file:
            return [line.strip() for line in file]
151 |
152 |
class SentenceTokenizer(object):
    """
    Splits text into sentences at runs of spaces that follow a sentence
    terminator and precede a capitalized Cyrillic letter.
    """
    # Compiled once at class creation; the lookbehind excludes terminators
    # right after capitals (e.g. initials) from acting as sentence borders.
    SENTENCE_BORDER = re.compile(r'(?<=[^А-ЯЁ].[^А-ЯЁ][.?!;]) +(?=[А-ЯЁ])')

    @staticmethod
    def tokenize(text: str) -> List[str]:
        return SentenceTokenizer.SENTENCE_BORDER.split(text)
158 |
--------------------------------------------------------------------------------
/rupo/files/reader.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Считыватель файлов разных расширений.
4 |
5 | import os
6 | import xml.etree.ElementTree as etree
7 | import json
8 | from enum import Enum
9 | from typing import Iterator
10 |
11 | from rupo.main.markup import Markup
12 | from rupo.metre.metre_classifier import MetreClassifier
13 | from rupo.stress.predictor import StressPredictor
14 |
15 |
# Separator between entries in RAW (.txt) sources; read_texts splits on it.
RAW_SEPARATOR = "\n\n\n"
17 |
18 |
class FileType(Enum):
    """
    Supported source file type; the value is the file extension used to
    match files in Reader.get_paths.
    """
    RAW = ".txt"    # plain text, entries separated by RAW_SEPARATOR
    XML = ".xml"    # <markup>/<item> elements, parsed lazily
    JSON = ".json"  # {"items": [...]} documents
    VOCAB = ".voc"  # tab-separated "word<TAB>index" vocabulary lines
27 |
28 |
class Reader(object):
    """
    Reading markups, texts and vocabularies from files.
    """
    @staticmethod
    def read_markups(path: str, source_type: FileType, is_processed: bool,
                     stress_predictor: StressPredictor=None) -> Iterator[Markup]:
        """
        Read markups (including markup built on the fly from raw texts).

        :param path: path to a file or a directory.
        :param source_type: type of the files.
        :param is_processed: are the texts already marked up?
        :param stress_predictor: stress predictor (required for unprocessed texts).
        """
        paths = Reader.get_paths(path, source_type.value)
        for filename in paths:
            with open(filename, "r", encoding="utf-8") as file:
                if is_processed:
                    if source_type == FileType.XML:
                        for elem in Reader.__xml_iter(file, 'markup'):
                            yield Markup().from_xml(etree.tostring(elem, encoding='utf-8', method='xml'))
                    elif source_type == FileType.JSON:
                        j = json.load(file)
                        for item in j['items']:
                            yield Markup().from_dict(item)
                    elif source_type == FileType.RAW:
                        # Entries are separated by three blank lines (RAW_SEPARATOR).
                        separator_count = 0
                        text = ""
                        for line in file:
                            if line == "\n":
                                separator_count += 1
                            else:
                                text += line
                            if separator_count == 3:
                                separator_count = 0
                                if text:
                                    yield Markup().from_raw(text)
                                # BUGFIX: the accumulated text must be reset after
                                # yielding; previously every subsequent markup
                                # contained all preceding texts as well.
                                text = ""
                        if text != "":
                            yield Markup().from_raw(text)
                else:
                    assert stress_predictor is not None
                    for text in Reader.read_texts(filename, source_type):
                        yield Reader.__markup_text(text, stress_predictor)

    @staticmethod
    def read_vocabulary(path: str):
        """
        Read a vocabulary.

        :param path: path to the vocabulary file(s).
        :return: pairs of (marked-up word, its index).
        """
        paths = Reader.get_paths(path, FileType.VOCAB.value)
        for filename in paths:
            with open(filename, "r", encoding="utf-8") as file:
                for line in file:
                    fields = line.strip().split('\t')
                    yield Markup().from_raw(fields[0]).lines[0].words[0], int(fields[1])

    @staticmethod
    def read_texts(path: str, source_type: FileType) -> Iterator[str]:
        """
        Read plain texts.

        :param path: path to a file or a directory.
        :param source_type: type of the files.
        """
        paths = Reader.get_paths(path, source_type.value)
        for filename in paths:
            with open(filename, "r", encoding="utf-8") as file:
                if source_type == FileType.XML:
                    for elem in Reader.__xml_iter(file, 'item'):
                        yield elem.find(".//text").text
                elif source_type == FileType.JSON:
                    # TODO: lazy parsing
                    j = json.load(file)
                    for item in j['items']:
                        yield item['text']
                elif source_type == FileType.RAW:
                    text = file.read()
                    for t in text.split(RAW_SEPARATOR):
                        yield t

    @staticmethod
    def get_paths(path: str, ext: str) -> Iterator[str]:
        """
        Find all files with the given extension under the given path.

        :param path: path to a file or a directory.
        :param ext: required extension (including the dot).
        """
        if os.path.isfile(path):
            if ext == os.path.splitext(path)[1]:
                yield path
        else:
            # os.walk already descends into subdirectories, so no manual
            # recursion is needed. (The previous version attempted it with
            # `return Reader.get_paths(folder, ext)` inside this generator,
            # which silently terminated iteration instead of yielding, and
            # used folder names not joined with their root.)
            for root, _, files in os.walk(path):
                for file in files:
                    if ext == os.path.splitext(file)[1]:
                        yield os.path.join(root, file)

    @staticmethod
    def __markup_text(text: str, stress_predictor: StressPredictor) -> Markup:
        """
        Mark up a single text and improve the markup with the metre classifier.

        :param text: the text.
        :return: the markup.
        """
        markup = Markup.process_text(text, stress_predictor)
        markup = MetreClassifier.improve_markup(markup)[0]
        return markup

    @staticmethod
    def __xml_iter(file, tag):
        """
        :param file: an xml file.
        :param tag: the tag of interest.
        :return: all elements with the given tag, streamed via iterparse.
        """
        return (elem for event, elem in etree.iterparse(file, events=['end']) if event == 'end' and elem.tag == tag)
151 |
--------------------------------------------------------------------------------
/rupo/metre/test_metre_classifier.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Тесты к классификатору метра.
4 |
5 | import unittest
6 | import jsonpickle
7 | import copy
8 | import logging
9 | import sys
10 |
11 | from rupo.main.markup import Markup
12 | from rupo.stress.predictor import CombinedStressPredictor
13 | from rupo.metre.metre_classifier import MetreClassifier, ClassificationResult, StressCorrection
14 | from rupo.settings import RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT, CMU_DICT, RU_GRAPHEME_STRESS_PATH, \
15 | RU_GRAPHEME_STRESS_TRIE_PATH
16 |
17 |
class TestMetreClassifier(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.stress_predictor = CombinedStressPredictor(
            stress_model_path=RU_STRESS_DEFAULT_MODEL,
            zalyzniak_dict=ZALYZNYAK_DICT,
            cmu_dict=CMU_DICT,
            raw_stress_dict_path=RU_GRAPHEME_STRESS_PATH,
            stress_trie_path=RU_GRAPHEME_STRESS_TRIE_PATH
        )
        logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    @classmethod
    def tearDownClass(cls):
        del cls.stress_predictor

    def _improve(self, text):
        """Mark up the text and run metre-based improvement over the markup."""
        return MetreClassifier.improve_markup(Markup.process_text(text, self.stress_predictor))

    def test_classification_result(self):
        result = ClassificationResult(5)
        result.additions["iambos"].append(StressCorrection(0, 0, 0, "", 0))
        self.assertEqual(result, jsonpickle.decode(result.to_json()))

    def test_metre_classifier1(self):
        text = ("Горит восток зарёю новой.\n"
                "Уж на равнине, по холмам\n"
                "Грохочут пушки. Дым багровый\n"
                "Кругами всходит к небесам.")
        markup, result = self._improve(text)
        self.assertIsInstance(markup, Markup)
        self.assertIsInstance(result, ClassificationResult)
        self.assertEqual(result.metre, "iambos")

    def test_metre_classifier2(self):
        text = ("Буря мглою небо кроет,\n"
                "Вихри снежные крутя;\n"
                "То, как зверь, она завоет,\n"
                "То заплачет, как дитя...")
        markup, result = self._improve(text)
        self.assertEqual(result.metre, "choreios")

    def test_metre_classifier3(self):
        text = ("На стеклах нарастает лед,\n"
                "Часы твердят: «Не трусь!»\n"
                "Услышать, что ко мне идет,\n"
                "И мертвой я боюсь.\n"
                "Как идола, молю я дверь;\n"
                "«Не пропускай беду!»\n"
                "Кто воет за стеной, как зверь,\n"
                "Кто прячется в саду?")
        markup, result = self._improve(text)
        self.assertEqual(result.metre, "iambos")

    def test_metre_classifier4(self):
        text = ("Вот уж вечер. Роса\n"
                "Блестит на крапиве.\n"
                "Я стою у дороги,\n"
                "Прислонившись к иве.\n"
                "От луны свет большой\n"
                "Прямо на нашу крышу.\n"
                "Где-то песнь соловья\n"
                "Хорошо и тепло,\n"
                "Как зимой у печки.\n"
                "И березы стоят,\n"
                "Как большие свечки.\n"
                "И вдали за рекой,\n"
                "Видно, за опушкой,\n"
                "Сонный сторож стучит\n"
                "Мертвой колотушкой.")
        markup, result = self._improve(text)
        self.assertIn(result.metre, ("dolnik3", "dolnik2"))

    def test_metre_classifier5(self):
        text = ("Глыбу кварца разбили молотом,\n"
                "И, веселым огнем горя,\n"
                "Заблестели крупинки золота\n"
                "В свете тусклого фонаря.\n"
                "И вокруг собрались откатчики:\n"
                "Редкий случай, чтоб так, в руде!\n"
                "И от ламп заплясали зайчики,\n"
                "Отражаясь в черной воде...\n"
                "Прислонившись к мокрой стене,\n"
                "Мы стояли вокруг.\n"
                "Курили,\n"
                "Прислонившись к мокрой стене,\n"
                "И мечтательно говорили\n"
                "Не о золоте — о весне.\n"
                "И о том, что скоро, наверно,\n"
                "На заливе вспотеет лед\n"
                "И, снега огласив сиреной,\n"
                "Наконец придет пароход...\n"
                "Покурили еще немного,\n"
                "Золотинки в кисет смели\n"
                "И опять — по своим дорогам,\n"
                "К вагонеткам своим пошли.\n"
                "Что нам золото? В дни тяжелые\n"
                "Я от жадности злой не слеп.\n"
                "Самородки большие, желтые\n"
                "Отдавал за табак и хлеб.\n"
                "Не о золоте были мысли...\n"
                "В ночь таежную у костра\n"
                "Есть над чем поразмыслить в жизни,\n"
                "Кроме\n"
                "Золота-серебра.")
        markup, result = self._improve(text)
        self.assertIn(result.metre, ("dolnik3", "dolnik2"))

    def test_metre_classifier6(self):
        text = ("Лючинь печальная читала вечером ручьисто-вкрадчиво,\n"
                "Так чутко чувствуя журчащий вычурно чужой ей плач,\n"
                "И, в человечестве чтя нечто вечное, чем чушь Бокаччио,\n"
                "От чар отчаянья кручинно-скучная, чла час удач.")
        markup, result = self._improve(text)
        self.assertEqual(result.metre, "iambos")

    def test_improve(self):
        text = ("Буря мглою небо кроет,\n"
                "Вихри снежные крутя;\n"
                "То, как зверь, она завоет,\n"
                "То заплачет, как дитя...")
        initial_markup = Markup.process_text(text, self.stress_predictor)
        markup, result = MetreClassifier.improve_markup(copy.deepcopy(initial_markup))
        self.assertNotEqual(markup.lines[0].words[0].syllables[0].stress, -1)
        self.assertEqual(markup.lines[0].words[0].syllables[1].stress, -1)
140 |
141 |
142 |
143 |
144 |
--------------------------------------------------------------------------------
/rupo/data/examples/markup.xml:
--------------------------------------------------------------------------------
1 | 2- 0Забывши волнения жизни мятежной,33
- 0-10За2
- 132бы4
- 2-14вши7
0Забывши7- 0-10во2
- 142лне5
- 2-15ни7
- 3-17я8
8волнения16- 010жи2
- 1-12зни5
17жизни22- 0-10мя2
- 132те4
- 2-14жной8
23мятежной31 - 33Один жил в пустыне рыбак молодой.67
- 0-10О1
- 121дин4
33Один37- 010жил3
38жил41- 42в43
- 0-10пу2
- 142сты5
- 2-15не7
44пустыне51- 0-10ры2
- 132бак5
52рыбак57- 0-10мо2
- 1-12ло4
- 254дой7
58молодой65 - 67Однажды на скале прибрежной,96
- 0-10О1
- 131дна4
- 2-14жды7
67Однажды74- 010на2
75на77- 020ска3
- 1-13ле5
78скале83- 0-10при3
- 153бре6
- 2-16жной10
84прибрежной94 - 96Над тихой прозрачной рекой123
- 010Над3
96Над99- 010ти2
- 1-12хой5
100тихой105- 0-10про3
- 153зра6
- 2-16чной10
106прозрачной116- 0-10ре2
- 132кой5
117рекой122 - 123Он с удой беспечно142
- 000Он2
123Он125- 126с127
- 000у1
- 1-11дой4
128удой132- 0-10бе2
- 142спе5
- 2-15чно8
133беспечно141 - 142Сидел148
- 0-10Си2
- 132дел5
142Сидел147 - 148И думой сердечной166
- 000И1
148И149- 010ду2
- 1-12мой5
150думой155- 0-10сер3
- 143де5
- 2-15чной9
156сердечной165 - 166К прошедшему счастью летел.193
- 166К167
- 0-10про3
- 143ше5
- 2-15дше8
- 3-18му10
168прошедшему178- 020сча3
- 1-13стью7
179счастью186- 0-10ле2
- 132тел5
187летел192 Забывши волнения жизни мятежной,\nОдин жил в пустыне рыбак молодой.\nОднажды на скале прибрежной,\nНад тихой прозрачной рекой\nОн с удой беспечно\nСидел\nИ думой сердечной\nК прошедшему счастью летел.
--------------------------------------------------------------------------------
/rupo/api.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Набор внешних методов для работы с библиотекой.
4 |
5 | import os
6 | from typing import List, Tuple, Dict
7 |
8 | from rulm.language_model import LanguageModel
9 |
10 | from rupo.files.reader import FileType, Reader
11 | from rupo.files.writer import Writer
12 | from rupo.main.markup import Markup
13 | from rupo.metre.metre_classifier import MetreClassifier, ClassificationResult
14 | from rupo.rhymes.rhymes import Rhymes
15 | from rupo.settings import ZALYZNYAK_DICT, CMU_DICT, DATA_DIR, DICT_DIR
16 | from rupo.stress.predictor import StressPredictor, CombinedStressPredictor
17 | from rupo.main.vocabulary import StressVocabulary, inflate_stress_vocabulary
18 | from rupo.generate.generator import Generator
19 |
20 | from allennlp.data.vocabulary import Vocabulary, DEFAULT_OOV_TOKEN
21 | from allennlp.common.util import END_SYMBOL
22 | from rulm.transform import ExcludeTransform
23 | from russ.syllables import get_syllables
24 |
25 |
class Engine:
    """
    Facade over the library: stress prediction, syllable splitting, markup,
    metre classification, rhyme detection and poem generation.

    The instance is configured with one language; every internal call now
    consistently uses ``self.language`` instead of silently falling back to
    the "ru" defaults (previously ``get_generator``, ``generate_markups``,
    ``is_rhyme`` and ``get_word_rhymes`` ignored the configured language).
    """
    def __init__(self, language="ru"):
        self.language = language  # type: str
        self.vocabulary = None  # type: StressVocabulary
        self.generator = None  # type: Generator
        self.stress_predictors = dict()  # type: Dict[str, StressPredictor]

    def load(self, stress_model_path: str, zalyzniak_dict: str, raw_stress_dict_path=None,
             stress_trie_path=None):
        """
        Prepare the engine: create the data directories if needed and warm up
        the stress predictor for the engine's language.
        """
        self.stress_predictors = dict()
        if not os.path.isdir(DATA_DIR):
            os.makedirs(DATA_DIR)
        if not os.path.isdir(DICT_DIR):
            os.makedirs(DICT_DIR)
        self.get_stress_predictor(self.language, stress_model_path, raw_stress_dict_path,
                                  stress_trie_path, zalyzniak_dict)

    def get_vocabulary(self, dump_path: str, markup_path: str) -> StressVocabulary:
        """
        Lazily build the stress vocabulary, preferring a saved dump over
        re-parsing the markups.

        :param dump_path: path to a saved vocabulary dump.
        :param markup_path: path to markups (used when no dump exists).
        """
        if self.vocabulary is None:
            self.vocabulary = StressVocabulary()
            if os.path.isfile(dump_path):
                self.vocabulary.load(dump_path)
            elif markup_path is not None:
                self.vocabulary.parse(markup_path)
        return self.vocabulary

    def get_generator(self,
                      model_path: str,
                      token_vocab_path: str,
                      stress_vocab_dump_path: str) -> Generator:
        """
        Lazily build the poem generator from a trained language model,
        inflating (and caching) the stress vocabulary if needed.
        """
        if self.generator is None:
            assert os.path.isdir(model_path) and os.path.isdir(token_vocab_path)
            vocabulary = Vocabulary.from_files(token_vocab_path)
            stress_vocabulary = StressVocabulary()
            if not os.path.isfile(stress_vocab_dump_path):
                # Fixed: use the engine's configured language, not the "ru" default.
                stress_vocabulary = inflate_stress_vocabulary(vocabulary, self.get_stress_predictor(self.language))
                stress_vocabulary.save(stress_vocab_dump_path)
            else:
                stress_vocabulary.load(stress_vocab_dump_path)

            # Never sample the OOV or end-of-sequence token while generating.
            eos_index = vocabulary.get_token_index(END_SYMBOL)
            unk_index = vocabulary.get_token_index(DEFAULT_OOV_TOKEN)
            exclude_transform = ExcludeTransform((unk_index, eos_index))

            model = LanguageModel.load(model_path, vocabulary_dir=token_vocab_path,
                                       transforms=[exclude_transform, ])
            self.generator = Generator(model, vocabulary, stress_vocabulary, eos_index)
        return self.generator

    def get_stress_predictor(self, language="ru", stress_model_path: str=None, raw_stress_dict_path=None,
                             stress_trie_path=None, zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT):
        """
        Lazily build and cache a stress predictor per language.
        """
        if self.stress_predictors.get(language) is None:
            self.stress_predictors[language] = CombinedStressPredictor(language, stress_model_path,
                                                                       raw_stress_dict_path, stress_trie_path,
                                                                       zalyzniak_dict, cmu_dict)
        return self.stress_predictors[language]

    def get_stresses(self, word: str, language: str="ru") -> List[int]:
        """
        :param word: the word.
        :param language: the language.
        :return: stress positions of the word.
        """
        return self.get_stress_predictor(language).predict(word)

    @staticmethod
    def get_word_syllables(word: str) -> List[str]:
        """
        :param word: the word.
        :return: its syllables.
        """
        return [syllable.text for syllable in get_syllables(word)]

    @staticmethod
    def count_syllables(word: str) -> int:
        """
        :param word: the word.
        :return: number of syllables in it.
        """
        return len(get_syllables(word))

    def get_markup(self, text: str, language: str="ru") -> Markup:
        """
        :param text: the text.
        :param language: the language.
        :return: its dictionary-based markup.
        """
        return Markup.process_text(text, self.get_stress_predictor(language))

    def get_improved_markup(self, text: str, language: str="ru") -> Tuple[Markup, ClassificationResult]:
        """
        :param text: the text.
        :param language: the language.
        :return: its markup improved by the metre classifier.
        """
        markup = Markup.process_text(text, self.get_stress_predictor(language))
        return MetreClassifier.improve_markup(markup)

    def classify_metre(self, text: str, language: str="ru") -> str:
        """
        :param text: the text.
        :param language: the language.
        :return: its metre.
        """
        return MetreClassifier.classify_metre(Markup.process_text(text, self.get_stress_predictor(language))).metre

    def generate_markups(self, input_path: str, input_type: FileType, output_path: str, output_type: FileType) -> None:
        """
        Generate markups from texts.

        :param input_path: path to a folder/file with the texts.
        :param input_type: type of the input files.
        :param output_path: path to the resulting markup file.
        :param output_type: type of the resulting file.
        """
        # Fixed: use the engine's configured language, not the "ru" default.
        markups = Reader.read_markups(input_path, input_type, False, self.get_stress_predictor(self.language))
        writer = Writer(output_type, output_path)
        writer.open()
        for markup in markups:
            writer.write_markup(markup)
        writer.close()

    def is_rhyme(self, word1: str, word2: str) -> bool:
        """
        :param word1: first word.
        :param word2: second word.
        :return: whether the words rhyme.
        """
        # Fixed: mark up both words in the engine's configured language.
        markup_word1 = self.get_markup(word1, self.language).lines[0].words[0]
        markup_word1.set_stresses(self.get_stresses(word1, self.language))
        markup_word2 = self.get_markup(word2, self.language).lines[0].words[0]
        markup_word2.set_stresses(self.get_stresses(word2, self.language))
        return Rhymes.is_rhyme(markup_word1, markup_word2)

    def generate_poem(self,
                      model_path: str,
                      token_vocab_path: str=None,
                      stress_vocab_path: str=None,
                      metre_schema: str="-+",
                      rhyme_pattern: str="abab",
                      n_syllables: int=8,
                      sampling_k: int=None,
                      beam_width: int=None,
                      seed: int=1337,
                      temperature: float=1.0,
                      last_text: str="") -> str:
        """
        Generate a poem. Either sampling_k or beam_width must be set.

        :param model_path: path to the model.
        :param token_vocab_path: path to the token vocabulary.
        :param stress_vocab_path: path to the stress vocabulary.
        :param metre_schema: metre schema.
        :param rhyme_pattern: rhyme pattern.
        :param n_syllables: number of syllables per line.
        :param sampling_k: top-k for sampling.
        :param beam_width: beam search width.
        :param seed: random seed.
        :param temperature: generation temperature.
        :param last_text: the final line.
        :return: the poem, or None if generation failed.
        """
        token_vocab_path = token_vocab_path or os.path.join(model_path, "vocabulary")
        stress_vocab_path = stress_vocab_path or os.path.join(model_path, "stress.pickle")
        generator = self.get_generator(model_path, token_vocab_path, stress_vocab_path)
        poem = generator.generate_poem(
            metre_schema=metre_schema,
            rhyme_pattern=rhyme_pattern,
            n_syllables=n_syllables,
            sampling_k=sampling_k,
            beam_width=beam_width,
            temperature=temperature,
            seed=seed,
            last_text=last_text
        )
        return poem

    def get_word_rhymes(self, word: str, vocab_dump_path: str, markup_path: str=None) -> List[str]:
        """
        Find rhymes for the given word.

        :param word: the word.
        :param vocab_dump_path: path where the vocabulary dump is stored.
        :param markup_path: path to the markups.
        :return: list of rhymes.
        """
        # Fixed: mark up the query word in the engine's configured language.
        markup_word = self.get_markup(word, self.language).lines[0].words[0]
        markup_word.set_stresses(self.get_stresses(word, self.language))
        rhymes = []
        vocabulary = self.get_vocabulary(vocab_dump_path, markup_path)
        for i in range(vocabulary.size()):
            candidate = vocabulary.get_word(i)  # fetch once per index
            if Rhymes.is_rhyme(markup_word, candidate):
                rhymes.append(candidate.text.lower())
        return rhymes
220 |
--------------------------------------------------------------------------------
/rupo/main/markup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Модуль для описания разметки по ударениям и слогам.
4 |
5 | import json
6 | from typing import List, Set
7 | import xml.etree.ElementTree as etree
8 |
9 | from dicttoxml import dicttoxml
10 |
11 | from rupo.util.preprocess import get_first_vowel_position
12 | from rupo.util.mixins import CommonMixin
13 | from rupo.main.tokenizer import Tokenizer, Token
14 | from rupo.util.timeit import timeit
15 | from russ.syllables import get_syllables
16 |
17 |
class Annotation(CommonMixin):
    """
    Base annotation: a span in the source text (begin/end positions)
    together with the annotated text itself.
    """
    def __init__(self, begin: int, end: int, text: str) -> None:
        self.text = text
        self.begin = begin
        self.end = end
27 |
28 |
class Syllable(Annotation):
    """
    Syllable markup: the annotation plus the syllable index and its stress.
    ``stress`` is -1 when the stress falls on another syllable.
    """
    def __init__(self, begin: int, end: int, number: int, text: str, stress: int=-1) -> None:
        super().__init__(begin, end, text)
        self.number = number
        self.stress = stress

    def vowel(self) -> int:
        """
        :return: position of this syllable's vowel within the word (0-based).
        """
        return self.begin + get_first_vowel_position(self.text)

    def from_dict(self, d: dict) -> 'Syllable':
        self.__dict__.update(d)
        # Older dumps stored the stress under the key "accent".
        if "accent" in d:
            self.stress = d["accent"]
        return self
50 |
51 |
class Word(Annotation):
    """
    Word markup: the word's annotation and its syllables.
    """
    def __init__(self, begin: int, end: int, text: str, syllables: List[Syllable]) -> None:
        super().__init__(begin, end, text)
        self.syllables = syllables

    def count_stresses(self) -> int:
        """
        :return: number of stressed syllables in the word.
        """
        return sum(1 for syllable in self.syllables if syllable.stress != -1)

    def stress(self) -> int:
        """
        :return: the last stress in the word, or -1 if there is none.
        """
        for syllable in reversed(self.syllables):
            if syllable.stress != -1:
                return syllable.stress
        return -1

    def get_stressed_syllables_numbers(self) -> List[int]:
        """
        :return: indices of the syllables carrying a stress.
        """
        return [s.number for s in self.syllables if s.stress != -1]

    def get_stresses(self) -> Set[int]:
        """
        :return: all stress positions.
        """
        return {s.stress for s in self.syllables if s.stress != -1}

    def set_stresses(self, stresses: List[int]) -> None:
        """
        Set the given stresses; every other stress is cleared.

        :param stresses: stress positions within the word.
        """
        for syllable in self.syllables:
            vowel_pos = syllable.vowel()
            syllable.stress = vowel_pos if vowel_pos in stresses else -1

    def get_short(self) -> str:
        """
        :return: the word in the form "text" + "last stress".
        """
        return "{}{}".format(self.text.lower(), self.stress())

    def from_dict(self, d: dict) -> 'Word':
        self.__dict__.update(d)
        raw_syllables = d["syllables"]  # type: List[dict]
        self.syllables = [Syllable(0, 0, 0, "").from_dict(s) for s in raw_syllables]
        return self

    def to_stressed_word(self):
        from rupo.stress.word import StressedWord, Stress
        return StressedWord(self.text, {Stress(pos, Stress.Type.PRIMARY) for pos in self.get_stresses()})

    def __hash__(self) -> int:
        """
        :return: hash of the word markup.
        """
        return hash(self.get_short())
125 |
126 |
class Line(Annotation):
    """
    Line markup: the line's annotation and its words.
    """
    def __init__(self, begin: int, end: int, text: str, words: List[Word]) -> None:
        super().__init__(begin, end, text)
        self.words = words

    def from_dict(self, d) -> 'Line':
        self.__dict__.update(d)
        raw_words = d["words"]  # type: List[dict]
        self.words = [Word(0, 0, "", []).from_dict(w) for w in raw_words]
        return self

    def count_vowels(self):
        # A syllable is counted iff its text actually contains a vowel.
        return sum(
            1
            for word in self.words
            for syllable in word.syllables
            if get_first_vowel_position(syllable.text) != -1
        )
148 |
149 |
class Markup(CommonMixin):
    """
    Data class for a full markup, with XML and JSON import/export.
    """
    def __init__(self, text: str=None, lines: List[Line]=None) -> None:
        # Raw source text and its per-line markup; version guards dump compatibility.
        self.text = text
        self.lines = lines
        self.version = 2

    def to_json(self) -> str:
        """Serialize to a JSON string (non-ASCII characters kept as-is)."""
        return json.dumps(self.to_dict(), ensure_ascii=False)

    def from_json(self, st) -> 'Markup':
        """Load the markup from a JSON string."""
        d = json.loads(st)
        return self.from_dict(d)

    def from_dict(self, d) -> 'Markup':
        """Load the markup from a plain dict (as produced by to_dict)."""
        self.__dict__.update(d)
        lines = d["lines"]  # type: List[dict]
        self.lines = [Line(0, 0, "", []).from_dict(line) for line in lines]
        return self

    def to_xml(self) -> str:
        """
        Export to XML.

        :return self: XML string
        """
        # Newlines are escaped to "\\n"; from_xml reverses this.
        return dicttoxml(self.to_dict(), custom_root='markup', attr_type=False).decode('utf-8').replace("\n", "\\n")

    def from_xml(self, xml: str) -> 'Markup':
        """
        Import from XML.

        :param xml: XML markup
        :return self: the resulting Markup object
        """
        root = etree.fromstring(xml)
        # Refuse dumps produced by a different markup version.
        if root.find("version") is None or int(root.find("version").text) != self.version:
            raise TypeError("Другая версия разметки")
        lines_node = root.find("lines")
        lines = []
        for line_node in lines_node.findall("item"):
            words_node = line_node.find("words")
            words = []
            for word_node in words_node.findall("item"):
                syllables_node = word_node.find("syllables")
                syllables = []
                for syllable_node in syllables_node.findall("item"):
                    # Older dumps store the stress under "accent" instead of "stress".
                    stress_node = syllable_node.find("accent") \
                        if syllable_node.find("accent") is not None \
                        else syllable_node.find("stress")
                    stress = int(stress_node.text)
                    syllables.append(Syllable(int(syllable_node.find("begin").text),
                                              int(syllable_node.find("end").text),
                                              int(syllable_node.find("number").text),
                                              syllable_node.find("text").text,
                                              stress))
                words.append(Word(int(word_node.find("begin").text), int(word_node.find("end").text),
                                  word_node.find("text").text, syllables))
            lines.append(Line(int(line_node.find("begin").text), int(line_node.find("end").text),
                              line_node.find("text").text, words))
        # Undo the newline escaping applied by to_xml.
        self.text = root.find("text").text.replace("\\n", "\n")
        self.lines = lines
        return self

    def from_raw(self, text: str) -> 'Markup':
        """
        Import from raw text in which every word carries its stress position
        appended as a trailing integer (possibly -1).

        :param text: the text.
        :return: the markup.
        """

        pos = 0
        lines = []
        for line in text.split("\n"):
            if line == "":
                continue
            line_tokens = []
            for word in line.split(" "):
                # Peel the trailing integer off the word, scanning characters
                # from the end; digits are collected in reverse order.
                i = -1
                ch = word[i]
                stress = ""
                while ch.isdigit() or ch == "-":
                    stress += ch
                    i -= 1
                    ch = word[i]
                line_tokens.append((word[:i+1], int(stress[::-1])))
            words = []
            line_begin = pos
            for pair in line_tokens:
                token = pair[0]
                stress = pair[1]
                syllables = get_syllables(token)
                # Shift syllable offsets from word-local to text-global positions.
                for j in range(len(syllables)):
                    syllables[j].begin += pos
                    syllables[j].end += pos
                word = Word(pos, pos + len(token), token, syllables)
                word.set_stresses([stress])
                words.append(word)
                pos += len(token) + 1  # +1 for the separator after the token
            lines.append(Line(line_begin, pos, " ".join([pair[0] for pair in line_tokens]), words))
        self.text = "\n".join([line.text for line in lines])
        self.lines = lines
        return self

    @staticmethod
    @timeit
    def process_text(text: str, stress_predictor) -> 'Markup':
        """
        Build the initial syllable-and-stress markup for a text.

        :param text: the text to mark up
        :param stress_predictor: stress predictor.
        :return markup: markup with syllables and stresses
        """
        begin_line = 0
        lines = []
        words = []
        text_lines = text.split("\n")
        for text_line in text_lines:
            tokens = [token for token in Tokenizer.tokenize(text_line) if token.token_type == Token.TokenType.WORD]
            for token in tokens:
                word = Word(begin_line + token.begin, begin_line + token.end, token.text, get_syllables(token.text))
                # Predict the stresses.
                stresses = stress_predictor.predict(token.text.lower())
                # Map stresses onto syllables; single-syllable words are left unstressed here.
                if len(word.syllables) > 1:
                    word.set_stresses(stresses)
                words.append(word)
            end_line = begin_line + len(text_line)
            lines.append(Line(begin_line, end_line, text_line, words))
            words = []
            begin_line = end_line + 1  # +1 accounts for the "\n" separator
        return Markup(text, lines)
286 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/rupo/metre/pattern_analyzer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Сопоставление шаблону.
4 |
5 | from typing import List, Set, Tuple
6 |
7 |
class TreeNode:
    """
    Node of the pattern parse tree.
    """
    leaf_chars = "usUS"
    non_leaf_chars = "*?w"

    def __init__(self, parent: 'TreeNode', children: List['TreeNode'], text: str, pattern_pos: int):
        """
        :param parent: parent node.
        :param children: child nodes.
        :param text: the character this node stands for.
        :param pattern_pos: position of the character within the pattern
        """
        self.parent = parent  # type: TreeNode
        self.children = children  # type: List[TreeNode]
        self.text = text  # type: str
        self.pattern_pos = pattern_pos  # type: int

    def get_level(self) -> int:
        """
        :return: depth of this node in the tree.
        """
        level = 0
        ancestor = self.parent
        while ancestor is not None:
            level += 1
            ancestor = ancestor.parent
        return level

    def get_next_sibling(self) -> 'TreeNode':
        """
        :return: the sibling immediately to the right, or None.
        """
        siblings = self.parent.children
        position = siblings.index(self) + 1
        return siblings[position] if position < len(siblings) else None

    def get_last_child_leaf(self) -> 'TreeNode':
        """
        :return: the last child that is a leaf, or None.
        """
        return next((child for child in reversed(self.children) if child.is_leaf()), None)

    def is_first_leaf(self) -> bool:
        if not self.is_leaf():
            return False
        leaves = [child for child in self.parent.children if child.is_leaf()]
        return leaves[0] == self

    def is_last_leaf(self) -> bool:
        if not self.is_leaf():
            return False
        leaves = [child for child in self.parent.children if child.is_leaf()]
        return leaves[-1] == self

    def get_most_left_leaf(self) -> 'TreeNode':
        """
        :return: the leftmost descendant leaf.
        """
        node = self
        while node.children:
            node = node.children[0]
        assert node.is_leaf()
        return node

    def print_tree(self) -> None:
        """
        Print the subtree rooted at this node.
        """
        stack = [self]
        while stack:
            node = stack.pop()
            print("\t" * node.get_level(), node)
            stack.extend(node.children)

    def is_leaf(self) -> bool:
        """
        :return: whether this node is a leaf of the tree.
        """
        return self.text in TreeNode.leaf_chars

    def __str__(self) -> str:
        return "{} {}".format(self.text, self.pattern_pos)

    def __repr__(self) -> str:
        return self.__str__()

    def __hash__(self):
        return hash(self.pattern_pos)

    def __eq__(self, other):
        return self.pattern_pos == other.pattern_pos
105 |
106 |
class State:
    """
    A parsing state.
    """
    def __init__(self, node: TreeNode, string_pos: int, strong_errors: int, weak_errors: int, pattern: str):
        """
        :param node: tree node this state corresponds to.
        :param string_pos: position within the matched string.
        :param strong_errors: number of errors on U and S.
        :param weak_errors: number of errors on u and s.
        :param pattern: the pattern path leading to this state.
        """
        self.node = node  # type: TreeNode
        self.string_pos = string_pos  # type: int
        self.strong_errors = strong_errors  # type: int
        self.weak_errors = weak_errors  # type: int
        self.pattern = pattern  # type: str

    def __str__(self) -> str:
        parts = (self.node, self.string_pos, self.strong_errors, self.weak_errors)
        return " ".join(str(part) for part in parts)

    def __repr__(self) -> str:
        return self.__str__()
130 |
131 |
class PatternAnalyzer:
    """
    Matcher of a pattern against a string.
    """
    def __init__(self, pattern: str, error_border: int=8):
        """
        :param error_border: error threshold; candidate states above it are pruned.
        :param pattern: the pattern.
        """
        self.pattern = pattern  # type: str
        self.tree = self.__build_tree(pattern)  # type: TreeNode
        self.error_border = error_border

    @staticmethod
    def count_errors(pattern: str, string: str, error_border: int=8) -> Tuple[str, int, int, bool]:
        """
        :param pattern: the pattern.
        :param string: the string.
        :param error_border: error threshold.
        :return: best pattern, strong error count, weak error count, and
            whether matching ended early because of errors.
        """
        analyzer = PatternAnalyzer(pattern, error_border)
        return analyzer.__accept(string)

    @staticmethod
    def __build_tree(pattern: str) -> TreeNode:
        """
        Build the pattern parse tree.

        :param pattern: the pattern.
        :return: the root of the tree.
        """
        root_node = TreeNode(None, list(), "R", -1)
        current_node = root_node
        for i, ch in enumerate(pattern):
            if ch == "(":
                node = TreeNode(current_node, list(), "()", i)
                current_node.children.append(node)
                current_node = node
            if ch == ")":
                node = current_node
                current_node = current_node.parent
                # Flatten meaningless parentheses (those not followed by * or ?).
                if i + 1 < len(pattern) and pattern[i + 1] not in "*?":
                    current_node.children = current_node.children[:-1] + node.children
                    for child in node.children:
                        child.parent = current_node
            if ch in TreeNode.leaf_chars:
                current_node.children.append(TreeNode(current_node, list(), ch, i))
            # Turn the bracket node into the corresponding non-terminal.
            if ch in TreeNode.non_leaf_chars:
                current_node.children[-1].text = ch
                current_node.children[-1].pattern_pos = i
        return root_node

    def __accept(self, string: str) -> Tuple[str, int, int, bool]:
        """
        :param string: the string.
        :return: best pattern, strong error count, weak error count, whether matching failed early.
        """
        current_states = [State(None, -1, 0, 0, "")]
        current_node = self.tree.get_most_left_leaf()
        for i, ch in enumerate(string):
            new_states = []
            for state in current_states:
                if state.node is not None:
                    current_node = self.__get_next_leaf(state.node)
                variants = self.__get_variants(current_node)

                # Every variant becomes a new state.
                for variant in variants:
                    assert variant.is_leaf()
                    strong_errors = state.strong_errors + int(variant.text.isupper() and variant.text != ch)
                    weak_errors = state.weak_errors + int(variant.text.islower() and variant.text != ch.lower())
                    new_state = State(variant, i, strong_errors, weak_errors, state.pattern+variant.text)
                    if new_state.strong_errors + new_state.weak_errors > self.error_border:
                        continue
                    new_states.append(new_state)

            if len(new_states) == 0:
                # We can stop early: either all branches were pruned by the error
                # threshold, or the pattern is shorter than the string.
                current_states = PatternAnalyzer.__filter_states(current_states, self.tree)
                pattern, strong_errors, weak_errors = self.__get_min_errors_from_states(current_states)
                diff = (len(string) - i)
                return pattern, strong_errors + diff, weak_errors + diff, True

            current_states = new_states
        current_states = PatternAnalyzer.__filter_states(current_states, self.tree)
        return self.__get_min_errors_from_states(current_states) + (False,)

    @staticmethod
    def __get_variants(current_node: TreeNode) -> Set[TreeNode]:
        """
        :param current_node: the current node.
        :return: variant nodes for the same string position, arising from * and ? in the pattern.
        """
        variants = set()
        current_variant = current_node
        while current_variant is not None:
            if current_variant not in variants:
                variants.add(current_variant)
            else:
                current_variant = current_variant.parent
            current_variant = PatternAnalyzer.__get_next_variant(current_variant)
        return variants

    @staticmethod
    def __get_next_variant(node: TreeNode) -> TreeNode:
        """
        Get the next variant among the variants of the current node.

        :param node: the current variant.
        :return: the next variant.
        """
        assert node.is_leaf()
        while node.parent is not None:
            parent = node.parent
            grandfather = parent.parent
            uncle = parent.get_next_sibling() if grandfather is not None else None
            is_variable = node.is_first_leaf() or not node.is_leaf()
            if is_variable and uncle is not None:
                return uncle.get_most_left_leaf()
            elif grandfather is not None and grandfather.text == "*" and grandfather.children[-1] == parent:
                return grandfather.get_most_left_leaf()
            if is_variable:
                node = parent
            else:
                break
        return None

    @staticmethod
    def __get_next_leaf(node: TreeNode) -> TreeNode:
        """
        Get the next leaf node.

        :param node: the current node.
        :return: the next node.
        """
        assert node.is_leaf()
        while node.parent is not None:
            sibling = node.get_next_sibling()
            if sibling is not None:
                return sibling.get_most_left_leaf()
            elif node.parent.text == "*":
                return node.parent.get_most_left_leaf()
            node = node.parent
        return None

    @staticmethod
    def __filter_states(states: List[State], root: TreeNode) -> List[State]:
        """
        Filter states by the presence of mandatory terminals.

        :param states: the states.
        :param root: the root of the tree.
        :return: the filtered states.
        """
        return [state for state in states if root.get_last_child_leaf() is None or
                state.node.pattern_pos >= root.get_last_child_leaf().pattern_pos]

    @staticmethod
    def __get_min_errors_from_states(states: List[State]) -> Tuple[str, int, int]:
        """
        :param states: the states.
        :return: best pattern, strong error count, weak error count.
        """
        if len(states) == 0:
            return "", 0, 0
        return min([(state.pattern, state.strong_errors, state.weak_errors) for i, state in enumerate(states)],
                   key=lambda x: (x[1], x[2], x[0]))
302 |
--------------------------------------------------------------------------------
/rupo/metre/metre_classifier.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Автор: Гусев Илья
3 | # Описание: Классификатор метра.
4 |
5 | from collections import OrderedDict
6 | from typing import List, Dict, Tuple
7 | import jsonpickle
8 | import logging
9 |
10 | from rupo.main.markup import Line, Markup
11 | from rupo.util.mixins import CommonMixin
12 | from rupo.metre.pattern_analyzer import PatternAnalyzer
13 | from rupo.util.preprocess import get_first_vowel_position
14 | from rupo.util.timeit import timeit
15 |
16 |
class StressCorrection(CommonMixin):
    """
    A single stress correction.
    """
    def __init__(self, line_number: int, word_number: int, syllable_number: int,
                 word_text: str, stress: int) -> None:
        """
        :param line_number: line index.
        :param word_number: word index.
        :param syllable_number: syllable index.
        :param word_text: text of the word.
        :param stress: stress position (0-based).
        """
        self.line_number, self.word_number = line_number, word_number
        self.syllable_number = syllable_number
        self.word_text, self.stress = word_text, stress
35 |
36 |
class ClassificationResult(CommonMixin):
    """
    Result of classifying a poem by metre.
    """
    def __init__(self, count_lines: int=0) -> None:
        """
        :param count_lines: number of lines in the poem.
        """
        # Hoist the metre-name iteration: it was repeated four times,
        # and iterating the dict directly makes `.keys()` redundant.
        metre_names = list(MetreClassifier.metres)
        self.metre = None
        self.count_lines = count_lines
        self.errors_count = {k: 0 for k in metre_names}  # type: Dict[str, int]
        self.corrections = {k: [] for k in metre_names}  # type: Dict[str, List[StressCorrection]]
        self.resolutions = {k: [] for k in metre_names}  # type: Dict[str, List[StressCorrection]]
        self.additions = {k: [] for k in metre_names}  # type: Dict[str, List[StressCorrection]]

    def get_metre_errors_count(self):
        """
        :return: error count for the chosen metre.
        """
        return self.errors_count[self.metre]

    def to_json(self):
        """
        :return: JSON serialization.
        """
        return jsonpickle.encode(self)

    @staticmethod
    def str_corrections(collection: List[StressCorrection]) -> str:
        """
        :param collection: list of corrections.
        :return: its string representation.
        """
        return "\n".join([str((item.word_text, item.syllable_number)) for item in collection])

    def __str__(self):
        st = "Метр: " + str(self.metre) + "\n"
        st += "Снятая омография: \n" + ClassificationResult.str_corrections(self.resolutions[self.metre]) + "\n"
        st += "Неправильные ударения: \n" + ClassificationResult.str_corrections(self.corrections[self.metre]) + "\n"
        st += "Новые ударения: \n" + ClassificationResult.str_corrections(self.additions[self.metre]) + "\n"
        return st
78 |
79 |
class ErrorsTableRecord:
    """
    One cell of the errors table: errors of a single line against a single metre.
    """
    def __init__(self, strong_errors, weak_errors, pattern, failed=False):
        """
        :param strong_errors: number of strong errors.
        :param weak_errors: number of weak errors.
        :param pattern: matched stress pattern.
        :param failed: whether the pattern analysis failed for this line.
        """
        self.strong_errors = strong_errors
        self.weak_errors = weak_errors
        self.pattern = pattern
        self.failed = failed

    def __str__(self):
        return "{0} {1} {2}".format(self.pattern, self.strong_errors, self.weak_errors)

    def __repr__(self):
        return str(self)
92 |
93 |
class ErrorsTable:
    """
    Per-line error table: one ErrorsTableRecord per line for every candidate metre.
    """
    def __init__(self, num_lines):
        """
        :param num_lines: number of lines in the poem.
        """
        metre_order = ("iambos", "choreios", "daktylos", "amphibrachys", "anapaistos",
                       "dolnik3", "dolnik2", "taktovik3", "taktovik2")
        # Per-metre weight applied to the normalized error sum.
        self.coef = OrderedDict(zip(metre_order,
                                    (0.3, 0.3, 0.4, 0.4, 0.4, 0.5, 0.5, 6.0, 6.0)))
        # Per-metre baseline penalty added to the final score.
        self.sum_coef = OrderedDict(zip(metre_order,
                                        (0.0, 0.0, 0.0, 0.0, 0.0, 0.035, 0.035, 0.10, 0.10)))
        self.num_lines = num_lines
        self.data = {}
        for metre_name in MetreClassifier.metres.keys():
            self.data[metre_name] = [ErrorsTableRecord(0, 0, "") for _ in range(num_lines)]

    def add_record(self, metre_name, line_num, strong_errors, weak_errors, pattern, failed=False):
        """
        Store the error record of one line for one metre.
        """
        self.data[metre_name][line_num] = ErrorsTableRecord(strong_errors, weak_errors, pattern, failed)

    def get_best_metre(self):
        """
        Pick the metre with the lowest weighted error score.

        Each line's errors are first normalized by the totals across all metres
        (in place), then summed per metre and combined with the weights.

        :return: name of the best-scoring metre.
        """
        # Normalize each line's errors by the totals over all metres.
        for row in range(self.num_lines):
            total_strong = sum(column[row].strong_errors for column in self.data.values())
            total_weak = sum(column[row].weak_errors for column in self.data.values())
            for column in self.data.values():
                if total_strong != 0:
                    column[row].strong_errors = column[row].strong_errors / float(total_strong)
                if total_weak != 0:
                    column[row].weak_errors = column[row].weak_errors / float(total_weak)
        # Aggregate the normalized errors per metre and apply the weights.
        scores = dict()
        for metre_name, column in self.data.items():
            strong_total = 0
            weak_total = 0
            for row in range(self.num_lines):
                strong_total += column[row].strong_errors
                weak_total += column[row].weak_errors
            scores[metre_name] = self.sum_coef[metre_name] + \
                (strong_total + weak_total / 2.0) * self.coef[metre_name] / self.num_lines
        logging.debug(scores)
        return min(scores, key=scores.get)
152 |
153 |
class MetreClassifier(object):
    """
    Metre classifier: counts deviations of a poem from the standard
    rhythm (metre) templates and picks the best-fitting metre.
    """
    # Metre templates over syllable patterns, regex-like:
    # s/S mark a stressed position (ict), u/U an unstressed one.
    metres = OrderedDict(
        [("iambos", '(us)*(uS)(U)?(U)?'),
         ("choreios", '(su)*(S)(U)?(U)?'),
         ("daktylos", '(suu)*(S)(U)?(U)?'),
         ("amphibrachys", '(usu)*(uS)(U)?(U)?'),
         ("anapaistos", '(uus)*(uuS)(U)?(U)?'),
         ("dolnik3", '(u)?(u)?((su)(u)?)*(S)(U)?(U)?'),
         ("dolnik2", '(u)?(u)?((s)(u)?)*(S)(U)?(U)?'),
         ("taktovik3", '(u)?(u)?((su)(u)?(u)?)*(S)(U)?(U)?'),
         ("taktovik2", '(u)?(u)?((s)(u)?(u)?)*(S)(U)?(U)?')
         ])

    # Lines longer than this many syllables are not analyzed.
    border_syllables_count = 20

    @staticmethod
    @timeit
    def classify_metre(markup):
        """
        Classify the poem's metre.

        :param markup: the markup of the poem.
        :return: classification result (ClassificationResult).
        """
        result = ClassificationResult(len(markup.lines))
        num_lines = len(markup.lines)
        errors_table = ErrorsTable(num_lines)
        for l, line in enumerate(markup.lines):
            for metre_name, metre_pattern in MetreClassifier.metres.items():
                line_syllables_count = sum([len(word.syllables) for word in line.words])

                # Skip empty lines and lines longer than border_syllables_count syllables.
                if line_syllables_count > MetreClassifier.border_syllables_count or line_syllables_count == 0:
                    continue
                # Looser metres get a tighter error budget.
                error_border = 7
                if metre_name == "dolnik2" or metre_name == "dolnik3":
                    error_border = 3
                if metre_name == "taktovik2" or metre_name == "taktovik3":
                    error_border = 2
                pattern, strong_errors, weak_errors, analysis_errored = \
                    PatternAnalyzer.count_errors(MetreClassifier.metres[metre_name],
                                                 MetreClassifier.__get_line_pattern(line),
                                                 error_border)
                if analysis_errored or len(pattern) == 0:
                    errors_table.add_record(metre_name, l, strong_errors, weak_errors, pattern, True)
                    continue
                # Stress corrections needed to fit the pattern count as strong errors.
                corrections = MetreClassifier.__get_line_pattern_matching_corrections(line, l, pattern)[0]
                accentuation_errors = len(corrections)
                strong_errors += accentuation_errors
                errors_table.add_record(metre_name, l, strong_errors, weak_errors, pattern)
        result.metre = errors_table.get_best_metre()

        # Remember all the corrections for the chosen metre.
        for l, line in enumerate(markup.lines):
            pattern = errors_table.data[result.metre][l].pattern
            failed = errors_table.data[result.metre][l].failed
            if failed or len(pattern) == 0:
                continue
            corrections, resolutions, additions =\
                MetreClassifier.__get_line_pattern_matching_corrections(line, l, pattern)
            result.corrections[result.metre] += corrections
            result.resolutions[result.metre] += resolutions
            result.additions[result.metre] += additions
            result.errors_count[result.metre] += len(corrections)
        return result

    @staticmethod
    def __get_line_pattern(line: Line) -> str:
        """
        Build the stress pattern of a line: "S" for every stressed syllable,
        "U" otherwise; a word without syllables contributes a single "U".

        :param line: the line.
        :return: the pattern string.
        """
        pattern = ""
        for w, word in enumerate(line.words):
            if len(word.syllables) == 0:
                pattern += "U"
            else:
                for syllable in word.syllables:
                    if syllable.stress != -1:
                        pattern += "S"
                    else:
                        pattern += "U"
        return pattern

    @staticmethod
    def __get_line_pattern_matching_corrections(line: Line, line_number: int, pattern: str) \
            -> Tuple[List[StressCorrection], List[StressCorrection], List[StressCorrection]]:
        """
        A stress may fall on a weak position if no unstressed syllable of the
        same word lands on an ict (strong position). Otherwise it is an error.

        :param line: the line.
        :param line_number: line index.
        :param pattern: the metre pattern the line is matched against.
        :return: corrections, resolutions (resolved homography) and additions.
        """
        corrections = []
        resolutions = []
        additions = []
        number_in_pattern = 0
        for w, word in enumerate(line.words):
            # Words without syllables are skipped entirely.
            if len(word.syllables) == 0:
                continue
            # One-syllable words: only a missing stress on an ict is recorded.
            if len(word.syllables) == 1:
                if pattern[number_in_pattern].lower() == "s" and word.syllables[0].stress == -1:
                    additions.append(StressCorrection(line_number, w, 0, word.text, word.syllables[0].vowel()))
                number_in_pattern += len(word.syllables)
                continue
            stress_count = word.count_stresses()
            for syllable in word.syllables:
                if stress_count == 0 and pattern[number_in_pattern].lower() == "s":
                    # The word has no stresses: add whatever fits the metre. There may be several.
                    additions.append(StressCorrection(line_number, w, syllable.number, word.text, syllable.vowel()))
                elif pattern[number_in_pattern].lower() == "u" and syllable.stress != -1:
                    # A stress falls on this syllable, but the pattern expects an unstressed position.
                    # Find a syllable that sits on a stressed position in the pattern - that is our correction.
                    for other_syllable in word.syllables:
                        other_number_in_pattern = other_syllable.number - syllable.number + number_in_pattern
                        if syllable.number == other_syllable.number or pattern[other_number_in_pattern].lower() != "s":
                            continue
                        ac = StressCorrection(line_number, w, other_syllable.number, word.text, other_syllable.vowel())
                        if stress_count == 1 and other_syllable.stress == -1:
                            corrections.append(ac)
                        else:
                            resolutions.append(ac)
                number_in_pattern += 1
        return corrections, resolutions, additions

    @staticmethod
    def get_improved_markup(markup: Markup, result: ClassificationResult) -> Markup:
        """
        Improve the markup after metre classification by applying the
        collected stress corrections, resolutions and additions in place.

        :param markup: initial markup.
        :param result: classification result.
        :return: improved markup.
        """
        for pos in result.corrections[result.metre] + result.resolutions[result.metre]:
            syllables = markup.lines[pos.line_number].words[pos.word_number].syllables
            for i, syllable in enumerate(syllables):
                # Clear all stresses in the word; keep only the corrected one.
                syllable.stress = -1
                if syllable.number == pos.syllable_number:
                    syllable.stress = syllable.begin + get_first_vowel_position(syllable.text)
        for pos in result.additions[result.metre]:
            syllable = markup.lines[pos.line_number].words[pos.word_number].syllables[pos.syllable_number]
            syllable.stress = syllable.begin + get_first_vowel_position(syllable.text)

        return markup

    @staticmethod
    def improve_markup(markup: Markup) -> \
            Tuple[Markup, ClassificationResult]:
        """
        Improve the markup with the metre classifier.

        :param markup: initial markup.
        :return: improved markup and the classification result.
        """
        result = MetreClassifier.classify_metre(markup)
        improved_markup = MetreClassifier.get_improved_markup(markup, result)
        return improved_markup, result
320 |
--------------------------------------------------------------------------------
/rupo/data/examples/markup.json:
--------------------------------------------------------------------------------
1 | {
2 | "items": [
3 | {
4 | "version": 2,
5 | "text": "Забывши волнения жизни мятежной,\nОдин жил в пустыне рыбак молодой.\nОднажды на скале прибрежной,\nНад тихой прозрачной рекой\nОн с удой беспечно\nСидел\nИ думой сердечной\nК прошедшему счастью летел.",
6 | "lines": [
7 | {
8 | "words": [
9 | {
10 | "syllables": [
11 | {
12 | "begin": 0,
13 | "end": 2,
14 | "text": "За",
15 | "accent": -1,
16 | "number": 0
17 | },
18 | {
19 | "begin": 2,
20 | "end": 4,
21 | "text": "бы",
22 | "accent": -1,
23 | "number": 1
24 | },
25 | {
26 | "begin": 4,
27 | "end": 7,
28 | "text": "вши",
29 | "accent": 6,
30 | "number": 2
31 | }
32 | ],
33 | "end": 7,
34 | "text": "Забывши",
35 | "begin": 0
36 | },
37 | {
38 | "syllables": [
39 | {
40 | "begin": 0,
41 | "end": 2,
42 | "text": "во",
43 | "accent": -1,
44 | "number": 0
45 | },
46 | {
47 | "begin": 2,
48 | "end": 5,
49 | "text": "лне",
50 | "accent": 4,
51 | "number": 1
52 | },
53 | {
54 | "begin": 5,
55 | "end": 7,
56 | "text": "ни",
57 | "accent": -1,
58 | "number": 2
59 | },
60 | {
61 | "begin": 7,
62 | "end": 8,
63 | "text": "я",
64 | "accent": -1,
65 | "number": 3
66 | }
67 | ],
68 | "end": 16,
69 | "text": "волнения",
70 | "begin": 8
71 | },
72 | {
73 | "syllables": [
74 | {
75 | "begin": 0,
76 | "end": 2,
77 | "text": "жи",
78 | "accent": 1,
79 | "number": 0
80 | },
81 | {
82 | "begin": 2,
83 | "end": 5,
84 | "text": "зни",
85 | "accent": -1,
86 | "number": 1
87 | }
88 | ],
89 | "end": 22,
90 | "text": "жизни",
91 | "begin": 17
92 | },
93 | {
94 | "syllables": [
95 | {
96 | "begin": 0,
97 | "end": 2,
98 | "text": "мя",
99 | "accent": -1,
100 | "number": 0
101 | },
102 | {
103 | "begin": 2,
104 | "end": 4,
105 | "text": "те",
106 | "accent": 3,
107 | "number": 1
108 | },
109 | {
110 | "begin": 4,
111 | "end": 8,
112 | "text": "жной",
113 | "accent": -1,
114 | "number": 2
115 | }
116 | ],
117 | "end": 31,
118 | "text": "мятежной",
119 | "begin": 23
120 | }
121 | ],
122 | "end": 33,
123 | "text": "Забывши волнения жизни мятежной,",
124 | "begin": 0
125 | },
126 | {
127 | "words": [
128 | {
129 | "syllables": [
130 | {
131 | "begin": 0,
132 | "end": 1,
133 | "text": "О",
134 | "accent": -1,
135 | "number": 0
136 | },
137 | {
138 | "begin": 1,
139 | "end": 4,
140 | "text": "дин",
141 | "accent": 2,
142 | "number": 1
143 | }
144 | ],
145 | "end": 37,
146 | "text": "Один",
147 | "begin": 33
148 | },
149 | {
150 | "syllables": [
151 | {
152 | "begin": 0,
153 | "end": 3,
154 | "text": "жил",
155 | "accent": 1,
156 | "number": 0
157 | }
158 | ],
159 | "end": 41,
160 | "text": "жил",
161 | "begin": 38
162 | },
163 | {
164 | "syllables": [],
165 | "end": 43,
166 | "text": "в",
167 | "begin": 42
168 | },
169 | {
170 | "syllables": [
171 | {
172 | "begin": 0,
173 | "end": 2,
174 | "text": "пу",
175 | "accent": -1,
176 | "number": 0
177 | },
178 | {
179 | "begin": 2,
180 | "end": 5,
181 | "text": "сты",
182 | "accent": 4,
183 | "number": 1
184 | },
185 | {
186 | "begin": 5,
187 | "end": 7,
188 | "text": "не",
189 | "accent": -1,
190 | "number": 2
191 | }
192 | ],
193 | "end": 51,
194 | "text": "пустыне",
195 | "begin": 44
196 | },
197 | {
198 | "syllables": [
199 | {
200 | "begin": 0,
201 | "end": 2,
202 | "text": "ры",
203 | "accent": -1,
204 | "number": 0
205 | },
206 | {
207 | "begin": 2,
208 | "end": 5,
209 | "text": "бак",
210 | "accent": 3,
211 | "number": 1
212 | }
213 | ],
214 | "end": 57,
215 | "text": "рыбак",
216 | "begin": 52
217 | },
218 | {
219 | "syllables": [
220 | {
221 | "begin": 0,
222 | "end": 2,
223 | "text": "мо",
224 | "accent": -1,
225 | "number": 0
226 | },
227 | {
228 | "begin": 2,
229 | "end": 4,
230 | "text": "ло",
231 | "accent": -1,
232 | "number": 1
233 | },
234 | {
235 | "begin": 4,
236 | "end": 7,
237 | "text": "дой",
238 | "accent": 5,
239 | "number": 2
240 | }
241 | ],
242 | "end": 65,
243 | "text": "молодой",
244 | "begin": 58
245 | }
246 | ],
247 | "end": 67,
248 | "text": "Один жил в пустыне рыбак молодой.",
249 | "begin": 33
250 | },
251 | {
252 | "words": [
253 | {
254 | "syllables": [
255 | {
256 | "begin": 0,
257 | "end": 1,
258 | "text": "О",
259 | "accent": -1,
260 | "number": 0
261 | },
262 | {
263 | "begin": 1,
264 | "end": 4,
265 | "text": "дна",
266 | "accent": -1,
267 | "number": 1
268 | },
269 | {
270 | "begin": 4,
271 | "end": 7,
272 | "text": "жды",
273 | "accent": 6,
274 | "number": 2
275 | }
276 | ],
277 | "end": 74,
278 | "text": "Однажды",
279 | "begin": 67
280 | },
281 | {
282 | "syllables": [
283 | {
284 | "begin": 0,
285 | "end": 2,
286 | "text": "на",
287 | "accent": 1,
288 | "number": 0
289 | }
290 | ],
291 | "end": 77,
292 | "text": "на",
293 | "begin": 75
294 | },
295 | {
296 | "syllables": [
297 | {
298 | "begin": 0,
299 | "end": 3,
300 | "text": "ска",
301 | "accent": 2,
302 | "number": 0
303 | },
304 | {
305 | "begin": 3,
306 | "end": 5,
307 | "text": "ле",
308 | "accent": -1,
309 | "number": 1
310 | }
311 | ],
312 | "end": 83,
313 | "text": "скале",
314 | "begin": 78
315 | },
316 | {
317 | "syllables": [
318 | {
319 | "begin": 0,
320 | "end": 3,
321 | "text": "при",
322 | "accent": -1,
323 | "number": 0
324 | },
325 | {
326 | "begin": 3,
327 | "end": 6,
328 | "text": "бре",
329 | "accent": 5,
330 | "number": 1
331 | },
332 | {
333 | "begin": 6,
334 | "end": 10,
335 | "text": "жной",
336 | "accent": -1,
337 | "number": 2
338 | }
339 | ],
340 | "end": 94,
341 | "text": "прибрежной",
342 | "begin": 84
343 | }
344 | ],
345 | "end": 96,
346 | "text": "Однажды на скале прибрежной,",
347 | "begin": 67
348 | },
349 | {
350 | "words": [
351 | {
352 | "syllables": [
353 | {
354 | "begin": 0,
355 | "end": 3,
356 | "text": "Над",
357 | "accent": 1,
358 | "number": 0
359 | }
360 | ],
361 | "end": 99,
362 | "text": "Над",
363 | "begin": 96
364 | },
365 | {
366 | "syllables": [
367 | {
368 | "begin": 0,
369 | "end": 2,
370 | "text": "ти",
371 | "accent": 1,
372 | "number": 0
373 | },
374 | {
375 | "begin": 2,
376 | "end": 5,
377 | "text": "хой",
378 | "accent": -1,
379 | "number": 1
380 | }
381 | ],
382 | "end": 105,
383 | "text": "тихой",
384 | "begin": 100
385 | },
386 | {
387 | "syllables": [
388 | {
389 | "begin": 0,
390 | "end": 3,
391 | "text": "про",
392 | "accent": -1,
393 | "number": 0
394 | },
395 | {
396 | "begin": 3,
397 | "end": 6,
398 | "text": "зра",
399 | "accent": 5,
400 | "number": 1
401 | },
402 | {
403 | "begin": 6,
404 | "end": 10,
405 | "text": "чной",
406 | "accent": -1,
407 | "number": 2
408 | }
409 | ],
410 | "end": 116,
411 | "text": "прозрачной",
412 | "begin": 106
413 | },
414 | {
415 | "syllables": [
416 | {
417 | "begin": 0,
418 | "end": 2,
419 | "text": "ре",
420 | "accent": -1,
421 | "number": 0
422 | },
423 | {
424 | "begin": 2,
425 | "end": 5,
426 | "text": "кой",
427 | "accent": 3,
428 | "number": 1
429 | }
430 | ],
431 | "end": 122,
432 | "text": "рекой",
433 | "begin": 117
434 | }
435 | ],
436 | "end": 123,
437 | "text": "Над тихой прозрачной рекой",
438 | "begin": 96
439 | },
440 | {
441 | "words": [
442 | {
443 | "syllables": [
444 | {
445 | "begin": 0,
446 | "end": 2,
447 | "text": "Он",
448 | "accent": 0,
449 | "number": 0
450 | }
451 | ],
452 | "end": 125,
453 | "text": "Он",
454 | "begin": 123
455 | },
456 | {
457 | "syllables": [],
458 | "end": 127,
459 | "text": "с",
460 | "begin": 126
461 | },
462 | {
463 | "syllables": [
464 | {
465 | "begin": 0,
466 | "end": 1,
467 | "text": "у",
468 | "accent": -1,
469 | "number": 0
470 | },
471 | {
472 | "begin": 1,
473 | "end": 4,
474 | "text": "дой",
475 | "accent": 2,
476 | "number": 1
477 | }
478 | ],
479 | "end": 132,
480 | "text": "удой",
481 | "begin": 128
482 | },
483 | {
484 | "syllables": [
485 | {
486 | "begin": 0,
487 | "end": 2,
488 | "text": "бе",
489 | "accent": -1,
490 | "number": 0
491 | },
492 | {
493 | "begin": 2,
494 | "end": 5,
495 | "text": "спе",
496 | "accent": 4,
497 | "number": 1
498 | },
499 | {
500 | "begin": 5,
501 | "end": 8,
502 | "text": "чно",
503 | "accent": -1,
504 | "number": 2
505 | }
506 | ],
507 | "end": 141,
508 | "text": "беспечно",
509 | "begin": 133
510 | }
511 | ],
512 | "end": 142,
513 | "text": "Он с удой беспечно",
514 | "begin": 123
515 | },
516 | {
517 | "words": [
518 | {
519 | "syllables": [
520 | {
521 | "begin": 0,
522 | "end": 2,
523 | "text": "Си",
524 | "accent": -1,
525 | "number": 0
526 | },
527 | {
528 | "begin": 2,
529 | "end": 5,
530 | "text": "дел",
531 | "accent": 3,
532 | "number": 1
533 | }
534 | ],
535 | "end": 147,
536 | "text": "Сидел",
537 | "begin": 142
538 | }
539 | ],
540 | "end": 148,
541 | "text": "Сидел",
542 | "begin": 142
543 | },
544 | {
545 | "words": [
546 | {
547 | "syllables": [
548 | {
549 | "begin": 0,
550 | "end": 1,
551 | "text": "И",
552 | "accent": 0,
553 | "number": 0
554 | }
555 | ],
556 | "end": 149,
557 | "text": "И",
558 | "begin": 148
559 | },
560 | {
561 | "syllables": [
562 | {
563 | "begin": 0,
564 | "end": 2,
565 | "text": "ду",
566 | "accent": 1,
567 | "number": 0
568 | },
569 | {
570 | "begin": 2,
571 | "end": 5,
572 | "text": "мой",
573 | "accent": -1,
574 | "number": 1
575 | }
576 | ],
577 | "end": 155,
578 | "text": "думой",
579 | "begin": 150
580 | },
581 | {
582 | "syllables": [
583 | {
584 | "begin": 0,
585 | "end": 3,
586 | "text": "сер",
587 | "accent": -1,
588 | "number": 0
589 | },
590 | {
591 | "begin": 3,
592 | "end": 5,
593 | "text": "де",
594 | "accent": 4,
595 | "number": 1
596 | },
597 | {
598 | "begin": 5,
599 | "end": 9,
600 | "text": "чной",
601 | "accent": -1,
602 | "number": 2
603 | }
604 | ],
605 | "end": 165,
606 | "text": "сердечной",
607 | "begin": 156
608 | }
609 | ],
610 | "end": 166,
611 | "text": "И думой сердечной",
612 | "begin": 148
613 | },
614 | {
615 | "words": [
616 | {
617 | "syllables": [],
618 | "end": 167,
619 | "text": "К",
620 | "begin": 166
621 | },
622 | {
623 | "syllables": [
624 | {
625 | "begin": 0,
626 | "end": 3,
627 | "text": "про",
628 | "accent": -1,
629 | "number": 0
630 | },
631 | {
632 | "begin": 3,
633 | "end": 5,
634 | "text": "ше",
635 | "accent": 4,
636 | "number": 1
637 | },
638 | {
639 | "begin": 5,
640 | "end": 8,
641 | "text": "дше",
642 | "accent": -1,
643 | "number": 2
644 | },
645 | {
646 | "begin": 8,
647 | "end": 10,
648 | "text": "му",
649 | "accent": -1,
650 | "number": 3
651 | }
652 | ],
653 | "end": 178,
654 | "text": "прошедшему",
655 | "begin": 168
656 | },
657 | {
658 | "syllables": [
659 | {
660 | "begin": 0,
661 | "end": 3,
662 | "text": "сча",
663 | "accent": 2,
664 | "number": 0
665 | },
666 | {
667 | "begin": 3,
668 | "end": 7,
669 | "text": "стью",
670 | "accent": -1,
671 | "number": 1
672 | }
673 | ],
674 | "end": 186,
675 | "text": "счастью",
676 | "begin": 179
677 | },
678 | {
679 | "syllables": [
680 | {
681 | "begin": 0,
682 | "end": 2,
683 | "text": "ле",
684 | "accent": -1,
685 | "number": 0
686 | },
687 | {
688 | "begin": 2,
689 | "end": 5,
690 | "text": "тел",
691 | "accent": 3,
692 | "number": 1
693 | }
694 | ],
695 | "end": 192,
696 | "text": "летел",
697 | "begin": 187
698 | }
699 | ],
700 | "end": 193,
701 | "text": "К прошедшему счастью летел.",
702 | "begin": 166
703 | }
704 | ]
705 | }
706 | ]
707 | }
--------------------------------------------------------------------------------