├── rupo ├── __init__.py ├── dict │ ├── __init__.py │ ├── zaliznyak.py │ ├── wiki.py │ └── cmu.py ├── files │ ├── __init__.py │ ├── test_writer.py │ ├── test_reader.py │ ├── writer.py │ └── reader.py ├── main │ ├── __init__.py │ ├── test_vocabulary.py │ ├── test_markup.py │ ├── morph.py │ ├── test_tokenizer.py │ ├── vocabulary.py │ ├── tokenizer.py │ └── markup.py ├── metre │ ├── __init__.py │ ├── test_pattern_analyzer.py │ ├── test_metre_classifier.py │ ├── pattern_analyzer.py │ └── metre_classifier.py ├── rhymes │ ├── __init__.py │ ├── test_rhymes.py │ └── rhymes.py ├── stress │ ├── __init__.py │ ├── test_dict.py │ ├── test_predictor.py │ ├── word.py │ ├── predictor.py │ └── dict.py ├── util │ ├── __init__.py │ ├── timeit.py │ ├── tqdm_open.py │ ├── data.py │ ├── mixins.py │ └── preprocess.py ├── generate │ ├── __init__.py │ ├── generator.py │ └── transforms.py ├── data │ └── examples │ │ ├── text.txt │ │ ├── text.xml │ │ ├── morph_markup.txt │ │ ├── markup.xml │ │ └── markup.json ├── settings.py ├── test_api.py └── api.py ├── setup.cfg ├── docs ├── source │ ├── modules.rst │ ├── index.rst │ ├── rupo.rhymes.rst │ ├── rupo.util.rst │ ├── rupo.rst │ ├── rupo.files.rst │ ├── rupo.metre.rst │ ├── rupo.main.rst │ ├── rupo.generate.rst │ └── conf.py └── Makefile ├── .gitignore ├── .gitattributes ├── .codeclimate.yml ├── requirements.txt ├── download.sh ├── .travis.yml ├── generate_poem.py ├── setup.py ├── README.md └── LICENSE /rupo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rupo/dict/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rupo/files/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /rupo/main/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rupo/metre/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rupo/rhymes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rupo/stress/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rupo/util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rupo/generate/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | rupo 2 | ==== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | rupo 8 | setup 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.pyc 4 | *~ 5 | .idea 6 | *.trie 7 | *.pickle 8 | *.h5 9 | dist/ 10 | rupo-* 11 | rupo.* 12 | rupo/data/generator_models/ -------------------------------------------------------------------------------- /rupo/data/examples/text.txt: -------------------------------------------------------------------------------- 1 | Забывши волнения жизни мятежной, 2 | Один жил в пустыне рыбак молодой. 3 | Однажды на скале прибрежной, 4 | Над тихой прозрачной рекой 5 | Он с удой беспечно 6 | Сидел 7 | И думой сердечной 8 | К прошедшему счастью летел. -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | 2 | rupo/data/stress_models/stress_ru_word30_LSTM256_dropout0.4_acc99_wer3.h5 filter=lfs diff=lfs merge=lfs -text 3 | rupo/data/g2p_models/g2p_ru_maxlen40_BLSTM256_BLSTM256_dropout0.2_acc992_wer140.h5 filter=lfs diff=lfs merge=lfs -text 4 | -------------------------------------------------------------------------------- /rupo/util/timeit.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | 4 | 5 | def timeit(method): 6 | def timed(*args, **kw): 7 | ts = time.time() 8 | result = method(*args, **kw) 9 | te = time.time() 10 | logging.debug('%s %2.2f sec' % (method.__name__, te-ts)) 11 | return result 12 | return timed -------------------------------------------------------------------------------- /.codeclimate.yml: -------------------------------------------------------------------------------- 1 | --- 2 | engines: 3 | csslint: 4 | enabled: false 5 | duplication: 6 | enabled: true 7 | config: 8 | languages: 9 | - python 10 | eslint: 11 | 
enabled: false 12 | fixme: 13 | enabled: true 14 | radon: 15 | enabled: true 16 | ratings: 17 | paths: 18 | - "**.py" 19 | exclude_paths: [] 20 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dicttoxml >= 1.7.4 2 | pygtrie >= 2.2 3 | numpy >= 1.12.1 4 | scipy >= 0.19.0 5 | scikit-learn >= 0.18.1 6 | jsonpickle >= 0.9.4 7 | pymorphy2 >= 0.8 8 | h5py >= 2.7.0 9 | russian-tagsets == 0.6 10 | tqdm >= 4.14.0 11 | jsonpickle >= 0.9.4 12 | rnnmorph >= 0.2.3 13 | sentence_splitter >= 1.2 14 | allennlp == 0.9.0 15 | overrides == 3.0.0 16 | git+https://github.com/IlyaGusev/rulm.git@4e78a49 17 | git+https://github.com/IlyaGusev/russ.git@288fe6a 18 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. rupo documentation master file, created by 2 | sphinx-quickstart on Mon Jul 24 20:49:37 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to rupo's documentation! 7 | ================================ 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /rupo/rhymes/test_rhymes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Автор: Гусев Илья 3 | # Описание: Тесты для модуля рифм. 
4 | 5 | import unittest 6 | 7 | from rupo.stress.word import StressedWord, Stress 8 | from rupo.rhymes.rhymes import Rhymes 9 | 10 | 11 | class TestRhymes(unittest.TestCase): 12 | def test_rhyme(self): 13 | self.assertTrue(Rhymes.is_rhyme(StressedWord("братишь", {Stress(4)}), StressedWord("грустишь", {Stress(5)}))) 14 | self.assertFalse(Rhymes.is_rhyme(StressedWord("наизусть", {Stress(4)}), StressedWord("сестра", {Stress(5)}))) -------------------------------------------------------------------------------- /rupo/data/examples/text.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Михаил Лермонтов 6 | 1829 7 | 1829 8 | Забывши волнения жизни мятежной, 9 | Один жил в пустыне рыбак молодой. 10 | Однажды на скале прибрежной, 11 | Над тихой прозрачной рекой 12 | Он с удой беспечно 13 | Сидел 14 | И думой сердечной 15 | К прошедшему счастью летел. 16 | Забывши волнения жизни мятежной... 17 | 1829 18 | 19 | -------------------------------------------------------------------------------- /docs/source/rupo.rhymes.rst: -------------------------------------------------------------------------------- 1 | rupo.rhymes package 2 | =================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | rupo.rhymes.rhymes module 8 | ------------------------- 9 | 10 | .. automodule:: rupo.rhymes.rhymes 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | rupo.rhymes.test_rhymes module 16 | ------------------------------ 17 | 18 | .. automodule:: rupo.rhymes.test_rhymes 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. 
automodule:: rupo.rhymes 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = rupo 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /download.sh: -------------------------------------------------------------------------------- 1 | wget https://www.dropbox.com/s/dwkui2xqivzsyw5/generator_model.zip 2 | mkdir -p ./rupo/data/generator_models 3 | unzip generator_model.zip -d ./rupo/data/generator_models 4 | rm generator_model.zip 5 | 6 | wget https://www.dropbox.com/s/ajd8b7lpqaao7xt/stress_ru_main.tar.gz 7 | mkdir -p ./rupo/data/stress_models/ru_main 8 | tar -xzvf stress_ru_main.tar.gz --directory ./rupo/data/stress_models/ru_main 9 | rm stress_ru_main.tar.gz 10 | 11 | wget https://www.dropbox.com/s/7rk135fzd3i8kfw/g2p_models.zip 12 | mkdir -p ./rupo/data/g2p_models 13 | unzip g2p_models.zip -d ./rupo/data/g2p_models 14 | rm g2p_models.zip 15 | 16 | wget https://www.dropbox.com/s/znqlrb1xblh3amo/dict.zip 17 | mkdir -p ./rupo/data/dict 18 | unzip dict.zip -d ./rupo/data/dict 19 | rm dict.zip 20 | -------------------------------------------------------------------------------- 
/docs/source/rupo.util.rst: -------------------------------------------------------------------------------- 1 | rupo.util package 2 | ================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | rupo.util.data module 8 | --------------------- 9 | 10 | .. automodule:: rupo.util.data 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | rupo.util.mixins module 16 | ----------------------- 17 | 18 | .. automodule:: rupo.util.mixins 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | rupo.util.preprocess module 24 | --------------------------- 25 | 26 | .. automodule:: rupo.util.preprocess 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: rupo.util 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /rupo/main/test_vocabulary.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Автор: Гусев Илья 3 | # Описание: Тесты словаря. 4 | 5 | import os 6 | import unittest 7 | 8 | from rupo.main.vocabulary import StressVocabulary 9 | from rupo.settings import EXAMPLES_DIR, MARKUP_XML_EXAMPLE 10 | 11 | 12 | class TestVocabulary(unittest.TestCase): 13 | def test_vocabulary(self): 14 | dump_file = os.path.join(EXAMPLES_DIR, "temp.pickle") 15 | vocabulary = StressVocabulary() 16 | vocabulary.parse(MARKUP_XML_EXAMPLE) 17 | vocabulary.save(dump_file) 18 | self.assertTrue(os.path.exists(dump_file)) 19 | os.remove(dump_file) 20 | try: 21 | self.assertTrue(vocabulary.get_word(0) is not None) 22 | except IndexError: 23 | self.assertTrue(False) 24 | -------------------------------------------------------------------------------- /docs/source/rupo.rst: -------------------------------------------------------------------------------- 1 | rupo package 2 | ============ 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. 
toctree:: 8 | 9 | rupo.dict 10 | rupo.files 11 | rupo.g2p 12 | rupo.generate 13 | rupo.main 14 | rupo.metre 15 | rupo.morph 16 | rupo.rhymes 17 | rupo.stress 18 | rupo.util 19 | 20 | Submodules 21 | ---------- 22 | 23 | rupo.api module 24 | --------------- 25 | 26 | .. automodule:: rupo.api 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | rupo.settings module 32 | -------------------- 33 | 34 | .. automodule:: rupo.settings 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | rupo.test_api module 40 | -------------------- 41 | 42 | .. automodule:: rupo.test_api 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | 48 | Module contents 49 | --------------- 50 | 51 | .. automodule:: rupo 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 | -------------------------------------------------------------------------------- /docs/source/rupo.files.rst: -------------------------------------------------------------------------------- 1 | rupo.files package 2 | ================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | rupo.files.reader module 8 | ------------------------ 9 | 10 | .. automodule:: rupo.files.reader 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | rupo.files.test_reader module 16 | ----------------------------- 17 | 18 | .. automodule:: rupo.files.test_reader 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | rupo.files.test_writer module 24 | ----------------------------- 25 | 26 | .. automodule:: rupo.files.test_writer 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | rupo.files.writer module 32 | ------------------------ 33 | 34 | .. automodule:: rupo.files.writer 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | 40 | Module contents 41 | --------------- 42 | 43 | .. 
automodule:: rupo.files 44 | :members: 45 | :undoc-members: 46 | :show-inheritance: 47 | -------------------------------------------------------------------------------- /rupo/util/tqdm_open.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Авторы: Анастасьев Даниил 3 | # Описание: Обертка открытия больших файлов в счетчик tqdm 4 | 5 | from contextlib import contextmanager 6 | from os.path import getsize, basename 7 | from tqdm import tqdm 8 | 9 | 10 | @contextmanager 11 | def tqdm_open(filename, encoding='utf8'): 12 | """ 13 | Открытие файла, обёрнутое в tqdm 14 | """ 15 | total = getsize(filename) 16 | 17 | def wrapped_line_iterator(fd): 18 | with tqdm(total=total, unit="B", unit_scale=True, desc=basename(filename), miniters=1) as pb: 19 | processed_bytes = 0 20 | for line in fd: 21 | processed_bytes += len(line) 22 | if processed_bytes >= 1024 * 1024: 23 | pb.update(processed_bytes) 24 | processed_bytes = 0 25 | yield line 26 | pb.update(processed_bytes) 27 | 28 | with open(filename, encoding=encoding) as fd: 29 | yield wrapped_line_iterator(fd) -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: xenial 2 | sudo: required 3 | language: python 4 | python: 3.6 5 | before_script: 6 | - git lfs pull 7 | install: 8 | - pip install --upgrade pip setuptools wheel 9 | - pip install -r requirements.txt 10 | - sh download.sh 11 | script: 12 | - pytest 13 | deploy: 14 | provider: pypi 15 | user: Phoenix120 16 | password: 17 | secure: 
ueaFMBiVlNSPmwivJ0uMGJw1ntj6nTuCqIuxYw0IXQDUebPkU6QZLH3o8k59BgD1o5+7yXmpWQHXb7B5UYr0pBC/4wEutatoGLzDmcTv1DaYn7kzrv5PTBSQVEvCQNzA8jNog0j6Ljg9Z7CN3H/vGIIdPRt1Gxmu0dPCrX3rGMlwKZLH5/gRaZlbgxtov/UGfIUEOgJmM1eJvZYS8Y5InmxlUBJmT0U1QDe1cBooax43KlspQzCJSJ6NciMGXSZUi5nPSb9sKbqvbOjRnCydcazeQwoRf14qIwFS3b7nL4TLb+rRSHFKuOJ9cmnAF+f5qo0ytJuYZqo3dNS8LqwuJH0tXyO4fo5T7Xe2k7eIfla4mg1T+uss5zIM0ttfW/ApKQanAr2kZ/tMl6ywWkWLJ1crSYM9RjUewZw8Z1qwYbEDJrcWIBZxkyPfEkzilgjAvlf4rmEUR3eJtm2YBgoz5XiNR2sdTeRFUgAcZUyC7nx+N15FJgw1HTtZeqbedPGgq84sMk31OxGfpGDJQ9iHqHavvTdxiRjA8YLNlxAeZ+Upop6zLznUM7iE742tNAjqjaXhGR6128Viggn4hL2PZuFYlmRx5Rt7LhCr1OgKViodNJwyoWZTFDl3p+b6GoJai9FPIewX6nmfQTAeYweFM8yz38akC8v21P3/kNeJ2/w= 18 | on: 19 | tags: true 20 | -------------------------------------------------------------------------------- /docs/source/rupo.metre.rst: -------------------------------------------------------------------------------- 1 | rupo.metre package 2 | ================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | rupo.metre.metre_classifier module 8 | ---------------------------------- 9 | 10 | .. automodule:: rupo.metre.metre_classifier 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | rupo.metre.patterns module 16 | -------------------------- 17 | 18 | .. automodule:: rupo.metre.patterns 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | rupo.metre.test_metre_classifier module 24 | --------------------------------------- 25 | 26 | .. automodule:: rupo.metre.test_metre_classifier 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | rupo.metre.test_patterns module 32 | ------------------------------- 33 | 34 | .. automodule:: rupo.metre.test_patterns 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | 40 | Module contents 41 | --------------- 42 | 43 | .. 
automodule:: rupo.metre 44 | :members: 45 | :undoc-members: 46 | :show-inheritance: 47 | -------------------------------------------------------------------------------- /rupo/files/test_writer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Автор: Гусев Илья 3 | # Описание: Тесты записи разметок. 4 | 5 | import unittest 6 | import os 7 | 8 | from rupo.main.markup import Markup 9 | from rupo.files.writer import Writer 10 | from rupo.files.reader import Reader, FileType 11 | from rupo.util.data import MARKUP_EXAMPLE 12 | from rupo.settings import EXAMPLES_DIR 13 | 14 | 15 | class TestWriter(unittest.TestCase): 16 | def test_write(self): 17 | temp_file = os.path.join(EXAMPLES_DIR, "temp.xml") 18 | markup = MARKUP_EXAMPLE 19 | Writer.write_markups(FileType.XML, [markup], temp_file) 20 | processed_xml = Reader.read_markups(temp_file, FileType.XML, is_processed=True) 21 | self.assertEqual(next(processed_xml), markup) 22 | processed_xml.close() 23 | os.remove(temp_file) 24 | 25 | temp_file = os.path.join(EXAMPLES_DIR, "temp.txt") 26 | Writer.write_markups(FileType.RAW, [markup], temp_file) 27 | processed_raw = Reader.read_markups(temp_file, FileType.RAW, is_processed=True) 28 | self.assertIsInstance((next(processed_raw)), Markup) 29 | processed_raw.close() 30 | os.remove(temp_file) 31 | -------------------------------------------------------------------------------- /rupo/util/data.py: -------------------------------------------------------------------------------- 1 | from rupo.main.markup import Markup, Line, Word, Syllable 2 | 3 | MARKUP_EXAMPLE = Markup("Соломка король себя.\n Пора виться майкой в.", [ 4 | Line(0, 20, "Соломка король себя.", [ 5 | Word(0, 7, "Соломка", 6 | [Syllable(0, 2, 0, "Со"), 7 | Syllable(2, 5, 1, "лом", 3), 8 | Syllable(5, 7, 2, "ка")]), 9 | Word(8, 14, "король", 10 | [Syllable(0, 2, 0, "ко"), 11 | Syllable(2, 6, 1, "роль", 3)]), 12 | Word(15, 19, "себя", 13 | [Syllable(0, 
2, 0, "се"), 14 | Syllable(2, 4, 1, "бя", 3)])]), 15 | Line(21, 43, " Пора виться майкой в.",[ 16 | Word(22, 26, "Пора", 17 | [Syllable(0, 2, 0, "По", 1), 18 | Syllable(2, 4, 1, "ра", 3)]), 19 | Word(27, 33, "виться", 20 | [Syllable(0, 2, 0, "ви", 1), 21 | Syllable(2, 6, 1, "ться")]), 22 | Word(34, 40, "майкой", 23 | [Syllable(0, 3, 0, "май", 1), 24 | Syllable(3, 6, 1, "кой")]), 25 | Word(41, 42, "в", []) 26 | ])]) -------------------------------------------------------------------------------- /rupo/dict/zaliznyak.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | class ZalyzniakDict: 5 | @staticmethod 6 | def convert_to_accent_only(dict_file, accent_file): 7 | with open(dict_file, 'r', encoding='utf-8') as r: 8 | lines = r.readlines() 9 | with open(accent_file, 'w', encoding='utf-8') as w: 10 | for line in lines: 11 | for word in line.split("#")[1].split(","): 12 | word = word.strip() 13 | pos = -1 14 | clean_word = "" 15 | primary = [] 16 | secondary = [] 17 | for i, ch in enumerate(word): 18 | if ch == "'" or ch == "`": 19 | if ch == "`": 20 | secondary.append(pos) 21 | else: 22 | primary.append(pos) 23 | continue 24 | clean_word += ch 25 | pos += 1 26 | if ch == "ё": 27 | primary.append(pos) 28 | if len(primary) != 0: 29 | w.write(clean_word + "\t" + ",".join([str(a) for a in primary]) + "\t" + 30 | ",".join([str(a) for a in secondary]) + "\n") -------------------------------------------------------------------------------- /docs/source/rupo.main.rst: -------------------------------------------------------------------------------- 1 | rupo.main package 2 | ================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | rupo.main.markup module 8 | ----------------------- 9 | 10 | .. automodule:: rupo.main.markup 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | rupo.main.test_markup module 16 | ---------------------------- 17 | 18 | .. 
automodule:: rupo.main.test_markup 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | rupo.main.test_tokenizer module 24 | ------------------------------- 25 | 26 | .. automodule:: rupo.main.test_tokenizer 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | rupo.main.test_vocabulary module 32 | -------------------------------- 33 | 34 | .. automodule:: rupo.main.test_vocabulary 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | rupo.main.tokenizer module 40 | -------------------------- 41 | 42 | .. automodule:: rupo.main.tokenizer 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | rupo.main.vocabulary module 48 | --------------------------- 49 | 50 | .. automodule:: rupo.main.vocabulary 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | 56 | Module contents 57 | --------------- 58 | 59 | .. automodule:: rupo.main 60 | :members: 61 | :undoc-members: 62 | :show-inheritance: 63 | -------------------------------------------------------------------------------- /generate_poem.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from rupo.api import Engine 4 | from rupo.settings import RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT, GENERATOR_MODEL_DIR 5 | 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--model-path', type=str, default=GENERATOR_MODEL_DIR) 10 | parser.add_argument('--token-vocab-path', type=str, default=None) 11 | parser.add_argument('--stress-vocab-path', type=str, default=None) 12 | parser.add_argument('--metre-schema', type=str, default='+-') 13 | parser.add_argument('--rhyme-pattern', type=str, default='abab') 14 | parser.add_argument('--n-syllables', type=int, default=8) 15 | parser.add_argument('--sampling-k', type=int, default=50000) 16 | parser.add_argument('--beam-width', type=int, default=None) 17 | parser.add_argument('--temperature', type=float, 
default=1.0) 18 | parser.add_argument('--last-text', type=str, default="") 19 | parser.add_argument('--count', type=int, default=100) 20 | args = parser.parse_args() 21 | 22 | kwargs = vars(args) 23 | count = kwargs.pop('count') 24 | 25 | engine = Engine() 26 | engine.load(RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT) 27 | for seed in range(count): 28 | print(seed) 29 | try: 30 | poem = engine.generate_poem(seed=seed, **kwargs) 31 | print(poem) 32 | except AssertionError as e: 33 | print("Error: ", e) 34 | -------------------------------------------------------------------------------- /rupo/stress/test_dict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Автор: Гусев Илья 3 | # Описание: Тесты для словаря ударений. 4 | 5 | import unittest 6 | 7 | from rupo.stress.dict import StressDict 8 | from rupo.stress.word import Stress, StressedWord 9 | from rupo.util.preprocess import VOWELS 10 | from rupo.settings import RU_GRAPHEME_STRESS_PATH, ZALYZNYAK_DICT, RU_GRAPHEME_STRESS_TRIE_PATH 11 | 12 | 13 | class TestStressDict(unittest.TestCase): 14 | @classmethod 15 | def setUpClass(cls): 16 | cls.dict = StressDict(language="ru", zalyzniak_dict=ZALYZNYAK_DICT, 17 | raw_dict_path=RU_GRAPHEME_STRESS_PATH, trie_path=RU_GRAPHEME_STRESS_TRIE_PATH) 18 | 19 | @classmethod 20 | def tearDownClass(cls): 21 | del cls.dict 22 | 23 | def test_get_stresses(self): 24 | self.assertCountEqual(self.dict.get_stresses("данный", Stress.Type.PRIMARY), [1]) 25 | self.assertCountEqual(self.dict.get_stresses("союза", Stress.Type.PRIMARY), [2]) 26 | self.assertCountEqual(self.dict.get_stresses("англосакс", Stress.Type.SECONDARY), [0]) 27 | self.assertCountEqual(self.dict.get_stresses("англосакс", Stress.Type.ANY), [0, 6]) 28 | self.assertCountEqual(self.dict.get_stresses("пора", Stress.Type.PRIMARY), [1, 3]) 29 | 30 | def test_stress_only_in_vowels(self): 31 | for word, stresses in self.dict.get_all(): 32 | for stress in stresses: 33 
| self.assertIn(word[stress.position], VOWELS) 34 | 35 | -------------------------------------------------------------------------------- /rupo/main/test_markup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Автор: Гусев Илья 3 | # Описание: Тесты для разметки. 4 | 5 | import unittest 6 | 7 | from rupo.util.data import MARKUP_EXAMPLE 8 | from rupo.main.markup import Markup 9 | from rupo.stress.predictor import CombinedStressPredictor 10 | from rupo.settings import RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT, CMU_DICT, \ 11 | RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH 12 | 13 | 14 | class TestMarkup(unittest.TestCase): 15 | @classmethod 16 | def setUpClass(cls): 17 | cls.stress_predictor = CombinedStressPredictor( 18 | stress_model_path=RU_STRESS_DEFAULT_MODEL, 19 | zalyzniak_dict=ZALYZNYAK_DICT, 20 | cmu_dict=CMU_DICT, 21 | raw_stress_dict_path=RU_GRAPHEME_STRESS_PATH, 22 | stress_trie_path=RU_GRAPHEME_STRESS_TRIE_PATH 23 | ) 24 | 25 | @classmethod 26 | def tearDownClass(cls): 27 | del cls.stress_predictor 28 | 29 | def test_from_to(self): 30 | clean_markup = Markup() 31 | self.assertEqual(MARKUP_EXAMPLE, clean_markup.from_xml(MARKUP_EXAMPLE.to_xml())) 32 | clean_markup = Markup() 33 | self.assertEqual(MARKUP_EXAMPLE, clean_markup.from_json(MARKUP_EXAMPLE.to_json())) 34 | 35 | def test_process_text(self): 36 | text = "Соломка король себя.\n Пора виться майкой в." 37 | markup = Markup.process_text(text, self.stress_predictor) 38 | self.assertEqual(markup.to_json(), MARKUP_EXAMPLE.to_json()) 39 | 40 | -------------------------------------------------------------------------------- /rupo/files/test_reader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Автор: Гусев Илья 3 | # Описание: Тесты считывателя разметок. 
4 | 5 | import unittest 6 | 7 | from rupo.files.reader import Reader, FileType 8 | from rupo.stress.predictor import CombinedStressPredictor 9 | from rupo.main.markup import Markup, Line, Word 10 | from rupo.settings import MARKUP_XML_EXAMPLE, TEXT_XML_EXAMPLE, MARKUP_JSON_EXAMPLE 11 | 12 | 13 | class TestReader(unittest.TestCase): 14 | @classmethod 15 | def setUpClass(cls): 16 | cls.stress_predictor = CombinedStressPredictor() 17 | 18 | def test_read(self): 19 | processed_xml = Reader.read_markups(MARKUP_XML_EXAMPLE, FileType.XML, is_processed=True) 20 | self.__assert_markup_is_correct(next(processed_xml)) 21 | 22 | unprocessed_xml = Reader.read_markups(TEXT_XML_EXAMPLE, FileType.XML, is_processed=False, 23 | stress_predictor=self.stress_predictor) 24 | self.__assert_markup_is_correct(next(unprocessed_xml)) 25 | 26 | processed_json = Reader.read_markups(MARKUP_JSON_EXAMPLE, FileType.JSON, is_processed=True) 27 | self.__assert_markup_is_correct(next(processed_json)) 28 | 29 | def __assert_markup_is_correct(self, markup): 30 | self.assertIsInstance(markup, Markup) 31 | self.assertIsNotNone(markup.text) 32 | self.assertNotEqual(markup.text, "") 33 | self.assertNotEqual(markup.lines, []) 34 | self.assertIsInstance(markup.lines[0], Line) 35 | self.assertIsInstance(markup.lines[0].words[0], Word) 36 | -------------------------------------------------------------------------------- /rupo/stress/test_predictor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Автор: Гусев Илья 3 | # Описание: Тесты предсказателя ударений. 
4 | 5 | import unittest 6 | 7 | from rupo.stress.predictor import CombinedStressPredictor 8 | from rupo.settings import RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT, CMU_DICT, \ 9 | RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH 10 | 11 | 12 | class TestStressPredictor(unittest.TestCase): 13 | @classmethod 14 | def setUpClass(cls): 15 | cls.stress_predictor = CombinedStressPredictor( 16 | stress_model_path=RU_STRESS_DEFAULT_MODEL, 17 | zalyzniak_dict=ZALYZNYAK_DICT, 18 | cmu_dict=CMU_DICT, 19 | raw_stress_dict_path=RU_GRAPHEME_STRESS_PATH, 20 | stress_trie_path=RU_GRAPHEME_STRESS_TRIE_PATH 21 | ) 22 | 23 | def test_stress(self): 24 | checks = { 25 | 'я': [0], 26 | 'в': [], 27 | 'он': [0], 28 | 'майка': [1], 29 | 'соломка': [3], 30 | 'изжить': [3], 31 | 'виться': [1], 32 | 'данный': [1], 33 | 'зорька': [1], 34 | 'банка': [1], 35 | 'оттечь': [3], 36 | 'советского': [3], 37 | 'союза': [2], 38 | 'пора': [3, 1], 39 | 'изжила': [5], 40 | 'меда': [1], 41 | 'автоподъёмник': [8], 42 | 'каракуля': [3], 43 | 'супервайзер': [6], 44 | 'колесом': [5] 45 | } 46 | for word, pos in checks.items(): 47 | self.assertEqual(sorted(self.stress_predictor.predict(word)), sorted(pos)) 48 | -------------------------------------------------------------------------------- /rupo/util/mixins.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Автор: Гусев Илья 3 | # Описание: Служебные миксины для удобства сериализации. 4 | 5 | 6 | def to_dict(obj): 7 | """ 8 | Преобразование объекта в словарь. 
9 | 10 | :param obj: объект, который нужно превратить в словарь 11 | :return data: получившийся словарь 12 | """ 13 | if isinstance(obj, dict): 14 | data = {} 15 | for (k, v) in obj.items(): 16 | data[k] = to_dict(v) 17 | return data 18 | elif hasattr(obj, "__iter__") and not isinstance(obj, str): 19 | return [to_dict(v) for v in obj] 20 | elif hasattr(obj, "__dict__"): 21 | data = dict([(key, to_dict(value)) for key, value in obj.__dict__.items() 22 | if not callable(value) and not key.startswith('_')]) 23 | return data 24 | else: 25 | return obj 26 | 27 | 28 | class CommonMixin(object): 29 | """ 30 | Mixin для удобного сравнения и преобразования в dict. 31 | """ 32 | def __eq__(self, other): 33 | if isinstance(other, self.__class__): 34 | return self.__dict__ == other.__dict__ 35 | return NotImplemented 36 | 37 | def __ne__(self, other): 38 | if isinstance(other, self.__class__): 39 | return not self.__eq__(other) 40 | return NotImplemented 41 | 42 | def __hash__(self): 43 | return hash(tuple(sorted(self.__dict__.items()))) 44 | 45 | def __repr__(self): 46 | return str(self.to_dict()) 47 | 48 | def __str__(self): 49 | return str(self.to_dict()) 50 | 51 | def to_dict(self): 52 | return to_dict(self) -------------------------------------------------------------------------------- /rupo/data/examples/morph_markup.txt: -------------------------------------------------------------------------------- 1 | забывши забыть VERB Aspect=Perf|Tense=Past|VerbForm=Trans 2 | волнения волнение NOUN Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing 3 | жизни жизнь NOUN Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing 4 | мятежной мятежный ADJ Case=Gen|Gender=Fem|Number=Sing 5 | 6 | один один DET Case=Nom|Gender=Masc|Number=Sing 7 | жил жить VERB Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin 8 | в в ADP _ 9 | пустыне пустыня NOUN Animacy=Inan|Case=Loc|Gender=Fem|Number=Sing 10 | рыбак рыбак NOUN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing 11 | молодой молодая NOUN 
class PostDevelopCommand(develop):
    """Hook for post-'develop' actions; currently just delegates to setuptools."""
    def run(self):
        develop.run(self)


class PostInstallCommand(install):
    """Hook for post-'install' actions; currently just delegates to setuptools."""
    def run(self):
        install.run(self)


setup(
    name='rupo',
    packages=find_packages(),
    version='0.2.8',
    description='RuPo: library for russian poetry analysis and generation',
    author='Ilya Gusev',
    author_email='phoenixilya@gmail.com',
    url='https://github.com/IlyaGusev/rupo',
    download_url='https://github.com/IlyaGusev/rupo/archive/0.2.8.tar.gz',
    keywords=['poetry', 'nlp', 'russian'],
    package_data={
        'rupo': ['data/examples/*', 'data/hyphen-tokens.txt']
    },
    install_requires=[
        'dicttoxml>=1.7.4',
        'pygtrie>=2.2',
        'numpy>=1.11.3',
        'scipy>=0.18.1',
        'scikit-learn>=0.18.1',
        'jsonpickle>=0.9.4',
        'pymorphy2>=0.8',
        'h5py>=2.7.0',
        'russian-tagsets==0.6',
        'tqdm>=4.14.0',
        'rnnmorph==0.2.3',
        'sentence_splitter>=1.2',
        'rulm==0.0.2',
        'russ==0.0.1'
    ],
    cmdclass={
        'develop': PostDevelopCommand,
        'install': PostInstallCommand,
    },
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',

        'Topic :: Text Processing :: Linguistic',

        'License :: OSI Approved :: Apache Software License',

        'Natural Language :: Russian',

        # Fixed: 'Programming Language :: Python :: 3.5' was listed twice.
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
    ],
)
# Character classes used across markup and stress analysis.
# (Names kept as-is — including the "CYRRILIC" spelling — for API compatibility.)
CYRRILIC_LOWER_VOWELS = "аоэиуыеёюя"
CYRRILIC_LOWER_CONSONANTS = "йцкнгшщзхъфвпрлджчсмтьб"
VOWELS = "aeiouAEIOUаоэиуыеёюяАОЭИУЫЕЁЮЯ"
CLOSED_SYLLABLE_CHARS = "рлймнРЛЙМН"

# Latin look-alike letters -> Cyrillic, plus 'ё' folded to 'е'.
_CYRRILIC_TRANSLATION = str.maketrans("xayocё", "хауосе")


def text_to_wordlist(sentence, cyrillic=False):
    """
    Split a sentence into lowercase words, dropping punctuation and digits.

    :param sentence: input text.
    :param cyrillic: if True, keep Cyrillic letters only.
    :return: list of lowercase words.
    """
    regexp = "[^а-яА-Яё]" if cyrillic else "[^а-яА-Яёa-zA-Z]"
    return re.sub(regexp, " ", sentence).lower().split()


def text_to_sentences(text):
    """
    Rough rule-based sentence splitter for Russian/English text.

    :param text: input text.
    :return: list of sentences.
    """
    regexp = r"[\.\?!](?=[\s\n]*[A-ZА-Я])|;|:-|:—|:—|: —|: —|: -"
    # Temporarily protect dots that are not sentence ends (initials,
    # abbreviations, dots before commas) by replacing them with "$".
    protect_regexps = [
        r"(?<=[^A-zА-я][A-ZА-Я])\.",
        r"(?<=[^A-zА-я][A-zА-я])\.[ ]?(?=[A-zА-я][^A-zА-я])",
        r"\.(?=,)",
    ]
    for reg in protect_regexps:
        text = "$".join(re.split(reg, text))
    # Materialized to a list: the previous `map` object could only be
    # iterated once, which silently broke repeated consumption.
    return [part.strip().replace("$", ".") for part in re.split(regexp, text)]


def to_cyrrilic(text):
    """
    Replace Latin homoglyphs (x, a, y, o, c) with their Cyrillic
    look-alikes and fold 'ё' to 'е'.
    """
    # One C-level pass instead of six chained .replace() calls.
    return text.translate(_CYRRILIC_TRANSLATION)


def normilize_line(text):
    """
    Normalize a line for fuzzy comparison: keep letters and digits only,
    drop all whitespace, map Latin homoglyphs to Cyrillic.
    (Function name kept for backward compatibility, typo included.)
    """
    text = re.sub("[^а-яА-Яёa-zA-Z0-9]", " ", text)
    return to_cyrrilic("".join(text.lower().split()))


def count_vowels(string):
    """Return the number of vowel characters (Latin or Cyrillic) in *string*."""
    return sum(1 for char in string if char in VOWELS)


def get_first_vowel_position(string):
    """Return the index of the first vowel in *string*, or -1 if there is none."""
    return next((i for i, ch in enumerate(string) if ch in VOWELS), -1)


def etree_to_dict(t):
    """
    Recursively convert an element tree node into {tag: children-or-text}.

    Assumes lxml-style elements exposing `iterchildren()` — TODO confirm.
    """
    children = [etree_to_dict(child) for child in t.iterchildren()]
    # Bug fix: the old `map(...) or t.text` never fell back to `t.text`
    # for childless nodes, because a map object is always truthy in Python 3.
    return {t.tag: children or t.text}
class Morph:
    """Morphological (grammatical-value) markup of plain-text files."""

    @staticmethod
    def get_morph_markup(input_filenames: List[str], output_filename: str):
        """
        Write morphological markup for all input files into one output file.

        :param input_filenames: input plain-text files.
        :param output_filename: path where the markup will be written.
        """
        sentence_splitter = SentenceSplitter(language='ru')
        morph_predictor = RNNMorphPredictor()

        # Bug fix: the output file used to be reopened in "w+" mode for every
        # input file, truncating everything written for earlier files, so only
        # the last input's markup survived. Open it once ("w" also replaces
        # any pre-existing file) and stream all inputs into it.
        with open(output_filename, "w", encoding="utf-8") as w:
            for filename in input_filenames:
                with open(filename, "r", encoding="utf-8") as r:
                    for line in r:
                        Morph.__process_line(line, w, sentence_splitter, morph_predictor)

    @staticmethod
    def __process_line(line: str, output_file: TextIO, sentence_splitter: SentenceSplitter,
                       morph_predictor: RNNMorphPredictor):
        """
        Tag every sentence of *line* and write one word per row
        (word, lemma, POS, tags), separating sentences with a blank line.
        """
        sentences = sentence_splitter.split(line)
        for sentence in sentences:
            words = [token.text for token in Tokenizer.tokenize(sentence)
                     if token.text != '' and token.token_type != Token.TokenType.SPACE]
            if not words:
                continue
            forms = morph_predictor.predict_sentence_tags(words)
            for form in forms:
                # Punctuation carries no grammatical value.
                if form.pos == "PUNCT":
                    continue
                output_file.write("%s\t%s\t%s\t%s\n" % (form.word, form.normal_form, form.pos, form.tag))
            output_file.write("\n")
class Stress:
    """
    A stress mark: character position plus stress type.
    """

    class Type(Enum):
        ANY = -1
        PRIMARY = 0
        SECONDARY = 1

    def __init__(self, position: int, stress_type: Type=Type.PRIMARY) -> None:
        self.position = position
        self.type = stress_type

    def __hash__(self):
        # Hash by position only; equal stresses share a position,
        # so the hash contract still holds.
        return hash(self.position)

    def __eq__(self, other: 'Stress'):
        return (self.position, self.type) == (other.position, other.type)

    def __str__(self):
        return "{}\t{}".format(self.position, self.type)

    def __repr__(self):
        return self.__str__()


class StressedWord:
    """
    A word together with its set of stresses.
    """

    def __init__(self, text: str, stresses: Set[Stress]) -> None:
        self.stresses = stresses
        self.text = text
        self.syllables = get_syllables(text)
        self.__accent_syllables()

    def get_primary_stresses(self) -> List[int]:
        """Positions of all primary stresses."""
        return self.__positions_of(Stress.Type.PRIMARY)

    def get_secondary_stresses(self) -> List[int]:
        """Positions of all secondary stresses."""
        return self.__positions_of(Stress.Type.SECONDARY)

    def add_stress(self, position: int, stress_type: Stress.Type=Stress.Type.PRIMARY) -> None:
        """Add a single stress and refresh syllable annotations."""
        self.stresses.add(Stress(position, stress_type))
        self.__accent_syllables()

    def add_stresses(self, stresses: List[Stress]) -> None:
        """Merge several stresses in and refresh syllable annotations."""
        self.stresses = set(self.stresses).union(set(stresses))
        self.__accent_syllables()

    def __positions_of(self, stress_type: 'Stress.Type') -> List[int]:
        return [stress.position for stress in self.stresses if stress.type == stress_type]

    def __accent_syllables(self):
        # A syllable counts as stressed when its vowel position carries a
        # primary stress (set membership compares position AND type, so
        # secondary stresses do not mark syllables here); -1 otherwise.
        for syllable in self.syllables:
            vowel_position = syllable.vowel()
            syllable.stress = vowel_position if Stress(vowel_position) in self.stresses else -1

    def __str__(self):
        primary = ",".join(str(i) for i in self.get_primary_stresses())
        secondary = ",".join(str(i) for i in self.get_secondary_stresses())
        return self.text + "\t" + primary + "\t" + secondary

    def __repr__(self):
        return self.__str__()

    def __hash__(self):
        return hash(self.text)

    def __eq__(self, other: 'StressedWord'):
        return self.text == other.text
class WikiDict:
    """Conversion and clean-up of Wiktionary pronunciation dumps."""

    @staticmethod
    def convert_to_g2p_only(source_file, destination_file):
        """
        Convert "word<TAB>transcription" pairs into a plain g2p file,
        dropping stress marks (' and ˌ) from the transcriptions.

        :param source_file: input tab-separated dictionary.
        :param destination_file: output tab-separated g2p dictionary.
        """
        with open(source_file, 'r', encoding='utf-8') as r:
            lines = r.readlines()
        with open(destination_file, 'w', encoding='utf-8') as w:
            words = []
            phonetic_words = []
            for line in lines:
                words.append(line.split("\t")[0].strip())
                phonetic_words.append(line.split("\t")[1].replace("'", "").replace("ˌ", "").strip())
            for i, word in enumerate(words):
                w.write(word + "\t" + phonetic_words[i] + "\n")

    @staticmethod
    def first_clean_up(filename):
        """
        Clean a raw wiki pronunciation dump in place: keep only entries with
        a primary stress mark, normalize the IPA notation and drop entries
        whose grapheme/vowel structure is inconsistent.

        :param filename: file rewritten in place ("word#IPA" lines in,
                         tab-separated "word transcription" lines out).
        """
        words = []
        phonetic_words = []
        # Fix: explicit encoding — the file is Cyrillic and the default
        # locale encoding broke on non-UTF-8 systems; this also matches
        # convert_to_g2p_only above.
        with open(filename, "r", encoding="utf-8") as f:
            lines = f.readlines()
            print(len(lines))
            for line in lines:
                word = line.split("#")[0]
                word = word.lower()
                phonetic_word = line.split("#")[1]
                # Keep only entries carrying a primary stress mark.
                if "'" not in phonetic_word and "ˈ" not in phonetic_word:
                    continue
                # Take the first pronunciation variant only.
                phonetic_word = phonetic_word.split("/")[0].strip()
                phonetic_word = phonetic_word.split("~")[0].strip()
                phonetic_word = phonetic_word.split(";")[0].strip()
                phonetic_word = phonetic_word.split(",")[0].strip()
                phonetic_word = phonetic_word.replace("ˈ", "'")
                phonetic_word = phonetic_word.replace(":", "ː")
                # Strip whitespace and IPA diacritics, then parenthesized parts.
                phonetic_word = re.sub(r"[\s̟̥̻.̞]", "", phonetic_word)
                phonetic_word = re.sub(r"[(⁽][^)⁾]*[)⁾]", "", phonetic_word)
                # NOTE(review): `Phonemes` is not imported anywhere in this
                # module, so this line raises NameError at runtime — restore
                # the missing import or drop the call; confirm upstream.
                phonetic_word = Phonemes.clean(phonetic_word)
                wrong_chars = [ch for ch in word if ch not in RU_GRAPHEME_SET]
                if len(wrong_chars) != 0:
                    continue
                if len(word) == 0 or len(phonetic_word) == 0:
                    continue
                # Grapheme vowels must match IPA vowels one-to-one.
                if sum([1 for ch in word if ch in "еуаоэяиюёы"]) != \
                        sum([1 for ch in phonetic_word if ch in VOWELS]):
                    continue
                words.append(word)
                phonetic_words.append(phonetic_word)
        print(len(words))
        with open(filename, "w", encoding="utf-8") as f:
            for i, word in enumerate(words):
                f.write(word + "\t" + phonetic_words[i] + "\n")
automodule:: rupo.generate.lstm 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | rupo.generate.markov module 56 | --------------------------- 57 | 58 | .. automodule:: rupo.generate.markov 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | rupo.generate.model_container module 64 | ------------------------------------ 65 | 66 | .. automodule:: rupo.generate.model_container 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | rupo.generate.test_generator module 72 | ----------------------------------- 73 | 74 | .. automodule:: rupo.generate.test_generator 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | rupo.generate.test_markov module 80 | -------------------------------- 81 | 82 | .. automodule:: rupo.generate.test_markov 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | rupo.generate.tqdm_open module 88 | ------------------------------ 89 | 90 | .. automodule:: rupo.generate.tqdm_open 91 | :members: 92 | :undoc-members: 93 | :show-inheritance: 94 | 95 | rupo.generate.word_form module 96 | ------------------------------ 97 | 98 | .. automodule:: rupo.generate.word_form 99 | :members: 100 | :undoc-members: 101 | :show-inheritance: 102 | 103 | rupo.generate.word_form_vocabulary module 104 | ----------------------------------------- 105 | 106 | .. automodule:: rupo.generate.word_form_vocabulary 107 | :members: 108 | :undoc-members: 109 | :show-inheritance: 110 | 111 | 112 | Module contents 113 | --------------- 114 | 115 | .. 
# Paths to all bundled resources, resolved against the installed package location.
from pkg_resources import resource_filename
# NOTE(review): leftover from the pkg_resources documentation example —
# no 'foo.conf' ships with the package; confirm nothing imports it, then drop.
foo_config = resource_filename(__name__, 'foo.conf')

# Metre/rhyme classifier models.
CLASSIFIER_DIR = resource_filename(__name__, "data/classifier/")

# Root of all bundled data.
DATA_DIR = resource_filename(__name__, "data")

# Source pronunciation/stress dictionaries.
DICT_DIR = resource_filename(__name__, "data/dict")
CMU_DICT = resource_filename(__name__, "data/dict/cmu.txt")
ZALYZNYAK_DICT = resource_filename(__name__, "data/dict/zaliznyak.txt")
RU_WIKI_DICT = resource_filename(__name__, "data/dict/wiki_ru.txt")

# Grapheme-to-phoneme aligner dumps.
RU_ALIGNER_DEFAULT_PATH = resource_filename(__name__, "data/g2p_models/ru_aligner.pickle")
EN_ALIGNER_DEFAULT_PATH = resource_filename(__name__, "data/g2p_models/en_aligner.pickle")

# Russian stress dictionaries (text + trie) and g2p/stress model files.
RU_GRAPHEME_STRESS_PATH = resource_filename(__name__, "data/dict/ru_grapheme_stress.txt")
RU_GRAPHEME_STRESS_TRIE_PATH = resource_filename(__name__, "data/dict/ru_grapheme_stress.trie")
RU_G2P_DICT_PATH = resource_filename(__name__, "data/dict/ru_g2p.txt")
RU_PHONEME_STRESS_PATH = resource_filename(__name__, "data/dict/ru_phoneme_stress.txt")
RU_PHONEME_STRESS_BIG_PATH = resource_filename(__name__, "data/dict/ru_phoneme_stress_big.txt")
RU_PHONEME_STRESS_TRIE_PATH = resource_filename(__name__, "data/dict/ru_phoneme_stress.trie")
RU_G2P_DEFAULT_MODEL = resource_filename(__name__, "data/g2p_models/g2p_ru_maxlen40_BLSTM256_BLSTM256_dropout0.2_acc992_wer140.h5")
RU_STRESS_DEFAULT_MODEL = resource_filename(__name__, "data/stress_models/ru_main")

# English g2p/stress dictionaries and model files.
EN_G2P_DICT_PATH = resource_filename(__name__, "data/dict/en_g2p.txt")
EN_PHONEME_STRESS_PATH = resource_filename(__name__, "data/dict/en_phoneme_stress.txt")
EN_PHONEME_STRESS_TRIE_PATH = resource_filename(__name__, "data/dict/en_phoneme_stress.trie")
EN_G2P_DEFAULT_MODEL = resource_filename(__name__, "data/g2p_models/g2p_en_maxlen40_BLSTM256+LSTM256_LSTM128_dropout0.4_acc977_wer379.h5")
EN_STRESS_DEFAULT_MODEL = resource_filename(__name__, "data/stress_models/stress_en_LSTM128_dropout0.2_acc99_wer10.h5")

# Example corpora and markups used in tests.
EXAMPLES_DIR = resource_filename(__name__, "data/examples/")
MARKUP_XML_EXAMPLE = resource_filename(__name__, "data/examples/markup.xml")
MARKUP_JSON_EXAMPLE = resource_filename(__name__, "data/examples/markup.json")
TEXT_XML_EXAMPLE = resource_filename(__name__, "data/examples/text.xml")
TEXT_TXT_EXAMPLE = resource_filename(__name__, "data/examples/text.txt")
HYPHEN_TOKENS = resource_filename(__name__, "data/hyphen-tokens.txt")

# Working directories for trained model snapshots.
G2P_CURRENT_MODEL_DIR = resource_filename(__name__, "data/g2p_models/")
ACCENT_CURRENT_MODEL_DIR = resource_filename(__name__, "data/stress_models/")

# Poem generator model and vocabularies.
GENERATOR_MODEL_DIR = resource_filename(__name__, "data/generator_models/")
GENERATOR_WORD_FORM_VOCAB_PATH = resource_filename(__name__, "data/generator_models/vocabulary")
GENERATOR_VOCAB_PATH = resource_filename(__name__, "data/generator_models/stress.pickle")

# Scratch file for intermediate results.
TEMP_PATH = resource_filename(__name__, "data/temp.txt")

# Allowed grapheme alphabets for dictionary validation.
RU_GRAPHEME_SET = " абвгдеёжзийклмнопрстуфхцчшщьыъэюя-"
EN_GRAPHEME_SET = " abcdefghijklmnopqrstuvwxyz.'-"
class TestTokenizer(unittest.TestCase):
    """Tests for the word-level tokenizer."""

    @staticmethod
    def _expected(specs):
        """Build Token objects from (text, type, begin, end) tuples."""
        return [Token(text, token_type, begin, end)
                for text, token_type, begin, end in specs]

    def test_tokenizer(self):
        self.assertEqual(
            Tokenizer.tokenize("О, когда-нибудь, когда?"),
            self._expected([
                ('О', Token.TokenType.WORD, 0, 1),
                (',', Token.TokenType.PUNCTUATION, 1, 2),
                (' ', Token.TokenType.SPACE, 2, 3),
                ('когда-нибудь', Token.TokenType.WORD, 3, 15),
                (',', Token.TokenType.PUNCTUATION, 15, 16),
                (' ', Token.TokenType.SPACE, 16, 17),
                ('когда', Token.TokenType.WORD, 17, 22),
                ('?', Token.TokenType.PUNCTUATION, 22, 23),
            ]))
        self.assertEqual(
            Tokenizer.tokenize(" Пора"),
            self._expected([
                (' ', Token.TokenType.SPACE, 0, 1),
                ('Пора', Token.TokenType.WORD, 1, 5),
            ]))

    def test_numbers(self):
        text = "Очевидно, 1 января 1970 года..."
        base = [
            ('Очевидно', Token.TokenType.WORD, 0, 8),
            (',', Token.TokenType.PUNCTUATION, 8, 9),
            (' ', Token.TokenType.SPACE, 9, 10),
            ('1', Token.TokenType.NUMBER, 10, 11),
            (' ', Token.TokenType.SPACE, 11, 12),
            ('января', Token.TokenType.WORD, 12, 18),
            (' ', Token.TokenType.SPACE, 18, 19),
            ('1970', Token.TokenType.NUMBER, 19, 23),
            (' ', Token.TokenType.SPACE, 23, 24),
            ('года', Token.TokenType.WORD, 24, 28),
            ('...', Token.TokenType.PUNCTUATION, 28, 31),
        ]
        self.assertEqual(Tokenizer.tokenize(text), self._expected(base))
        # With replace_numbers=True, number tokens become the word "ЧИСЛО"
        # but keep their original character spans.
        replaced = [
            ('ЧИСЛО', Token.TokenType.WORD, begin, end)
            if token_type == Token.TokenType.NUMBER
            else (token_text, token_type, begin, end)
            for token_text, token_type, begin, end in base
        ]
        self.assertEqual(Tokenizer.tokenize(text, replace_numbers=True),
                         self._expected(replaced))


class TestSentenceTokenizer(unittest.TestCase):
    """Tests for the sentence splitter."""

    def test_tokenizer(self):
        text1 = ("Конкурс учреждён в 2005 году!!! "
                 "Официальный партнёр конкурса – Президентский центр Б.Н. Ельцина.")
        self.assertEqual(
            SentenceTokenizer.tokenize(text1),
            ['Конкурс учреждён в 2005 году!!!',
             'Официальный партнёр конкурса – Президентский центр Б.Н. Ельцина.'])
class TestApi(unittest.TestCase):
    """End-to-end tests for the public Engine API."""

    @classmethod
    def setUpClass(cls):
        cls.engine = Engine(language="ru")
        cls.engine.load(
            stress_model_path=RU_STRESS_DEFAULT_MODEL,
            zalyzniak_dict=ZALYZNYAK_DICT
        )

    @classmethod
    def tearDownClass(cls):
        del cls.engine

    def test_stress(self):
        # Words with exactly one expected stress variant.
        single_variant = {
            "корова": [3],
            "триплекс": [2],
            "горит": [3],
            "восток": [4],
            "зарёю": [3],
            "новой": [1],
            "равнине": [4],
            "холмам": [4],
            "грохочут": [4],
            "багровый": [4],
            "кругами": [4],
            "уж": [0],
            "колесом": [5],
        }
        for word, expected in single_variant.items():
            self.assertEqual(self.engine.get_stresses(word), expected)
        # Two valid variants -> order-insensitive comparison.
        self.assertCountEqual(self.engine.get_stresses("пушки"), [4, 1])

    def test_get_word_syllables(self):
        self.assertEqual(self.engine.get_word_syllables("корова"), ["ко", "ро", "ва"])

    def test_count_syllables(self):
        self.assertEqual(self.engine.count_syllables("корова"), 3)

    def test_is_rhyme(self):
        self.assertTrue(self.engine.is_rhyme("корова", "здорова"))

    def test_get_markup(self):
        self.assertIsInstance(self.engine.get_markup("корова"), Markup)

    def test_get_improved_markup(self):
        self.assertIsInstance(self.engine.get_improved_markup("корова")[0], Markup)

    def test_classify_metre(self):
        text = "\n".join([
            "Горит восток зарёю новой.",
            "Уж на равнине, по холмам",
            "Грохочут пушки. Дым багровый",
            "Кругами всходит к небесам."
        ])
        self.assertEqual(self.engine.classify_metre(text), "iambos")

    def test_generate_poem(self):
        random.seed(42)
        poem = self.engine.generate_poem(
            GENERATOR_MODEL_DIR,
            GENERATOR_WORD_FORM_VOCAB_PATH,
            GENERATOR_VOCAB_PATH,
            sampling_k=10000,
            n_syllables=8,
            rhyme_pattern="abab",
            metre_schema="-+")
        self.assertIsNotNone(poem)

    def test_get_word_rhymes(self):
        vocab_dump_file = os.path.join(EXAMPLES_DIR, "vocab_rhymes.pickle")
        self.assertEqual(
            self.engine.get_word_rhymes("глядел", vocab_dump_file, MARKUP_XML_EXAMPLE),
            ["сидел", "летел"])
    def predict(self, word: str) -> List[int]:
        """
        Look up the stress positions of a word in the dictionary.
        Several stress variants are possible.

        :param word: the word to put stresses on.
        :return stresses: letter positions that carry stress.
        """
        stresses = []
        if count_vowels(word) == 0:
            # No vowels -> no stresses at all.
            pass
        elif count_vowels(word) == 1:
            # A single vowel always carries the stress.
            stresses.append(get_first_vowel_position(word))
        elif word.find("ё") != -1:
            # The letter "ё" is always stressed in Russian.
            stresses.append(word.find("ё"))
        else:
            # Otherwise, look the word form up in the dictionary
            # (primary stresses first, then secondary ones).
            stresses = self.stress_dict.get_stresses(word, Stress.Type.PRIMARY) +\
                       self.stress_dict.get_stresses(word, Stress.Type.SECONDARY)
        if 'е' not in word:
            return stresses
        # The word may be spelled with "е" in place of "ё": enumerate every
        # possible "е" -> "ё" substitution (a beam of 2^k spelling variants,
        # where k is the number of "е" letters).
        positions = [i for i in range(len(word)) if word[i] == 'е']
        beam = [word[:positions[0]]]
        for i in range(len(positions)):
            new_beam = []
            for prefix in beam:
                # End of the segment governed by the i-th "е".
                n = positions[i+1] if i+1 < len(positions) else len(word)
                new_beam.append(prefix + 'ё' + word[positions[i]+1:n])
                new_beam.append(prefix + 'е' + word[positions[i]+1:n])
            beam = new_beam
        # Check each variant against the dictionary and keep the "ё" position
        # of every spelling that is a known word.
        # NOTE(review): this can append positions already found above;
        # callers appear to tolerate duplicates — confirm.
        for permutation in beam:
            if len(self.stress_dict.get_stresses(permutation)) != 0:
                yo_pos = permutation.find("ё")
                if yo_pos != -1:
                    stresses.append(yo_pos)
        return stresses
class CombinedStressPredictor(StressPredictor):
    """
    Stress predictor combining dictionary lookup with an RNN fallback
    for out-of-vocabulary words.
    """
    def __init__(self, language="ru", stress_model_path: str=RU_STRESS_DEFAULT_MODEL, raw_stress_dict_path=None,
                 stress_trie_path=None, zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT):
        self.rnn = StressModel.load(stress_model_path)
        self.dict = DictStressPredictor(language, raw_stress_dict_path, stress_trie_path, zalyzniak_dict, cmu_dict)

    def predict(self, word: str) -> List[int]:
        """
        Return stress positions for *word*: the dictionary answer when it
        has one, otherwise the neural model's prediction.
        """
        dictionary_answer = self.dict.predict(word)
        return dictionary_answer if dictionary_answer else self.rnn.predict(word)
class StressVocabulary(object):
    """
    Indexed vocabulary: a bidirectional mapping between stressed words
    and integer indices for the language model.
    """
    def __init__(self) -> None:
        self.word_to_index = {}  # type: Dict[StressedWord, int]
        self.index_to_word = {}  # type: Dict[int, StressedWord]

    def save(self, dump_filename: str) -> None:
        """
        Pickle the whole vocabulary to *dump_filename*.
        """
        with open(dump_filename, "wb") as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    def load(self, dump_filename: str):
        """
        Restore the vocabulary from a pickle dump in place.
        """
        with open(dump_filename, "rb") as f:
            self.__dict__.update(pickle.load(f).__dict__)

    def parse(self, markup_path: str, from_voc: bool=False):
        """
        Fill the vocabulary either from a vocabulary file (with preset
        indices) or from processed XML markups.
        """
        if from_voc:
            for word, index in Reader.read_vocabulary(markup_path):
                self.add_word(word.to_stressed_word(), index)
            return
        for markup in Reader.read_markups(markup_path, FileType.XML, is_processed=True):
            self.add_markup(markup)

    def add_markup(self, markup: 'Markup') -> None:
        """
        Add every word of a markup to the vocabulary.

        :param markup: the markup.
        """
        for line in markup.lines:
            for word in line.words:
                self.add_word(word.to_stressed_word())

    def add_word(self, word: 'StressedWord', index: int=-1) -> bool:
        """
        Add a word, optionally at a preset index.

        :param word: the word.
        :param index: preset index, or -1 to append at the end.
        :return: True if the word was new.
        """
        if word in self.word_to_index:
            # Known word: only refresh the reverse mapping for preset indices.
            if index != -1:
                self.index_to_word[index] = word
            return False
        actual_index = self.size() if index == -1 else index
        self.word_to_index[word] = actual_index
        self.index_to_word[actual_index] = word
        return True

    def get_word_index(self, word: 'StressedWord') -> int:
        """
        Look up a word's index.

        :param word: the word.
        :return: its index.
        """
        if word not in self.word_to_index:
            raise IndexError("Can't find word: " + word.text)
        return self.word_to_index[word]

    def get_word(self, index: int) -> 'StressedWord':
        """
        Look up a word by index.

        :param index: the index.
        :return: the word at that index.
        """
        return self.index_to_word[index]

    def size(self):
        """
        :return: number of indexed words.
        """
        return len(self.index_to_word)
class Writer(object):
    """
    Writes Markup objects to files in XML, JSON or raw text form.
    """
    def __init__(self, destination_type: FileType, path: str) -> None:
        """
        Streaming variant: use when markups are written one at a time
        (saves memory on large corpora).

        :param destination_type: target file type.
        :param path: path to the output file.
        """
        self.type = destination_type
        self.path = path
        self.file = None
        # Start from a clean file; a missing file is fine.
        try:
            os.remove(path)
        except OSError:
            pass

    def open(self) -> None:
        """
        Open the target file; must be called before the first write.
        """
        self.file = open(self.path, "w", encoding="utf-8")
        if self.type == FileType.XML:
            # NOTE(review): writing an empty string is a no-op; this looks
            # like an XML root open tag that was lost upstream — confirm
            # against the original repository.
            self.file.write('')

    def write_markup(self, markup: Markup) -> None:
        """
        Append one markup to the already opened file.

        :param markup: the markup to write.
        """
        assert self.file is not None
        if self.type == FileType.XML:
            # NOTE(review): replace(b'', b'') is a no-op; presumably the
            # per-markup XML wrapper tags were stripped during extraction —
            # confirm against the original repository.
            xml = markup.to_xml().encode('utf-8')\
                .replace(b'', b'').decode('utf-8')
            self.file.write(xml)
        elif self.type == FileType.RAW:
            Writer.__write_markup_raw(markup, self.file)

    def close(self) -> None:
        """
        Close the file (for XML, first writes the closing marker —
        currently an empty string, see the NOTE in open()).
        """
        if self.type == FileType.XML:
            self.file.write('')
        self.file.close()

    @staticmethod
    def write_markups(destination_type: FileType, markups: List[Markup], path: str) -> None:
        """
        Batch variant: write all markups to a file at once.

        :param destination_type: target file type.
        :param markups: the markups to write.
        :param path: path to the output file.
        """
        with open(path, "w", encoding="utf-8") as file:
            if destination_type == FileType.XML:
                # NOTE(review): empty-string writes/replacements below look
                # like stripped XML wrapper tags — confirm upstream.
                file.write('')
                for markup in markups:
                    xml = markup.to_xml().encode('utf-8')\
                        .replace(b'', b'').decode('utf-8')
                    file.write(xml)
                    file.write("\n")
                file.write('')
            elif destination_type == FileType.JSON:
                file.write("[")
                for markup in markups:
                    file.write(markup.to_json())
                    file.write(",")
                # Drop the trailing comma before closing the JSON array.
                file.seek(0, 2)
                size = file.tell()
                file.truncate(size - 1)
                file.write(']')
            elif destination_type == FileType.RAW:
                for markup in markups:
                    Writer.__write_markup_raw(markup, file)

    @staticmethod
    def __write_markup_raw(markup: Markup, file) -> None:
        """
        Write a markup as plain text (word+stress tokens), one verse line
        per text line, documents separated by RAW_SEPARATOR.

        :param markup: the markup to write.
        :param file: an open file object.
        """
        lines = []
        for line in markup.lines:
            lines.append(" ".join([word.get_short() for word in line.words]))
        file.write("\n".join(lines))
        file.write(RAW_SEPARATOR)
99 | """ 100 | lines = [] 101 | for line in markup.lines: 102 | lines.append(" ".join([word.get_short() for word in line.words])) 103 | file.write("\n".join(lines)) 104 | file.write(RAW_SEPARATOR) 105 | -------------------------------------------------------------------------------- /rupo/generate/generator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Автор: Гусев Илья 3 | # Описание: Модуль создания стихотворений. 4 | 5 | from typing import Optional 6 | 7 | from allennlp.data.vocabulary import Vocabulary 8 | from rulm.language_model import LanguageModel 9 | 10 | from rupo.main.vocabulary import StressVocabulary 11 | from rupo.generate.transforms import PoemTransform 12 | 13 | 14 | class Generator(object): 15 | """ 16 | Генератор стихов 17 | """ 18 | def __init__(self, 19 | model: LanguageModel, 20 | token_vocabulary: Vocabulary, 21 | stress_vocabulary: StressVocabulary, 22 | eos_index: int): 23 | self.model = model # type: LanguageModel 24 | self.token_vocabulary = token_vocabulary # type: Vocabulary 25 | self.stress_vocabulary = stress_vocabulary # type: StressVocabulary 26 | self.eos_index = eos_index 27 | 28 | def generate_poem(self, 29 | metre_schema: str="+-", 30 | rhyme_pattern: str="aabb", 31 | n_syllables: int=8, 32 | letters_to_rhymes: dict=None, 33 | beam_width: int=None, 34 | sampling_k: int=None, 35 | rhyme_score_border: int=4, 36 | temperature: float=1.0, 37 | seed: int=1337, 38 | last_text: str="") -> Optional[str]: 39 | assert beam_width or sampling_k, "Set sampling_k or beam_width" 40 | self.model.set_seed(seed) 41 | 42 | poem_transform = PoemTransform( 43 | stress_vocabulary=self.stress_vocabulary, 44 | metre_pattern=metre_schema, 45 | rhyme_pattern=rhyme_pattern, 46 | n_syllables=n_syllables, 47 | eos_index=self.eos_index, 48 | letters_to_rhymes=letters_to_rhymes, 49 | score_border=rhyme_score_border 50 | ) 51 | 52 | if last_text: 53 | words = last_text.lower().split(" ") 
54 | last_text = " ".join(words[::-1]) 55 | filled_syllables = 0 56 | for word in last_text.split(): 57 | index = self.token_vocabulary.get_token_index(word) 58 | word = self.stress_vocabulary.get_word(index) 59 | syllables_count = len(word.syllables) 60 | filled_syllables += syllables_count 61 | poem_transform.stress_position -= filled_syllables 62 | poem_transform.rhyme_position -= 1 63 | last_index = self.token_vocabulary.get_token_index(words[-1]) 64 | last_word = self.stress_vocabulary.get_word(last_index) 65 | poem_transform.letters_to_rhymes[rhyme_pattern[-1]].add(last_word) 66 | 67 | self.model.transforms.append(poem_transform) 68 | 69 | try: 70 | if beam_width: 71 | poem = self.model.beam_decoding(last_text, beam_width=beam_width, temperature=temperature) 72 | elif sampling_k: 73 | poem = self.model.sample_decoding(last_text, k=sampling_k, temperature=temperature) 74 | else: 75 | assert False 76 | except Exception as e: 77 | self.model.transforms.pop() 78 | raise e 79 | 80 | self.model.transforms.pop() 81 | 82 | words = poem.split(" ") 83 | words = words[::-1] 84 | result_words = [] 85 | current_n_syllables = 0 86 | for word in words: 87 | result_words.append(word) 88 | index = self.token_vocabulary.get_token_index(word) 89 | word = self.stress_vocabulary.get_word(index) 90 | syllables_count = len(word.syllables) 91 | current_n_syllables += syllables_count 92 | if n_syllables == current_n_syllables: 93 | current_n_syllables = 0 94 | result_words.append("\n") 95 | poem = " ".join(result_words) 96 | poem = "\n".join([line.strip() for line in poem.split("\n")]) 97 | return poem 98 | 99 | -------------------------------------------------------------------------------- /rupo/dict/cmu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Автор: Гусев Илья 3 | # Описание: Конвертер CMU словаря. 
4 | 5 | 6 | class CMUDict: 7 | aprabet2ipa = { 8 | "AO": "ɔ", 9 | "AA": "ɑ", 10 | "IY": "i", 11 | "UW": "u", 12 | "EH": "ɛ", 13 | "IH": "ɪ", 14 | "UH": "ʊ", 15 | "AH": "ʌ", 16 | "AX": "ə", 17 | "AE": "æ", 18 | "EY": "eɪ", 19 | "AY": "aɪ", 20 | "OW": "oʊ", 21 | "AW": "aʊ", 22 | "OY": "ɔɪ", 23 | "ER": "ɝ", 24 | "AXR": "ɚ", 25 | "P": "p", 26 | "B": "b", 27 | "T": "t", 28 | "D": "d", 29 | "K": "k", 30 | "G": "ɡ", 31 | "CH": "ʦ", 32 | "JH": "ʤ", 33 | "F": "f", 34 | "V": "v", 35 | "TH": "θ", 36 | "DH": "ð", 37 | "S": "s", 38 | "Z": "z", 39 | "SH": "ʃ", 40 | "ZH": "ʒ", 41 | "HH": "h", 42 | "M": "m", 43 | "EM": "m", 44 | "N": "n", 45 | "EN": "n", 46 | "NG": "ŋ", 47 | "ENG": "ŋ", 48 | "L": "ɫ", 49 | "EL": "ɫ", 50 | "R": "r", 51 | "DX": "ɾ", 52 | "NX": "ɾ", 53 | "Y": "j", 54 | "W": "w", 55 | "Q": "ʔ" 56 | } 57 | 58 | diphtongs = ["EY", "AY", "OW", "AW", "OY"] 59 | 60 | @staticmethod 61 | def convert_to_g2p_only(source_file, destination_file): 62 | clean = [] 63 | with open(source_file, 'r', encoding="utf-8", errors="ignore") as f: 64 | lines = f.readlines() 65 | for line in lines: 66 | g = line.split(" ")[0].lower() 67 | if not ("a" <= g[0] <= "z"): 68 | continue 69 | if "(" in g: 70 | continue 71 | p = line.split(" ")[1].strip() 72 | phonemes = p.split(" ") 73 | for i, phoneme in enumerate(phonemes): 74 | if not ("A" <= phoneme[-1] <= "Z"): 75 | phonemes[i] = phoneme[:-1] 76 | p = "".join([CMUDict.aprabet2ipa[phoneme] for phoneme in phonemes]) 77 | clean.append((g, p)) 78 | with open(destination_file, 'w', encoding="utf-8") as w: 79 | for g, p in clean: 80 | w.write(g+"\t"+p+"\n") 81 | 82 | @staticmethod 83 | def convert_to_phoneme_stress(source_file, destination_file): 84 | clean = [] 85 | with open(source_file, 'r', encoding="utf-8", errors="ignore") as f: 86 | for line in f: 87 | g = line.split(" ")[0].lower() 88 | if not ("a" <= g[0] <= "z"): 89 | continue 90 | p = line.split(" ")[1].strip() 91 | if "(1)" in g: 92 | g = g.replace("(1)", "") 93 | if "(2)" in g: 94 | g = 
g.replace("(2)", "") 95 | if "(" in g: 96 | continue 97 | 98 | phonemes = p.split(" ") 99 | primary = [] 100 | secondary = [] 101 | diphtongs_count = 0 102 | for i, phoneme in enumerate(phonemes): 103 | if not ("A" <= phoneme[-1] <= "Z"): 104 | if int(phoneme[-1]) == 1: 105 | primary.append(str(i+diphtongs_count)) 106 | if int(phoneme[-1]) == 2: 107 | secondary.append(str(i+diphtongs_count)) 108 | phonemes[i] = phoneme[:-1] 109 | if phonemes[i] in CMUDict.diphtongs: 110 | diphtongs_count += 1 111 | p = "".join([CMUDict.aprabet2ipa[phoneme] for phoneme in phonemes]) 112 | clean.append((p, primary, secondary)) 113 | with open(destination_file, 'w', encoding="utf-8") as w: 114 | for p, f, s in clean: 115 | w.write(p + "\t" + ",".join(f) + "\t" + ",".join(s) + "\n") -------------------------------------------------------------------------------- /rupo/rhymes/rhymes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Автор: Гусев Илья 3 | # Описание: Класс рифм. 4 | 5 | from rupo.stress.word import StressedWord 6 | from rupo.util.preprocess import VOWELS 7 | 8 | 9 | class RhymeProfile: 10 | def __init__(self, syllable_count: int, stressed_syllable_number: int, 11 | stressed_syllable_text: str, next_syllable_text: str, next_char: str): 12 | self.syllable_count = syllable_count 13 | self.stressed_syllable_number = stressed_syllable_number 14 | self.stressed_syllable_text = stressed_syllable_text 15 | self.next_syllable_text = next_syllable_text 16 | self.next_char = next_char 17 | 18 | def __str__(self): 19 | return "Syllable count: {}; Stressed syllable: {}; " \ 20 | "Stressed syllable text: {}; Next syllable: {}; " \ 21 | "Next char: {}".format(self.syllable_count, self.stressed_syllable_number, 22 | self.stressed_syllable_text, self.next_syllable_text, self.next_char) 23 | 24 | def __repr__(self): 25 | return self.__str__() 26 | 27 | 28 | class Rhymes(object): 29 | """ 30 | Поиск рифм. 
31 | """ 32 | 33 | @staticmethod 34 | def is_rhyme(word1: StressedWord, 35 | word2: StressedWord, 36 | score_border: int=4, 37 | syllable_number_border: int=4) -> bool: 38 | """ 39 | Проверка рифмованности 2 слов. 40 | 41 | :param word1: первое слово для проверки рифмы, уже акцентуированное (Word). 42 | :param word2: второе слово для проверки рифмы, уже акцентуированное (Word). 43 | :param score_border: граница определния рифмы, чем выше, тем строже совпадение. 44 | :param syllable_number_border: ограничение на номер слога с конца, на который падает ударение. 45 | :return result: является рифмой или нет. 46 | """ 47 | profile1 = Rhymes.__get_rhyme_profile(word1) 48 | profile2 = Rhymes.__get_rhyme_profile(word2) 49 | score = 0 50 | for i, ch1 in enumerate(profile1.stressed_syllable_text): 51 | for j, ch2 in enumerate(profile2.stressed_syllable_text[i:]): 52 | if ch1 != ch2: 53 | continue 54 | if ch1 in VOWELS: 55 | score += 3 56 | else: 57 | score += 1 58 | if profile1.next_syllable_text == profile2.next_syllable_text and profile1.next_syllable_text != '': 59 | score += 3 60 | elif profile1.next_char == profile2.next_char and profile1.next_char != '': 61 | score += 1 62 | return (profile1.stressed_syllable_number == profile2.stressed_syllable_number and 63 | profile1.syllable_count == profile2.syllable_count and 64 | profile1.stressed_syllable_number <= syllable_number_border and 65 | score >= score_border) 66 | 67 | @staticmethod 68 | def __get_rhyme_profile(word: StressedWord) -> 'RhymeProfile': 69 | """ 70 | Получение профиля рифмовки (набора признаков для сопоставления). 71 | 72 | :param word: уже акцентуированное слово (Word). 73 | :return profile: профиль рифмовки. 74 | """ 75 | # TODO: Переход на фонетическое слово, больше признаков. 
76 | 77 | profile = RhymeProfile(syllable_count=0, 78 | stressed_syllable_number=-1, 79 | stressed_syllable_text="", 80 | next_syllable_text="", 81 | next_char="") 82 | syllables = list(word.syllables) 83 | profile.syllable_count = len(syllables) 84 | for i, syllable in enumerate(reversed(syllables)): 85 | if syllable.stress == -1: 86 | continue 87 | profile.stressed_syllable_text = syllable.text 88 | profile.stressed_syllable_number = -i-1 89 | if i != 0: 90 | profile.next_syllable = syllables[-i].text 91 | if syllable.stress + 1 < len(word.text): 92 | profile.next_char = word.text[syllable.stress + 1] 93 | break 94 | return profile 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python library for analysis and generation of poems in Russian # 2 | 3 | [![Current version on PyPI](http://img.shields.io/pypi/v/rupo.svg)](https://pypi.python.org/pypi/rupo) 4 | [![Python versions](https://img.shields.io/pypi/pyversions/rupo.svg)](https://pypi.python.org/pypi/rupo) 5 | [![Build Status](https://travis-ci.org/IlyaGusev/rupo.svg?branch=master)](https://travis-ci.org/IlyaGusev/rupo) 6 | [![Code Climate](https://codeclimate.com/github/IlyaGusev/rupo/badges/gpa.svg)](https://codeclimate.com/github/IlyaGusev/rupo) 7 | [![Documentation Status](https://readthedocs.org/projects/rupo/badge/?version=latest)](http://rupo.readthedocs.io/en/latest/?badge=latest) 8 | 9 | ### Install ### 10 | Warning: Python 3.9+ is not supported! Use Python 3.8. 
11 | 12 | ``` 13 | git clone https://github.com/IlyaGusev/rupo 14 | cd rupo 15 | pip install -r requirements.txt 16 | sh download.sh 17 | ``` 18 | 19 | ### Example ### 20 | https://colab.research.google.com/drive/1WBl9erJvC9Oc9PjCD8JyC_40TDUqahCx 21 | 22 | ### Usage manual ### 23 | #### Analysis #### 24 | ``` 25 | >>> from rupo.api import Engine 26 | >>> engine = Engine(language="ru") 27 | >>> engine.load(<stress_model_path>, <zalyzniak_dict_path>) 28 | >>> engine.get_stresses("корова") 29 | [3] 30 | 31 | >>> engine.get_word_syllables("корова") 32 | ["ко", "ро", "ва"] 33 | 34 | >>> engine.is_rhyme("корова", "здорова") 35 | True 36 | 37 | >>> text = "Горит восток зарёю новой.\nУж на равнине, по холмам\nГрохочут пушки. Дым багровый\nКругами всходит к небесам." 38 | >>> engine.classify_metre(text) 39 | iambos 40 | ``` 41 | 42 | #### Generation #### 43 | Script for poem generation. It can work in two different modes: sampling or beam search. 44 | 45 | ``` 46 | python generate_poem.py 47 | ``` 48 | 49 | | Argument | Default | Description | 50 | |:--------------------|:--------|:-------------------------------------------| 51 | | --metre-schema | +- | feet type: -+ (iambos), +- (trochee), ...
| 52 | | --rhyme-pattern | abab | rhyme pattern | 53 | | --n-syllables | 8 | number of syllables in line | 54 | | --sampling-k | 50000 | top-k words to sample from (sampling mode) | 55 | | --beam-width | None | width of beam search (beam search mode) | 56 | | --temperature | 1.0 | sampling softmax temperature | 57 | | --last-text | None | custom last line | 58 | | --count | 100 | count of poems to generate | 59 | | --model-path | None | optional path to generator model directory | 60 | | --token-vocab-path | None | optional path to vocabulary | 61 | | --stress-vocab-path | None | optional path to stress vocabulary | 62 | 63 | ## Models ### 64 | * Generator: https://www.dropbox.com/s/dwkui2xqivzsyw5/generator_model.zip 65 | * Stress predictor: https://www.dropbox.com/s/i9tarc8pum4e40p/stress_models_14_05_17.zip 66 | * G2P: https://www.dropbox.com/s/7rk135fzd3i8kfw/g2p_models.zip 67 | * Dictionaries: https://www.dropbox.com/s/znqlrb1xblh3amo/dict.zip 68 | 69 | ### Литература ### 70 | * Брейдо, 1996, [Автоматический анализ метрики русского стиха](http://search.rsl.ru/ru/record/01000000124) 71 | * Каганов, 1996, [Лингвистическое конструирование в системах искусственного интеллекта](http://lleo.me/soft/text_dip.htm) 72 | * Козьмин, 2006, [Автоматический анализ стиха в системе Starling](http://www.dialog-21.ru/digests/dialog2006/materials/html/Kozmin.htm) 73 | * Гришина, 2008, [Поэтический корпус в рамках НКРЯ: общая структура и перспективы использования](http://ruscorpora.ru/sbornik2008/05.pdf) 74 | * Пильщиков, Старостин, 2012, [Автоматическое распознавание метра: проблемы и решения](http://www.academia.edu/11465228/%D0%90%D0%B2%D1%82%D0%BE%D0%BC%D0%B0%D1%82%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%BE%D0%B5_%D1%80%D0%B0%D1%81%D0%BF%D0%BE%D0%B7%D0%BD%D0%B0%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_%D0%BC%D0%B5%D1%82%D1%80%D0%B0_%D0%BF%D1%80%D0%BE%D0%B1%D0%BB%D0%B5%D0%BC%D1%8B_%D0%B8_%D1%80%D0%B5%D1%88%D0%B5%D0%BD%D0%B8%D1%8F) 75 | * Барахнин, 2015, [Алгоритмы комплексного анализа русских 
поэтических текстов с целью автоматизации процесса создания метрических справочников и конкордансов](http://ceur-ws.org/Vol-1536/paper21.pdf), [сама система](http://poem.ict.nsc.ru/) 76 | -------------------------------------------------------------------------------- /rupo/generate/transforms.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import numpy as np 3 | from collections import defaultdict 4 | 5 | from rulm.transform import Transform 6 | 7 | from rupo.rhymes.rhymes import Rhymes 8 | from rupo.main.vocabulary import StressVocabulary 9 | from rupo.stress.word import StressedWord 10 | 11 | 12 | class PoemTransform(Transform): 13 | """ 14 | Фильтр по шаблону метра. 15 | """ 16 | def __init__(self, 17 | stress_vocabulary: StressVocabulary, 18 | metre_pattern: str, 19 | rhyme_pattern: str, 20 | n_syllables: int, 21 | eos_index: int, 22 | letters_to_rhymes: dict=None, 23 | score_border=4): 24 | self.stress_vocabulary = stress_vocabulary 25 | 26 | self.n_syllables = n_syllables 27 | 28 | mul = n_syllables // len(metre_pattern) 29 | if n_syllables % len(metre_pattern) != 0: 30 | mul += 1 31 | 32 | self.metre_pattern = metre_pattern * mul 33 | self.stress_position = len(self.metre_pattern) - 1 34 | self.eos_index = eos_index 35 | 36 | self.rhyme_pattern = rhyme_pattern 37 | self.rhyme_position = len(self.rhyme_pattern) - 1 38 | self.score_border = score_border 39 | 40 | self.letters_to_rhymes = defaultdict(set) 41 | if letters_to_rhymes is not None: 42 | for letter, words in letters_to_rhymes.items(): 43 | for word in words: 44 | self.letters_to_rhymes[letter].add(word) 45 | 46 | def __call__(self, probabilities: np.array) -> np.array: 47 | if self.rhyme_position < 0 and self.stress_position == len(self.metre_pattern) - 1: 48 | probabilities = np.zeros(probabilities.shape, dtype="float") 49 | probabilities[self.eos_index] = 1. 
50 | return probabilities 51 | 52 | for index in range(probabilities.shape[0]): 53 | word = self.stress_vocabulary.get_word(index) 54 | is_good_by_stress = self._filter_word_by_stress(word) 55 | is_good_by_rhyme = True 56 | if self.stress_position == len(self.metre_pattern) - 1: 57 | is_good_by_rhyme = self._filter_word_by_rhyme(word) 58 | if not is_good_by_stress or not is_good_by_rhyme: 59 | probabilities[index] = 0. 60 | 61 | assert np.sum(probabilities > 0) != 0, "Poem transform filtered out all words" 62 | return probabilities 63 | 64 | def advance(self, index: int): 65 | word = self.stress_vocabulary.get_word(index) 66 | syllables_count = len(word.syllables) 67 | 68 | if self.stress_position == len(self.metre_pattern) - 1: 69 | letter = self.rhyme_pattern[self.rhyme_position] 70 | self.letters_to_rhymes[letter].add(word) 71 | self.rhyme_position -= 1 72 | 73 | self.stress_position -= syllables_count 74 | 75 | if self.stress_position < 0: 76 | self.stress_position = len(self.metre_pattern) - 1 77 | 78 | def _filter_word_by_stress(self, word: StressedWord) -> bool: 79 | syllables = word.syllables 80 | syllables_count = len(syllables) 81 | if syllables_count == 0: 82 | return False 83 | if self.stress_position - syllables_count < -1: 84 | return False 85 | for i in range(syllables_count): 86 | syllable = syllables[i] 87 | syllable_number = self.stress_position - syllables_count + i + 1 88 | if syllables_count >= 2 and syllable.stress == -1 and self.metre_pattern[syllable_number] == "+": 89 | for j in range(syllables_count): 90 | other_syllable = syllables[j] 91 | other_syllable_number = other_syllable.number - syllable.number + syllable_number 92 | if i != j and other_syllable.stress != -1 and self.metre_pattern[other_syllable_number] == "-": 93 | return False 94 | return True 95 | 96 | def _filter_word_by_rhyme(self, word: StressedWord) -> bool: 97 | if len(word.syllables) <= 1: 98 | return False 99 | rhyming_words = 
self.letters_to_rhymes[self.rhyme_pattern[self.rhyme_position]] 100 | if len(rhyming_words) == 0: 101 | return True 102 | first_word = list(rhyming_words)[0] 103 | 104 | is_rhyme = Rhymes.is_rhyme(first_word, word, 105 | score_border=self.score_border, 106 | syllable_number_border=2) and first_word.text != word.text 107 | return is_rhyme 108 | 109 | def __copy__(self): 110 | obj = type(self)(self.stress_vocabulary, self.metre_pattern, self.rhyme_pattern, self.n_syllables, 111 | self.eos_index, self.letters_to_rhymes, self.score_border) 112 | obj.stress_position = self.stress_position 113 | obj.rhyme_position = self.rhyme_position 114 | obj.letters_to_rhymes = deepcopy(self.letters_to_rhymes) 115 | return obj 116 | -------------------------------------------------------------------------------- /rupo/stress/dict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Автор: Гусев Илья 3 | # Описание: Класс для удобной работы со словарём ударений. 4 | 5 | import pygtrie 6 | import os 7 | import pickle 8 | from typing import List, Dict, ItemsView, Set 9 | 10 | from rupo.dict.cmu import CMUDict 11 | from rupo.settings import RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH, \ 12 | EN_PHONEME_STRESS_PATH, EN_PHONEME_STRESS_TRIE_PATH, ZALYZNYAK_DICT, CMU_DICT 13 | 14 | from rupo.stress.word import Stress 15 | 16 | 17 | class StressDict: 18 | """ 19 | Класс данных, для сериализации словаря как префиксного дерева и быстрой загрузки в память. 
20 | """ 21 | 22 | class Mode: 23 | GRAPHEMES = 0 24 | PHONEMES = 0 25 | 26 | def __init__(self, language: str="ru", mode: Mode=Mode.GRAPHEMES, raw_dict_path=None, trie_path=None, 27 | zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT) -> None: 28 | self.data = pygtrie.Trie() # type: Dict[str, Set[Stress]] 29 | self.raw_dict_path = raw_dict_path 30 | self.trie_path = trie_path 31 | if language == "ru" and mode == self.Mode.GRAPHEMES: 32 | self.__init_defaults(RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH) 33 | if not os.path.exists(self.raw_dict_path): 34 | from rupo.dict.zaliznyak import ZalyzniakDict 35 | ZalyzniakDict.convert_to_accent_only(zalyzniak_dict, self.raw_dict_path) 36 | elif mode == self.Mode.PHONEMES and language == "en": 37 | self.__init_defaults(EN_PHONEME_STRESS_PATH, EN_PHONEME_STRESS_TRIE_PATH) 38 | if not os.path.exists(self.raw_dict_path): 39 | CMUDict.convert_to_phoneme_stress(cmu_dict, self.raw_dict_path) 40 | else: 41 | assert False 42 | if not os.path.isfile(self.raw_dict_path): 43 | raise FileNotFoundError("Dictionary raw file not found.") 44 | if os.path.isfile(self.trie_path): 45 | self.load(self.trie_path) 46 | else: 47 | self.create(self.raw_dict_path, self.trie_path) 48 | 49 | def __init_defaults(self, raw_dict_path, trie_path): 50 | if self.raw_dict_path is None: 51 | self.raw_dict_path = raw_dict_path 52 | if self.trie_path is None: 53 | self.trie_path = trie_path 54 | 55 | def create(self, src_filename: str, dst_filename: str) -> None: 56 | """ 57 | Загрузка словаря из файла. 58 | 59 | :param src_filename: имя файла с оригинальным словарём. 60 | :param dst_filename: имя файла, в который будет сохранён дамп. 
61 | """ 62 | with open(src_filename, 'r', encoding='utf-8') as f: 63 | for line in f: 64 | word, primary, secondary = line.split("\t") 65 | stresses = [Stress(int(a), Stress.Type.PRIMARY) for a in primary.strip().split(",")] 66 | if secondary.strip() != "": 67 | stresses += [Stress(int(a), Stress.Type.SECONDARY) for a in secondary.strip().split(",")] 68 | self.update(word, stresses) 69 | self.save(dst_filename) 70 | 71 | def save(self, dst_filename: str) -> None: 72 | """ 73 | Сохранение дампа. 74 | 75 | :param dst_filename: имя файла, в который сохраняем дамп словаря. 76 | """ 77 | with open(dst_filename, "wb") as f: 78 | pickle.dump(self.data, f, pickle.HIGHEST_PROTOCOL) 79 | 80 | def load(self, dump_filename: str) -> None: 81 | """ 82 | Загрузка дампа словаря. 83 | 84 | :param dump_filename: откуда загружаем. 85 | """ 86 | with open(dump_filename, "rb") as f: 87 | self.data = pickle.load(f) 88 | 89 | def get_stresses(self, word: str, stress_type: Stress.Type=Stress.Type.ANY) -> List[int]: 90 | """ 91 | Получение ударений нужного типа у слова. 92 | 93 | :param word: слово, которое мы хотим посмотреть в словаре. 94 | :param stress_type: тип ударения. 95 | :return forms: массив всех ударений. 96 | """ 97 | if word in self.data: 98 | if stress_type == Stress.Type.ANY: 99 | return [stress.position for stress in self.data[word]] 100 | else: 101 | return [stress.position for stress in self.data[word] if stress.type == stress_type] 102 | return [] 103 | 104 | def get_all(self) -> ItemsView[str, Set[Stress]]: 105 | """ 106 | :return items: все ключи и ударения словаря. 107 | """ 108 | return self.data.items() 109 | 110 | def update(self, word: str, stresses: List[Stress]) -> None: 111 | """ 112 | Обновление словаря. 113 | 114 | :param word: слово. 115 | :param stresses: набор ударений. 
116 | """ 117 | if word not in self.data: 118 | self.data[word] = set(stresses) 119 | else: 120 | self.data[word].update(stresses) 121 | 122 | def update_primary_only(self, word: str, stresses: List[int]) -> None: 123 | self.update(word, [Stress(stress, Stress.Type.PRIMARY) for stress in stresses]) -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # rupo documentation build configuration file, created by 5 | # sphinx-quickstart on Sat Mar 18 02:33:48 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | sys.path.insert(0, os.path.abspath("../..")) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ['sphinx.ext.autodoc', 35 | 'sphinx.ext.doctest', 36 | 'sphinx.ext.viewcode', 37 | 'sphinx.ext.githubpages'] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # The suffix(es) of source filenames. 
43 | # You can specify multiple suffix as a list of string: 44 | # 45 | # source_suffix = ['.rst', '.md'] 46 | source_suffix = '.rst' 47 | 48 | # The master toctree document. 49 | master_doc = 'index' 50 | 51 | # General information about the project. 52 | project = 'rupo' 53 | copyright = '2017, Ilya Gusev' 54 | author = 'Ilya Gusev' 55 | 56 | # The version info for the project you're documenting, acts as replacement for 57 | # |version| and |release|, also used in various other places throughout the 58 | # built documents. 59 | # 60 | # The short X.Y version. 61 | version = '0.2.4' 62 | # The full version, including alpha/beta/rc tags. 63 | release = '0.2.4' 64 | 65 | # The language for content autogenerated by Sphinx. Refer to documentation 66 | # for a list of supported languages. 67 | # 68 | # This is also used if you do content translation via gettext catalogs. 69 | # Usually you set "language" from the command line for these cases. 70 | language = 'ru' 71 | 72 | # List of patterns, relative to source directory, that match files and 73 | # directories to ignore when looking for source files. 74 | # This patterns also effect to html_static_path and html_extra_path 75 | exclude_patterns = [] 76 | 77 | # The name of the Pygments (syntax highlighting) style to use. 78 | pygments_style = 'sphinx' 79 | 80 | # If true, `todo` and `todoList` produce output, else they produce nothing. 81 | todo_include_todos = False 82 | 83 | 84 | # -- Options for HTML output ---------------------------------------------- 85 | 86 | # The theme to use for HTML and HTML Help pages. See the documentation for 87 | # a list of builtin themes. 88 | # 89 | html_theme = 'default' 90 | 91 | # Theme options are theme-specific and customize the look and feel of a theme 92 | # further. For a list of options available for each theme, see the 93 | # documentation. 
94 | # 95 | # html_theme_options = {} 96 | 97 | # Add any paths that contain custom static files (such as style sheets) here, 98 | # relative to this directory. They are copied after the builtin static files, 99 | # so a file named "default.css" will overwrite the builtin "default.css". 100 | html_static_path = ['_static'] 101 | 102 | 103 | # -- Options for HTMLHelp output ------------------------------------------ 104 | 105 | # Output file base name for HTML help builder. 106 | htmlhelp_basename = 'rupodoc' 107 | 108 | 109 | # -- Options for LaTeX output --------------------------------------------- 110 | 111 | latex_elements = { 112 | # The paper size ('letterpaper' or 'a4paper'). 113 | # 114 | # 'papersize': 'letterpaper', 115 | 116 | # The font size ('10pt', '11pt' or '12pt'). 117 | # 118 | # 'pointsize': '10pt', 119 | 120 | # Additional stuff for the LaTeX preamble. 121 | # 122 | # 'preamble': '', 123 | 124 | # Latex figure (float) alignment 125 | # 126 | # 'figure_align': 'htbp', 127 | } 128 | 129 | # Grouping the document tree into LaTeX files. List of tuples 130 | # (source start file, target name, title, 131 | # author, documentclass [howto, manual, or own class]). 132 | latex_documents = [ 133 | (master_doc, 'rupo.tex', 'rupo Documentation', 134 | 'Ilya Gusev', 'manual'), 135 | ] 136 | 137 | 138 | # -- Options for manual page output --------------------------------------- 139 | 140 | # One entry per manual page. List of tuples 141 | # (source start file, name, description, authors, manual section). 142 | man_pages = [ 143 | (master_doc, 'rupo', 'rupo Documentation', 144 | [author], 1) 145 | ] 146 | 147 | 148 | # -- Options for Texinfo output ------------------------------------------- 149 | 150 | # Grouping the document tree into Texinfo files. 
# -*- coding: utf-8 -*-
# Author: Ilya Gusev
# Description: Tests for the pattern-expression compiler.

import unittest

from rupo.metre.pattern_analyzer import PatternAnalyzer


class TestPatternAnalyzer(unittest.TestCase):
    def test_pattern_analyzer(self):
        """Pin count_errors(pattern, string) against a table of known outputs."""
        # Each case: (pattern, input string, expected result tuple).
        cases = [
            ("(s)*", "uuu", ('sss', 0, 3, False)),
            ("(s)*", "uus", ('sss', 0, 2, False)),
            ("(s)*", "usu", ('sss', 0, 2, False)),
            ("(s)*", "uss", ('sss', 0, 1, False)),
            ("(s)*", "suu", ('sss', 0, 2, False)),
            ("(s)*", "sus", ('sss', 0, 1, False)),
            ("(s)*", "ssu", ('sss', 0, 1, False)),
            ("(s)*", "sss", ('sss', 0, 0, False)),

            ("(sus)*(u)?", "suu", ('sus', 0, 1, False)),

            ("((sus)*u)*s", "susss", ('susus', 0, 1, False)),

            ("(s((s)*u)*)*", "susss", ('susss', 0, 0, False)),
            ("(s((s)*u)*)*", "usss", ('ssss', 0, 1, False)),
            ("(s((s)*u)*)*", "suuu", ('suuu', 0, 0, False)),
            ("(s((s)*u)*)*", "suuusuuus", ('suuusuuus', 0, 0, False)),

            ("(sss((sus)*uss)*)*", "ssssussususs", ('ssssussususs', 0, 0, False)),
            ("(sss((sus)*uss)*)*", "ssssuuuss", ('ssssususs', 0, 1, False)),

            ("((s)(u)?)*", "uuuu", ('susu', 0, 2, False)),
            ("((s)(u)?)*", "uuus", ('ssus', 0, 2, False)),
            ("((s)(u)?)*", "uusu", ('susu', 0, 1, False)),
            ("((s)(u)?)*", "uuss", ('suss', 0, 1, False)),
            ("((s)(u)?)*", "usuu", ('sssu', 0, 2, False)),
            ("((s)(u)?)*", "usus", ('ssus', 0, 1, False)),
            ("((s)(u)?)*", "ussu", ('sssu', 0, 1, False)),
            ("((s)(u)?)*", "usss", ('ssss', 0, 1, False)),
            ("((s)(u)?)*", "suuu", ('susu', 0, 1, False)),
            ("((s)(u)?)*", "suus", ('ssus', 0, 1, False)),
            ("((s)(u)?)*", "susu", ('susu', 0, 0, False)),
            ("((s)(u)?)*", "suss", ('suss', 0, 0, False)),
            ("((s)(u)?)*", "ssuu", ('sssu', 0, 1, False)),
            ("((s)(u)?)*", "ssus", ('ssus', 0, 0, False)),
            ("((s)(u)?)*", "sssu", ('sssu', 0, 0, False)),
            ("((s)(u)?)*", "ssss", ('ssss', 0, 0, False)),

            ("(s)?(u)?(S)?", "su", ('su', 0, 0, False)),
            ("(s)?(u)?(S)?", "ss", ('su', 0, 1, False)),
            ("(s)?(u)?(S)?", "uS", ('uS', 0, 0, False)),
            ("(s)?(u)?(S)?", "sS", ('sS', 0, 0, False)),

            ("(s)?(u)(s)*", "u", ('u', 0, 0, False)),
            ("(s)?(u)(s)*", "su", ('su', 0, 0, False)),
            ("(s)?(u)(s)*", "us", ('us', 0, 0, False)),
            ("(s)?(u)(s)*", "sus", ('sus', 0, 0, False)),
            ("(s)?(u)(s)*", "uss", ('uss', 0, 0, False)),

            ("(us)*(uS)(U)?(U)?", "usuS", ('usuS', 0, 0, False)),
            ("(us)*(uS)(U)?(U)?", "uSUU", ('uSUU', 0, 0, False)),

            ("(su(u)?)*", "su", ('su', 0, 0, False)),
            ("(su(u)?)*", "suu", ('suu', 0, 0, False)),
            ("(su(u)?)*", "susu", ('susu', 0, 0, False)),
            ("(su(u)?)*", "suusuu", ('suusuu', 0, 0, False)),
            ("(su(u)?)*", "ssussu", ('suusuu', 0, 2, False)),

            ("(u)?(u)?((s)(u)?(u)?)*(S)(U)?(U)?", "sssuSU", ('sssuSU', 0, 0, False)),
            ("(u)?(u)?((s)(u)?(u)?)*(S)(U)?(U)?", "ussuSU", ('ussuSU', 0, 0, False)),
            ("(u)?(u)?((s)(u)?(u)?)*(S)(U)?(U)?", "susuuSU", ('susuuSU', 0, 0, False)),
            ("(u)?(u)?((s)(u)?(u)?)*(S)(U)?(U)?", "uusuuSU", ('uusuuSU', 0, 0, False)),
        ]
        for pattern, string, expected in cases:
            self.assertEqual(PatternAnalyzer.count_errors(pattern, string), expected)
# -*- coding: utf-8 -*-
# Author: Ilya Gusev
# Description: Tokenization module.

import re
from typing import List
from enum import Enum, unique

from rupo.settings import HYPHEN_TOKENS


class Token:
    """
    A single token: a span of the source text together with its type.
    """
    @unique
    class TokenType(Enum):
        """
        Token type.
        """
        UNKNOWN = -1
        WORD = 0
        PUNCTUATION = 1
        SPACE = 2
        ENDLINE = 3
        NUMBER = 4

        def __str__(self):
            return str(self.name)

        def __repr__(self):
            return self.__str__()

    def __init__(self, text: str, token_type: TokenType, begin: int, end: int):
        """
        :param text: token text (slice of the source text).
        :param token_type: type of the token.
        :param begin: start position of the token in the text.
        :param end: end position of the token in the text.
        """
        self.token_type = token_type
        self.begin = begin
        self.end = end
        self.text = text

    def __str__(self):
        return "'" + self.text + "'" + "|" + str(self.token_type) + " (" + str(self.begin) + ", " + str(self.end) + ")"

    def __repr__(self):
        return self.__str__()

    def __eq__(self, other):
        return self.text == other.text and self.token_type == other.token_type

    def __hash__(self):
        # BUG FIX: defining __eq__ without __hash__ made Token unhashable
        # (Python implicitly sets __hash__ to None). Hash over exactly the
        # fields that __eq__ compares, keeping the eq/hash contract.
        return hash((self.text, self.token_type))


class Tokenizer(object):
    """
    Tokenizer for Russian text, aware of punctuation and hyphenated words.
    """
    # Cached contents of the hyphenated-words dictionary: the file is read
    # once per process instead of once per tokenize() call (it used to be
    # re-read from disk on every call via __hyphen_map).
    _hyphen_tokens_cache = None  # type: List[str]

    @staticmethod
    def tokenize(text: str, remove_punct=False, remove_unknown=False, replace_numbers=False) -> List[Token]:
        """
        Tokenization of Russian texts, honouring punctuation and hyphenated words.

        :param text: source text.
        :param remove_punct: drop punctuation tokens from the result.
        :param remove_unknown: drop tokens of unknown type from the result.
        :param replace_numbers: turn every number token into the word token "ЧИСЛО".
        :return: list of tokens.
        """
        tokens = []
        punctuation = ".,?:;!—"
        begin = -1  # start index of the word currently being scanned; -1 when outside a word
        for i, ch in enumerate(text):
            if ch.isalpha() or ch == "-":
                if begin == -1:
                    begin = i
            else:
                if begin != -1:
                    tokens.append(Tokenizer.__form_token(text, begin, i))
                    begin = -1
                token_type = Token.TokenType.UNKNOWN
                if ch in punctuation:
                    token_type = Token.TokenType.PUNCTUATION
                elif ch == "\n":
                    token_type = Token.TokenType.ENDLINE
                elif ch == " ":
                    token_type = Token.TokenType.SPACE
                elif ch.isdigit():
                    token_type = Token.TokenType.NUMBER
                # Merge runs of same-typed characters into a single token.
                if len(tokens) != 0 and tokens[-1].token_type == token_type:
                    tokens[-1].text += ch
                    tokens[-1].end += 1
                else:
                    tokens.append(Token(ch, token_type, i, i + 1))
        if begin != -1:
            # Flush a word that runs to the end of the text.
            tokens.append(Tokenizer.__form_token(text, begin, len(text)))
        tokens = Tokenizer.__hyphen_map(tokens)
        if remove_punct:
            tokens = [token for token in tokens if token.token_type != Token.TokenType.PUNCTUATION]
        if remove_unknown:
            tokens = [token for token in tokens if token.token_type != Token.TokenType.UNKNOWN]
        if replace_numbers:
            for token in tokens:
                if token.token_type != Token.TokenType.NUMBER:
                    continue
                token.text = "ЧИСЛО"
                token.token_type = Token.TokenType.WORD
        return tokens

    @staticmethod
    def __form_token(text, begin, end):
        """Build a WORD token from text[begin:end]; a lone '-' is punctuation."""
        word = text[begin:end]
        if word != "-":
            return Token(word, Token.TokenType.WORD, begin, end)
        else:
            return Token("-", Token.TokenType.PUNCTUATION, begin, begin + 1)

    @staticmethod
    def __hyphen_map(tokens: List[Token]) -> List[Token]:
        """
        Words from the hyphen dictionary keep their hyphen; all others are split.

        :param tokens: tokens.
        :return: tokens after processing.
        """
        new_tokens = []
        hyphen_tokens = Tokenizer.__get_hyphen_tokens()
        for token in tokens:
            if token.token_type != Token.TokenType.WORD:
                new_tokens.append(token)
                continue
            is_one_word = True
            if "-" in token.text:
                is_one_word = False
                for hyphen_token in hyphen_tokens:
                    if hyphen_token in token.text or token.text in hyphen_token:
                        is_one_word = True
            if is_one_word:
                new_tokens.append(token)
            else:
                texts = token.text.split("-")
                pos = token.begin
                for text in texts:
                    new_tokens.append(Token(text, Token.TokenType.WORD, pos, pos + len(text)))
                    pos += len(text) + 1  # +1 skips the hyphen between the parts
        return new_tokens

    @staticmethod
    def __get_hyphen_tokens():
        """
        :return: contents of the dictionary listing words that keep their hyphen.
        """
        # PERF FIX: the dictionary file used to be re-read on every call
        # (i.e. on every tokenize()); cache it at class level.
        if Tokenizer._hyphen_tokens_cache is None:
            with open(HYPHEN_TOKENS, "r", encoding="utf-8") as file:
                Tokenizer._hyphen_tokens_cache = [token.strip() for token in file]
        return Tokenizer._hyphen_tokens_cache


class SentenceTokenizer(object):
    """Splits a text into sentences."""
    @staticmethod
    def tokenize(text: str) -> List[str]:
        """
        :param text: source text.
        :return: list of sentences.
        """
        # Split on spaces that follow a sentence terminator and precede an
        # uppercase Cyrillic letter; the lookbehind avoids splitting right
        # after short abbreviation-like contexts.
        m = re.split(r'(?<=[^А-ЯЁ].[^А-ЯЁ][.?!;]) +(?=[А-ЯЁ])', text)
        return m
class Reader(object):
    """
    Reading markups, texts and vocabularies from files.
    """
    @staticmethod
    def read_markups(path: str, source_type: "FileType", is_processed: bool,
                     stress_predictor: "StressPredictor"=None) -> Iterator["Markup"]:
        """
        Read markups (including markup of raw texts on the fly).

        :param path: path to a file or a folder.
        :param source_type: type of the files.
        :param is_processed: are the texts already marked up?
        :param stress_predictor: stress classifier (needed for unmarked texts).
        """
        paths = Reader.get_paths(path, source_type.value)
        for filename in paths:
            with open(filename, "r", encoding="utf-8") as file:
                if is_processed:
                    if source_type == FileType.XML:
                        for elem in Reader.__xml_iter(file, 'markup'):
                            yield Markup().from_xml(etree.tostring(elem, encoding='utf-8', method='xml'))
                    elif source_type == FileType.JSON:
                        j = json.load(file)
                        for item in j['items']:
                            yield Markup().from_dict(item)
                    elif source_type == FileType.RAW:
                        # Markups are separated by RAW_SEPARATOR, i.e. three
                        # consecutive blank lines. NOTE(review): blank lines
                        # inside a text are not preserved in the yielded text,
                        # matching the original behaviour.
                        separator_count = 0
                        text = ""
                        for line in file:
                            if line == "\n":
                                separator_count += 1
                            else:
                                # BUG FIX: a content line interrupts a separator
                                # run; the counter used to accumulate across the
                                # whole file.
                                separator_count = 0
                                text += line
                            if separator_count == 3:
                                separator_count = 0
                                yield Markup().from_raw(text)
                                # BUG FIX: text was never reset, so every markup
                                # accumulated all previous texts and the trailing
                                # yield duplicated everything read so far.
                                text = ""
                        if text != "":
                            yield Markup().from_raw(text)
                else:
                    assert stress_predictor is not None
                    for text in Reader.read_texts(filename, source_type):
                        yield Reader.__markup_text(text, stress_predictor)

    @staticmethod
    def read_vocabulary(path: str):
        """
        Read a vocabulary.

        :param path: path to the vocabulary.
        :return: (word, word index) pairs.
        """
        paths = Reader.get_paths(path, FileType.VOCAB.value)
        for filename in paths:
            with open(filename, "r", encoding="utf-8") as file:
                for line in file:
                    fields = line.strip().split('\t')
                    yield Markup().from_raw(fields[0]).lines[0].words[0], int(fields[1])

    @staticmethod
    def read_texts(path: str, source_type: "FileType") -> Iterator[str]:
        """
        Read texts.

        :param path: path to a file or a folder.
        :param source_type: type of the files.
        """
        paths = Reader.get_paths(path, source_type.value)
        for filename in paths:
            with open(filename, "r", encoding="utf-8") as file:
                if source_type == FileType.XML:
                    for elem in Reader.__xml_iter(file, 'item'):
                        yield elem.find(".//text").text
                elif source_type == FileType.JSON:
                    # TODO: lazy parsing
                    j = json.load(file)
                    for item in j['items']:
                        yield item['text']
                elif source_type == FileType.RAW:
                    text = file.read()
                    for t in text.split(RAW_SEPARATOR):
                        yield t

    @staticmethod
    def get_paths(path: str, ext: str) -> Iterator[str]:
        """
        Yield every file of the given extension under the given path.

        :param path: path to a file or a folder.
        :param ext: required extension (with the leading dot).
        """
        if os.path.isfile(path):
            if ext == os.path.splitext(path)[1]:
                yield path
        else:
            # BUG FIX: os.walk already descends into subdirectories. The old
            # manual `return Reader.get_paths(folder, ext)` exited the generator
            # after the first directory level, silently skipping every file in
            # subfolders (and recursed on a relative folder name to boot).
            for root, folders, files in os.walk(path):
                for file_name in files:
                    if ext == os.path.splitext(file_name)[1]:
                        yield os.path.join(root, file_name)

    @staticmethod
    def __markup_text(text: str, stress_predictor: "StressPredictor") -> "Markup":
        """
        Mark up a single text.

        :param text: the text.
        :param stress_predictor: stress classifier.
        :return: the markup, improved by the metre classifier.
        """
        markup = Markup.process_text(text, stress_predictor)
        markup = MetreClassifier.improve_markup(markup)[0]
        return markup

    @staticmethod
    def __xml_iter(file, tag):
        """
        :param file: xml file.
        :param tag: tag of interest.
        :return: all elements with the given tag in the xml.
        """
        return (elem for event, elem in etree.iterparse(file, events=['end']) if event == 'end' and elem.tag == tag)
40 | text = "Горит восток зарёю новой.\n" \ 41 | "Уж на равнине, по холмам\n" \ 42 | "Грохочут пушки. Дым багровый\n" \ 43 | "Кругами всходит к небесам." 44 | markup, result = MetreClassifier.improve_markup(Markup.process_text(text, self.stress_predictor)) 45 | self.assertIsInstance(markup, Markup) 46 | self.assertIsInstance(result, ClassificationResult) 47 | self.assertEqual(result.metre, "iambos") 48 | 49 | def test_metre_classifier2(self): 50 | text = "Буря мглою небо кроет,\n" \ 51 | "Вихри снежные крутя;\n" \ 52 | "То, как зверь, она завоет,\n" \ 53 | "То заплачет, как дитя..." 54 | markup, result = MetreClassifier.improve_markup(Markup.process_text(text, self.stress_predictor)) 55 | self.assertEqual(result.metre, "choreios") 56 | 57 | def test_metre_classifier3(self): 58 | text = "На стеклах нарастает лед,\n"\ 59 | "Часы твердят: «Не трусь!»\n"\ 60 | "Услышать, что ко мне идет,\n"\ 61 | "И мертвой я боюсь.\n"\ 62 | "Как идола, молю я дверь;\n"\ 63 | "«Не пропускай беду!»\n"\ 64 | "Кто воет за стеной, как зверь,\n"\ 65 | "Кто прячется в саду?" 66 | markup, result = MetreClassifier.improve_markup(Markup.process_text(text, self.stress_predictor)) 67 | self.assertEqual(result.metre, "iambos") 68 | 69 | def test_metre_classifier4(self): 70 | text = "Вот уж вечер. Роса\n" \ 71 | "Блестит на крапиве.\n"\ 72 | "Я стою у дороги,\n"\ 73 | "Прислонившись к иве.\n"\ 74 | "От луны свет большой\n"\ 75 | "Прямо на нашу крышу.\n"\ 76 | "Где-то песнь соловья\n"\ 77 | "Хорошо и тепло,\n"\ 78 | "Как зимой у печки.\n"\ 79 | "И березы стоят,\n"\ 80 | "Как большие свечки.\n"\ 81 | "И вдали за рекой,\n"\ 82 | "Видно, за опушкой,\n"\ 83 | "Сонный сторож стучит\n"\ 84 | "Мертвой колотушкой." 
85 | markup, result = MetreClassifier.improve_markup(Markup.process_text(text, self.stress_predictor)) 86 | self.assertTrue(result.metre == "dolnik3" or result.metre == "dolnik2") 87 | 88 | def test_metre_classifier5(self): 89 | text = "Глыбу кварца разбили молотом,\n" \ 90 | "И, веселым огнем горя,\n" \ 91 | "Заблестели крупинки золота\n" \ 92 | "В свете тусклого фонаря.\n" \ 93 | "И вокруг собрались откатчики:\n" \ 94 | "Редкий случай, чтоб так, в руде!\n" \ 95 | "И от ламп заплясали зайчики,\n" \ 96 | "Отражаясь в черной воде...\n" \ 97 | "Прислонившись к мокрой стене,\n" \ 98 | "Мы стояли вокруг.\n" \ 99 | "Курили,\n" \ 100 | "Прислонившись к мокрой стене,\n" \ 101 | "И мечтательно говорили\n" \ 102 | "Не о золоте — о весне.\n" \ 103 | "И о том, что скоро, наверно,\n" \ 104 | "На заливе вспотеет лед\n" \ 105 | "И, снега огласив сиреной,\n" \ 106 | "Наконец придет пароход...\n" \ 107 | "Покурили еще немного,\n" \ 108 | "Золотинки в кисет смели\n" \ 109 | "И опять — по своим дорогам,\n" \ 110 | "К вагонеткам своим пошли.\n" \ 111 | "Что нам золото? В дни тяжелые\n" \ 112 | "Я от жадности злой не слеп.\n" \ 113 | "Самородки большие, желтые\n" \ 114 | "Отдавал за табак и хлеб.\n" \ 115 | "Не о золоте были мысли...\n" \ 116 | "В ночь таежную у костра\n" \ 117 | "Есть над чем поразмыслить в жизни,\n" \ 118 | "Кроме\n" \ 119 | "Золота-серебра." 120 | markup, result = MetreClassifier.improve_markup(Markup.process_text(text, self.stress_predictor)) 121 | self.assertTrue(result.metre == "dolnik3" or result.metre == "dolnik2") 122 | 123 | def test_metre_classifier6(self): 124 | text = "Лючинь печальная читала вечером ручьисто-вкрадчиво,\n" \ 125 | "Так чутко чувствуя журчащий вычурно чужой ей плач,\n" \ 126 | "И, в человечестве чтя нечто вечное, чем чушь Бокаччио,\n" \ 127 | "От чар отчаянья кручинно-скучная, чла час удач." 
128 | markup, result = MetreClassifier.improve_markup(Markup.process_text(text, self.stress_predictor)) 129 | self.assertTrue(result.metre == "iambos") 130 | 131 | def test_improve(self): 132 | text = "Буря мглою небо кроет,\n" \ 133 | "Вихри снежные крутя;\n" \ 134 | "То, как зверь, она завоет,\n" \ 135 | "То заплачет, как дитя..." 136 | initial_markup = Markup.process_text(text, self.stress_predictor) 137 | markup, result = MetreClassifier.improve_markup(copy.deepcopy(initial_markup)) 138 | self.assertNotEqual(markup.lines[0].words[0].syllables[0].stress, -1) 139 | self.assertEqual(markup.lines[0].words[0].syllables[1].stress, -1) 140 | 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /rupo/data/examples/markup.xml: -------------------------------------------------------------------------------- 1 | 20Забывши волнения жизни мятежной,330-10За2132бы42-14вши70Забывши70-10во2142лне52-15ни73-17я88волнения16010жи21-12зни517жизни220-10мя2132те42-14жной823мятежной3133Один жил в пустыне рыбак молодой.670-10О1121дин433Один37010жил338жил4142в430-10пу2142сты52-15не744пустыне510-10ры2132бак552рыбак570-10мо21-12ло4254дой758молодой6567Однажды на скале прибрежной,960-10О1131дна42-14жды767Однажды74010на275на77020ска31-13ле578скале830-10при3153бре62-16жной1084прибрежной9496Над тихой прозрачной рекой123010Над396Над99010ти21-12хой5100тихой1050-10про3153зра62-16чной10106прозрачной1160-10ре2132кой5117рекой122123Он с удой беспечно142000Он2123Он125126с127000у11-11дой4128удой1320-10бе2142спе52-15чно8133беспечно141142Сидел1480-10Си2132дел5142Сидел147148И думой сердечной166000И1148И149010ду21-12мой5150думой1550-10сер3143де52-15чной9156сердечной165166К прошедшему счастью летел.193166К1670-10про3143ше52-15дше83-18му10168прошедшему178020сча31-13стью7179счастью1860-10ле2132тел5187летел192Забывши волнения жизни мятежной,\nОдин жил в пустыне рыбак молодой.\nОднажды на скале прибрежной,\nНад тихой прозрачной рекой\nОн с удой 
# -*- coding: utf-8 -*-
# Author: Ilya Gusev
# Description: A set of external methods for working with the library.

import os
from typing import List, Tuple, Dict

from rulm.language_model import LanguageModel

from rupo.files.reader import FileType, Reader
from rupo.files.writer import Writer
from rupo.main.markup import Markup
from rupo.metre.metre_classifier import MetreClassifier, ClassificationResult
from rupo.rhymes.rhymes import Rhymes
from rupo.settings import ZALYZNYAK_DICT, CMU_DICT, DATA_DIR, DICT_DIR
from rupo.stress.predictor import StressPredictor, CombinedStressPredictor
from rupo.main.vocabulary import StressVocabulary, inflate_stress_vocabulary
from rupo.generate.generator import Generator

from allennlp.data.vocabulary import Vocabulary, DEFAULT_OOV_TOKEN
from allennlp.common.util import END_SYMBOL
from rulm.transform import ExcludeTransform
from russ.syllables import get_syllables


class Engine:
    """
    Facade over the library: stress prediction, syllable splitting, markup,
    metre classification, rhyme detection and poem generation.
    Predictors, vocabularies and the generator are created lazily and cached.
    """
    def __init__(self, language="ru"):
        # Target language code; presumably only "ru" has bundled resources — TODO confirm.
        self.language = language  # type: str
        self.vocabulary = None  # type: StressVocabulary
        self.generator = None  # type: Generator
        self.stress_predictors = dict()  # type: Dict[str, StressPredictor]

    def load(self, stress_model_path: str, zalyzniak_dict: str, raw_stress_dict_path=None,
             stress_trie_path=None):
        """
        Initialize the engine: make sure the data directories exist and
        build the stress predictor for the engine's language.

        :param stress_model_path: path to the stress model.
        :param zalyzniak_dict: path to Zaliznyak's dictionary.
        :param raw_stress_dict_path: path to the raw stress dictionary (optional).
        :param stress_trie_path: path to the stress trie dump (optional).
        """
        self.stress_predictors = dict()
        if not os.path.isdir(DATA_DIR):
            os.makedirs(DATA_DIR)
        if not os.path.isdir(DICT_DIR):
            os.makedirs(DICT_DIR)
        self.get_stress_predictor(self.language, stress_model_path, raw_stress_dict_path,
                                  stress_trie_path, zalyzniak_dict)

    def get_vocabulary(self, dump_path: str, markup_path: str) -> StressVocabulary:
        """
        Lazily load (from a dump) or build (from markups) the stress vocabulary.

        :param dump_path: path to the vocabulary dump.
        :param markup_path: path to markups; used only when no dump exists.
        :return: the cached vocabulary.
        """
        if self.vocabulary is None:
            self.vocabulary = StressVocabulary()
            if os.path.isfile(dump_path):
                self.vocabulary.load(dump_path)
            elif markup_path is not None:
                self.vocabulary.parse(markup_path)
        return self.vocabulary

    def get_generator(self,
                      model_path: str,
                      token_vocab_path: str,
                      stress_vocab_dump_path: str) -> Generator:
        """
        Lazily build the poem generator: load the language model and the token
        vocabulary, and inflate (or load) the stress vocabulary.

        :param model_path: path to the language-model directory.
        :param token_vocab_path: path to the token vocabulary directory.
        :param stress_vocab_dump_path: path to the stress vocabulary dump.
        :return: the cached generator.
        """
        if self.generator is None:
            assert os.path.isdir(model_path) and os.path.isdir(token_vocab_path)
            vocabulary = Vocabulary.from_files(token_vocab_path)
            stress_vocabulary = StressVocabulary()
            if not os.path.isfile(stress_vocab_dump_path):
                # First run: derive stresses for every vocabulary token and dump them.
                stress_vocabulary = inflate_stress_vocabulary(vocabulary, self.get_stress_predictor())
                stress_vocabulary.save(stress_vocab_dump_path)
            else:
                stress_vocabulary.load(stress_vocab_dump_path)

            # Never sample the OOV or end-of-sequence token during generation.
            eos_index = vocabulary.get_token_index(END_SYMBOL)
            unk_index = vocabulary.get_token_index(DEFAULT_OOV_TOKEN)
            exclude_transform = ExcludeTransform((unk_index, eos_index))

            model = LanguageModel.load(model_path, vocabulary_dir=token_vocab_path,
                                       transforms=[exclude_transform, ])
            self.generator = Generator(model, vocabulary, stress_vocabulary, eos_index)
        return self.generator

    def get_stress_predictor(self, language="ru", stress_model_path: str=None, raw_stress_dict_path=None,
                             stress_trie_path=None, zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT):
        """
        Lazily build and cache a stress predictor per language.

        :param language: language code.
        :param stress_model_path: path to the stress model (optional).
        :param raw_stress_dict_path: path to the raw stress dictionary (optional).
        :param stress_trie_path: path to the stress trie dump (optional).
        :param zalyzniak_dict: path to Zaliznyak's dictionary.
        :param cmu_dict: path to the CMU pronouncing dictionary.
        :return: the cached predictor for the language.
        """
        if self.stress_predictors.get(language) is None:
            self.stress_predictors[language] = CombinedStressPredictor(language, stress_model_path,
                                                                       raw_stress_dict_path, stress_trie_path,
                                                                       zalyzniak_dict, cmu_dict)
        return self.stress_predictors[language]

    def get_stresses(self, word: str, language: str="ru") -> List[int]:
        """
        :param word: the word.
        :param language: the language.
        :return: stress positions of the word.
        """
        return self.get_stress_predictor(language).predict(word)

    @staticmethod
    def get_word_syllables(word: str) -> List[str]:
        """
        :param word: the word.
        :return: its syllables.
        """
        return [syllable.text for syllable in get_syllables(word)]

    @staticmethod
    def count_syllables(word: str) -> int:
        """
        :param word: the word.
        :return: number of syllables in it.
        """
        return len(get_syllables(word))

    def get_markup(self, text: str, language: str="ru") -> Markup:
        """
        :param text: the text.
        :param language: the language.
        :return: its dictionary-based markup.
        """
        return Markup.process_text(text, self.get_stress_predictor(language))

    def get_improved_markup(self, text: str, language: str="ru") -> Tuple[Markup, ClassificationResult]:
        """
        :param text: the text.
        :param language: the language.
        :return: its markup by the dictionary, the metre classifier and the ML classifier.
        """
        markup = Markup.process_text(text, self.get_stress_predictor(language))
        return MetreClassifier.improve_markup(markup)

    def classify_metre(self, text: str, language: str="ru") -> str:
        """
        :param text: the text.
        :param language: the language.
        :return: its metre.
        """
        return MetreClassifier.classify_metre(Markup.process_text(text, self.get_stress_predictor(language))).metre

    def generate_markups(self, input_path: str, input_type: FileType, output_path: str, output_type: FileType) -> None:
        """
        Generate markups from texts.

        :param input_path: path to the folder/file with the texts.
        :param input_type: type of the text files.
        :param output_path: path to the file with the resulting markups.
        :param output_type: type of the output file.
        """
        markups = Reader.read_markups(input_path, input_type, False, self.get_stress_predictor())
        writer = Writer(output_type, output_path)
        writer.open()
        for markup in markups:
            writer.write_markup(markup)
        writer.close()

    def is_rhyme(self, word1: str, word2: str) -> bool:
        """
        :param word1: first word.
        :param word2: second word.
        :return: whether the words rhyme.
        """
        markup_word1 = self.get_markup(word1).lines[0].words[0]
        markup_word1.set_stresses(self.get_stresses(word1))
        markup_word2 = self.get_markup(word2).lines[0].words[0]
        markup_word2.set_stresses(self.get_stresses(word2))
        return Rhymes.is_rhyme(markup_word1, markup_word2)

    def generate_poem(self,
                      model_path: str,
                      token_vocab_path: str=None,
                      stress_vocab_path: str=None,
                      metre_schema: str="-+",
                      rhyme_pattern: str="abab",
                      n_syllables: int=8,
                      sampling_k: int=None,
                      beam_width: int=None,
                      seed: int=1337,
                      temperature: float=1.0,
                      last_text: str="") -> str:
        """
        Generate a poem. Either sampling_k or beam_width must be set.

        :param model_path: path to the model.
        :param token_vocab_path: path to the vocabulary.
        :param stress_vocab_path: path to the stress vocabulary.
        :param metre_schema: metre schema.
        :param rhyme_pattern: rhyme pattern.
        :param n_syllables: number of syllables per line.
        :param sampling_k: top-k for sampling.
        :param beam_width: beam search width.
        :param seed: random seed.
        :param temperature: generation temperature.
        :param last_text: the final line.
        :return: the poem. None if generation was not successful.
        """
        token_vocab_path = token_vocab_path or os.path.join(model_path, "vocabulary")
        stress_vocab_path = stress_vocab_path or os.path.join(model_path, "stress.pickle")
        generator = self.get_generator(model_path, token_vocab_path, stress_vocab_path)
        poem = generator.generate_poem(
            metre_schema=metre_schema,
            rhyme_pattern=rhyme_pattern,
            n_syllables=n_syllables,
            sampling_k=sampling_k,
            beam_width=beam_width,
            temperature=temperature,
            seed=seed,
            last_text=last_text
        )
        return poem

    def get_word_rhymes(self, word: str, vocab_dump_path: str, markup_path: str=None) -> List[str]:
        """
        Find rhymes for the given word.

        :param word: the word.
        :param vocab_dump_path: path where the vocabulary dump is stored.
        :param markup_path: path to markups.
        :return: list of rhymes.
        """
        markup_word = self.get_markup(word).lines[0].words[0]
        markup_word.set_stresses(self.get_stresses(word))
        rhymes = []
        vocabulary = self.get_vocabulary(vocab_dump_path, markup_path)
        for i in range(vocabulary.size()):
            if Rhymes.is_rhyme(markup_word, vocabulary.get_word(i)):
                rhymes.append(vocabulary.get_word(i).text.lower())
        return rhymes
# -*- coding: utf-8 -*-
# Author: Ilya Gusev
# Description: Module describing the stress-and-syllable markup.

import json
from typing import List, Set
import xml.etree.ElementTree as etree

from dicttoxml import dicttoxml

from rupo.util.preprocess import get_first_vowel_position
from rupo.util.mixins import CommonMixin
from rupo.main.tokenizer import Tokenizer, Token
from rupo.util.timeit import timeit
from russ.syllables import get_syllables


class Annotation(CommonMixin):
    """
    Base annotation: a begin/end span in the source text plus the span's text.
    """
    def __init__(self, begin: int, end: int, text: str) -> None:
        self.begin = begin
        self.end = end
        self.text = text


class Syllable(Annotation):
    """
    Syllable markup: the annotation plus the syllable's ordinal number and its
    stress. The stress is -1 when it does not fall on this syllable.
    """
    def __init__(self, begin: int, end: int, number: int, text: str, stress: int=-1) -> None:
        super(Syllable, self).__init__(begin, end, text)
        self.number = number
        self.stress = stress

    def vowel(self) -> int:
        """
        :return: position (from 0) of this syllable's vowel within the word.
        """
        return self.begin + get_first_vowel_position(self.text)

    def from_dict(self, d: dict) -> 'Syllable':
        self.__dict__.update(d)
        # Older dumps stored the stress under the legacy "accent" key.
        attrs = vars(self)
        if "accent" in attrs:
            self.stress = attrs["accent"]
        return self


class Word(Annotation):
    """
    Word markup: the annotation of the word plus its syllables.
    """
    def __init__(self, begin: int, end: int, text: str, syllables: List[Syllable]) -> None:
        super(Word, self).__init__(begin, end, text)
        self.syllables = syllables

    def count_stresses(self) -> int:
        """
        :return: number of stressed syllables in the word.
        """
        return sum(1 for syllable in self.syllables if syllable.stress != -1)

    def stress(self) -> int:
        """
        :return: the last stress in the word, or -1 if there is none.
        """
        for syllable in reversed(self.syllables):
            if syllable.stress != -1:
                return syllable.stress
        return -1

    def get_stressed_syllables_numbers(self) -> List[int]:
        """
        :return: ordinal numbers of the syllables carrying a stress.
        """
        return [syllable.number for syllable in self.syllables if syllable.stress != -1]

    def get_stresses(self) -> Set[int]:
        """
        :return: every stress position in the word.
        """
        return {syllable.stress for syllable in self.syllables if syllable.stress != -1}

    def set_stresses(self, stresses: List[int]) -> None:
        """
        Set the given stresses; every other stress is cleared.

        :param stresses: stress positions within the word.
        """
        for syllable in self.syllables:
            vowel_pos = syllable.vowel()
            syllable.stress = vowel_pos if vowel_pos in stresses else -1

    def get_short(self) -> str:
        """
        :return: the word in the form "text" + "last stress".
        """
        return "{}{}".format(self.text.lower(), self.stress())

    def from_dict(self, d: dict) -> 'Word':
        self.__dict__.update(d)
        raw_syllables = d["syllables"]  # type: List[dict]
        self.syllables = [Syllable(0, 0, 0, "").from_dict(item) for item in raw_syllables]
        return self

    def to_stressed_word(self):
        # Imported lazily to avoid a circular dependency with rupo.stress.word.
        from rupo.stress.word import StressedWord, Stress
        return StressedWord(self.text, {Stress(pos, Stress.Type.PRIMARY) for pos in self.get_stresses()})

    def __hash__(self) -> int:
        """
        :return: hash of the markup's short form.
        """
        return hash(self.get_short())
130 | """ 131 | def __init__(self, begin: int, end: int, text: str, words: List[Word]) -> None: 132 | super(Line, self).__init__(begin, end, text) 133 | self.words = words 134 | 135 | def from_dict(self, d) -> 'Line': 136 | self.__dict__.update(d) 137 | words = d["words"] # type: List[dict] 138 | self.words = [Word(0, 0, "", []).from_dict(word) for word in words] 139 | return self 140 | 141 | def count_vowels(self): 142 | num_vowels = 0 143 | for word in self.words: 144 | for syllable in word.syllables: 145 | if get_first_vowel_position(syllable.text) != -1: 146 | num_vowels += 1 147 | return num_vowels 148 | 149 | 150 | class Markup(CommonMixin): 151 | """ 152 | Класс данных для разметки в целом с экспортом/импортом в XML и JSON. 153 | """ 154 | def __init__(self, text: str=None, lines: List[Line]=None) -> None: 155 | self.text = text 156 | self.lines = lines 157 | self.version = 2 158 | 159 | def to_json(self) -> str: 160 | return json.dumps(self.to_dict(), ensure_ascii=False) 161 | 162 | def from_json(self, st) -> 'Markup': 163 | d = json.loads(st) 164 | return self.from_dict(d) 165 | 166 | def from_dict(self, d) -> 'Markup': 167 | self.__dict__.update(d) 168 | lines = d["lines"] # type: List[dict] 169 | self.lines = [Line(0, 0, "", []).from_dict(line) for line in lines] 170 | return self 171 | 172 | def to_xml(self) -> str: 173 | """ 174 | Экспорт в XML. 175 | 176 | :return self: строка в формате XML 177 | """ 178 | return dicttoxml(self.to_dict(), custom_root='markup', attr_type=False).decode('utf-8').replace("\n", "\\n") 179 | 180 | def from_xml(self, xml: str) -> 'Markup': 181 | """ 182 | Импорт из XML. 
183 | 184 | :param xml: XML-разметка 185 | :return self: получившийся объект Markup 186 | """ 187 | root = etree.fromstring(xml) 188 | if root.find("version") is None or int(root.find("version").text) != self.version: 189 | raise TypeError("Другая версия разметки") 190 | lines_node = root.find("lines") 191 | lines = [] 192 | for line_node in lines_node.findall("item"): 193 | words_node = line_node.find("words") 194 | words = [] 195 | for word_node in words_node.findall("item"): 196 | syllables_node = word_node.find("syllables") 197 | syllables = [] 198 | for syllable_node in syllables_node.findall("item"): 199 | stress_node = syllable_node.find("accent") \ 200 | if syllable_node.find("accent") is not None \ 201 | else syllable_node.find("stress") 202 | stress = int(stress_node.text) 203 | syllables.append(Syllable(int(syllable_node.find("begin").text), 204 | int(syllable_node.find("end").text), 205 | int(syllable_node.find("number").text), 206 | syllable_node.find("text").text, 207 | stress)) 208 | words.append(Word(int(word_node.find("begin").text), int(word_node.find("end").text), 209 | word_node.find("text").text, syllables)) 210 | lines.append(Line(int(line_node.find("begin").text), int(line_node.find("end").text), 211 | line_node.find("text").text, words)) 212 | self.text = root.find("text").text.replace("\\n", "\n") 213 | self.lines = lines 214 | return self 215 | 216 | def from_raw(self, text: str) -> 'Markup': 217 | """ 218 | Импорт из сырого текста с ударениями в конце слов 219 | 220 | :param text: текст. 221 | :return: разметка. 
222 | """ 223 | 224 | pos = 0 225 | lines = [] 226 | for line in text.split("\n"): 227 | if line == "": 228 | continue 229 | line_tokens = [] 230 | for word in line.split(" "): 231 | i = -1 232 | ch = word[i] 233 | stress = "" 234 | while ch.isdigit() or ch == "-": 235 | stress += ch 236 | i -= 1 237 | ch = word[i] 238 | line_tokens.append((word[:i+1], int(stress[::-1]))) 239 | words = [] 240 | line_begin = pos 241 | for pair in line_tokens: 242 | token = pair[0] 243 | stress = pair[1] 244 | syllables = get_syllables(token) 245 | for j in range(len(syllables)): 246 | syllables[j].begin += pos 247 | syllables[j].end += pos 248 | word = Word(pos, pos + len(token), token, syllables) 249 | word.set_stresses([stress]) 250 | words.append(word) 251 | pos += len(token) + 1 252 | lines.append(Line(line_begin, pos, " ".join([pair[0] for pair in line_tokens]), words)) 253 | self.text = "\n".join([line.text for line in lines]) 254 | self.lines = lines 255 | return self 256 | 257 | @staticmethod 258 | @timeit 259 | def process_text(text: str, stress_predictor) -> 'Markup': 260 | """ 261 | Получение начального варианта разметки по слогам и ударениям. 262 | 263 | :param text: текст для разметки 264 | :param stress_predictor: предсказатель ударений. 265 | :return markup: разметка по слогам и ударениям 266 | """ 267 | begin_line = 0 268 | lines = [] 269 | words = [] 270 | text_lines = text.split("\n") 271 | for text_line in text_lines: 272 | tokens = [token for token in Tokenizer.tokenize(text_line) if token.token_type == Token.TokenType.WORD] 273 | for token in tokens: 274 | word = Word(begin_line + token.begin, begin_line + token.end, token.text, get_syllables(token.text)) 275 | # Проставляем ударения. 276 | stresses = stress_predictor.predict(token.text.lower()) 277 | # Сопоставляем ударения слогам. 
278 | if len(word.syllables) > 1: 279 | word.set_stresses(stresses) 280 | words.append(word) 281 | end_line = begin_line + len(text_line) 282 | lines.append(Line(begin_line, end_line, text_line, words)) 283 | words = [] 284 | begin_line = end_line + 1 285 | return Markup(text, lines) 286 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright {yyyy} {name of copyright owner}

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--------------------------------------------------------------------------------
/rupo/metre/pattern_analyzer.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Author: Ilya Gusev
# Description: matching a string against a metre pattern.

from typing import List, Set, Tuple


class TreeNode:
    """
    A node of the pattern parse tree.
    """
    # Leaf characters: unstressed/stressed syllables; uppercase means strict.
    leaf_chars = "usUS"
    # Non-leaf characters: repetition (*), option (?), word marker (w).
    non_leaf_chars = "*?w"

    def __init__(self, parent: 'TreeNode', children: List['TreeNode'], text: str, pattern_pos: int):
        """
        :param parent: parent node.
        :param children: child nodes.
        :param text: the character this node stands for.
        :param pattern_pos: position of the character in the pattern.
        """
        self.parent = parent  # type: TreeNode
        self.children = children  # type: List[TreeNode]
        self.text = text  # type: str
        self.pattern_pos = pattern_pos  # type: int

    def get_level(self) -> int:
        """
        :return: depth of the node in the tree.
        """
        parent = self.parent
        level = 0
        while parent is not None:
            parent = parent.parent
            level += 1
        return level

    def get_next_sibling(self) -> 'TreeNode':
        """
        :return: the sibling node to the right, or None if this is the last child.
        """
        siblings = self.parent.children
        index = siblings.index(self) + 1
        if index < len(siblings):
            return siblings[index]
        return None

    def get_last_child_leaf(self) -> 'TreeNode':
        """
        :return: the last child node that is a leaf, or None.
        """
        for child in reversed(self.children):
            if child.is_leaf():
                return child
        return None

    def is_first_leaf(self) -> bool:
        # True when this node is the first leaf among its parent's children.
        if not self.is_leaf():
            return False
        return [child for child in self.parent.children if child.is_leaf()][0] == self

    def is_last_leaf(self) -> bool:
        # True when this node is the last leaf among its parent's children.
        if not self.is_leaf():
            return False
        return [child for child in self.parent.children if child.is_leaf()][-1] == self

    def get_most_left_leaf(self) -> 'TreeNode':
        """
        :return: the leftmost descendant leaf of this node.
        """
        node = self
        while len(node.children) != 0:
            node = node.children[0]
        assert node.is_leaf()
        return node

    def print_tree(self) -> None:
        """
        Print the subtree rooted at this node (depth-first; for debugging).
        """
        stack = list()
        stack.append(self)
        while len(stack) != 0:
            current_node = stack.pop()
            print("\t" * current_node.get_level(), current_node)
            stack += current_node.children

    def is_leaf(self) -> bool:
        """
        :return: whether the node is a leaf of the tree.
        """
        return self.text in TreeNode.leaf_chars

    def __str__(self) -> str:
        return self.text + " " + str(self.pattern_pos)

    def __repr__(self) -> str:
        return self.__str__()

    def __hash__(self):
        # Nodes are identified by their position in the pattern string.
        return hash(self.pattern_pos)

    def __eq__(self, other):
        return self.pattern_pos == other.pattern_pos


class State:
    """
    A matching (parsing) state.
    """
    def __init__(self, node: TreeNode, string_pos: int, strong_errors: int, weak_errors: int, pattern: str):
        """
        :param node: tree node this state corresponds to.
        :param string_pos: position in the string being matched.
        :param strong_errors: number of errors on U and S.
        :param weak_errors: number of errors on u and s.
        :param pattern: the pattern (path) that led to this state.
        """
        self.node = node  # type: TreeNode
        self.string_pos = string_pos  # type: int
        self.strong_errors = strong_errors  # type: int
        self.weak_errors = weak_errors  # type: int
        self.pattern = pattern  # type: str

    def __str__(self) -> str:
        return str(self.node) + " " + str(self.string_pos) + " " + str(self.strong_errors) + " " + str(self.weak_errors)

    def __repr__(self) -> str:
        return self.__str__()


class PatternAnalyzer:
    """
    Matches a metre pattern against a syllable string.
    """
    def __init__(self, pattern: str, error_border: int=8):
        """
        :param error_border: error threshold; branches above it are pruned.
        :param pattern: the pattern.
        """
        self.pattern = pattern  # type: str
        self.tree = self.__build_tree(pattern)  # type: TreeNode
        self.error_border = error_border

    @staticmethod
    def count_errors(pattern: str, string: str, error_border: int=8) -> Tuple[str, int, int, bool]:
        """
        :param pattern: the pattern.
        :param string: the string.
        :param error_border: error threshold.
        :return: best pattern, number of strong errors, number of weak errors.
        """
        analyzer = PatternAnalyzer(pattern, error_border)
        return analyzer.__accept(string)

    @staticmethod
    def __build_tree(pattern: str) -> TreeNode:
        """
        Build the parse tree of the pattern.

        :param pattern: the pattern.
        :return: the tree root.
        """
        root_node = TreeNode(None, list(), "R", -1)
        current_node = root_node
        for i, ch in enumerate(pattern):
            if ch == "(":
                node = TreeNode(current_node, list(), "()", i)
                current_node.children.append(node)
                current_node = node
            if ch == ")":
                node = current_node
                current_node = current_node.parent
                # Drop meaningless brackets: groups not followed by * or ?
                # are flattened into the parent.
                if i + 1 < len(pattern) and pattern[i + 1] not in "*?":
                    current_node.children = current_node.children[:-1] + node.children
                    for child in node.children:
                        child.parent = current_node
            if ch in TreeNode.leaf_chars:
                current_node.children.append(TreeNode(current_node, list(), ch, i))
            # Replace the bracket node with the non-terminal that follows it.
            if ch in TreeNode.non_leaf_chars:
                current_node.children[-1].text = ch
                current_node.children[-1].pattern_pos = i
        return root_node

    def __accept(self, string: str) -> Tuple[str, int, int, bool]:
        """
        :param string: the string.
        :return: best pattern, strong errors, weak errors, whether matching failed early.
        """
        current_states = [State(None, -1, 0, 0, "")]
        current_node = self.tree.get_most_left_leaf()
        for i, ch in enumerate(string):
            new_states = []
            for state in current_states:
                if state.node is not None:
                    current_node = self.__get_next_leaf(state.node)
                variants = self.__get_variants(current_node)

                # Every variant becomes a new state.
                for variant in variants:
                    assert variant.is_leaf()
                    strong_errors = state.strong_errors + int(variant.text.isupper() and variant.text != ch)
                    weak_errors = state.weak_errors + int(variant.text.islower() and variant.text != ch.lower())
                    new_state = State(variant, i, strong_errors, weak_errors, state.pattern+variant.text)
                    if new_state.strong_errors + new_state.weak_errors > self.error_border:
                        continue
                    new_states.append(new_state)

            if len(new_states) == 0:
                # We can finish early: either all branches were pruned by the
                # error threshold, or the pattern is shorter than the string.
                current_states = PatternAnalyzer.__filter_states(current_states, self.tree)
                pattern, strong_errors, weak_errors = self.__get_min_errors_from_states(current_states)
                diff = (len(string) - i)
                return pattern, strong_errors + diff, weak_errors + diff, True

            current_states = new_states
        current_states = PatternAnalyzer.__filter_states(current_states, self.tree)
        return self.__get_min_errors_from_states(current_states) + (False,)

    @staticmethod
    def __get_variants(current_node: TreeNode) -> Set[TreeNode]:
        """
        :param current_node: the current node.
        :return: node variants consuming the same string character; they arise
            from * and ? in the pattern.
        """
        variants = set()
        current_variant = current_node
        while current_variant is not None:
            if current_variant not in variants:
                variants.add(current_variant)
            else:
                current_variant = current_variant.parent
            current_variant = PatternAnalyzer.__get_next_variant(current_variant)
        return variants

    @staticmethod
    def __get_next_variant(node: TreeNode) -> TreeNode:
        """
        Get the next variant from the variants of the current node.

        :param node: the current variant.
        :return: the next variant.
        """
        assert node.is_leaf()
        while node.parent is not None:
            parent = node.parent
            grandfather = parent.parent
            uncle = parent.get_next_sibling() if grandfather is not None else None
            is_variable = node.is_first_leaf() or not node.is_leaf()
            if is_variable and uncle is not None:
                return uncle.get_most_left_leaf()
            elif grandfather is not None and grandfather.text == "*" and grandfather.children[-1] == parent:
                # A starred group may repeat: wrap around to its first leaf.
                return grandfather.get_most_left_leaf()
            if is_variable:
                node = parent
            else:
                break
        return None

    @staticmethod
    def __get_next_leaf(node: TreeNode) -> TreeNode:
        """
        Get the leaf following the given one.

        :param node: the current node.
        :return: the next node.
        """
        assert node.is_leaf()
        while node.parent is not None:
            sibling = node.get_next_sibling()
            if sibling is not None:
                return sibling.get_most_left_leaf()
            elif node.parent.text == "*":
                # End of a starred group: it may repeat from its first leaf.
                return node.parent.get_most_left_leaf()
            node = node.parent
        return None

    @staticmethod
    def __filter_states(states: List[State], root: TreeNode) -> List[State]:
        """
        Filter states by the presence of the mandatory terminals.

        :param states: the states.
        :param root: the tree root.
        :return: the filtered states.
        """
        return [state for state in states if root.get_last_child_leaf() is None or
                state.node.pattern_pos >= root.get_last_child_leaf().pattern_pos]

    @staticmethod
    def __get_min_errors_from_states(states: List[State]) -> Tuple[str, int, int]:
        """
        :param states: the states.
        :return: лучший шаблон, количество сильных ошибок, количество слабых ошибок.
297 | """ 298 | if len(states) == 0: 299 | return "", 0, 0 300 | return min([(state.pattern, state.strong_errors, state.weak_errors) for i, state in enumerate(states)], 301 | key=lambda x: (x[1], x[2], x[0])) 302 | -------------------------------------------------------------------------------- /rupo/metre/metre_classifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Автор: Гусев Илья 3 | # Описание: Классификатор метра. 4 | 5 | from collections import OrderedDict 6 | from typing import List, Dict, Tuple 7 | import jsonpickle 8 | import logging 9 | 10 | from rupo.main.markup import Line, Markup 11 | from rupo.util.mixins import CommonMixin 12 | from rupo.metre.pattern_analyzer import PatternAnalyzer 13 | from rupo.util.preprocess import get_first_vowel_position 14 | from rupo.util.timeit import timeit 15 | 16 | 17 | class StressCorrection(CommonMixin): 18 | """ 19 | Исправление ударения. 20 | """ 21 | def __init__(self, line_number: int, word_number: int, syllable_number: int, 22 | word_text: str, stress: int) -> None: 23 | """ 24 | :param line_number: номер строки. 25 | :param word_number: номер слова. 26 | :param syllable_number: номер слога. 27 | :param word_text: текст слова. 28 | :param stress: позиция ударения (с 0). 29 | """ 30 | self.line_number = line_number 31 | self.word_number = word_number 32 | self.syllable_number = syllable_number 33 | self.word_text = word_text 34 | self.stress = stress 35 | 36 | 37 | class ClassificationResult(CommonMixin): 38 | """ 39 | Результат классификации стихотворения по метру. 40 | """ 41 | def __init__(self, count_lines: int=0) -> None: 42 | """ 43 | :param count_lines: количество строк. 
44 | """ 45 | self.metre = None 46 | self.count_lines = count_lines 47 | self.errors_count = {k: 0 for k in MetreClassifier.metres.keys()} # type: Dict[str, int] 48 | self.corrections = {k: [] for k in MetreClassifier.metres.keys()} # type: Dict[str, List[StressCorrection]] 49 | self.resolutions = {k: [] for k in MetreClassifier.metres.keys()} # type: Dict[str, List[StressCorrection]] 50 | self.additions = {k: [] for k in MetreClassifier.metres.keys()} # type: Dict[str, List[StressCorrection]] 51 | 52 | def get_metre_errors_count(self): 53 | """ 54 | :return: получить количество ошибок на заданном метре. 55 | """ 56 | return self.errors_count[self.metre] 57 | 58 | def to_json(self): 59 | """ 60 | :return: сериализация в json. 61 | """ 62 | return jsonpickle.encode(self) 63 | 64 | @staticmethod 65 | def str_corrections(collection: List[StressCorrection]) -> str: 66 | """ 67 | :param collection: список исправлений. 68 | :return: его строковое представление. 69 | """ 70 | return"\n".join([str((item.word_text, item.syllable_number)) for item in collection]) 71 | 72 | def __str__(self): 73 | st = "Метр: " + str(self.metre) + "\n" 74 | st += "Снятая омография: \n" + ClassificationResult.str_corrections(self.resolutions[self.metre]) + "\n" 75 | st += "Неправильные ударения: \n" + ClassificationResult.str_corrections(self.corrections[self.metre]) + "\n" 76 | st += "Новые ударения: \n" + ClassificationResult.str_corrections(self.additions[self.metre]) + "\n" 77 | return st 78 | 79 | 80 | class ErrorsTableRecord: 81 | def __init__(self, strong_errors, weak_errors, pattern, failed=False): 82 | self.strong_errors = strong_errors 83 | self.weak_errors = weak_errors 84 | self.pattern = pattern 85 | self.failed = failed 86 | 87 | def __str__(self): 88 | return self.pattern + " " + str(self.strong_errors) + " " + str(self.weak_errors) 89 | 90 | def __repr__(self): 91 | return self.__str__() 92 | 93 | 94 | class ErrorsTable: 95 | def __init__(self, num_lines): 96 | self.data = {} 
97 | self.num_lines = num_lines 98 | self.coef = OrderedDict( 99 | [("iambos", 0.3), 100 | ("choreios", 0.3), 101 | ("daktylos", 0.4), 102 | ("amphibrachys", 0.4), 103 | ("anapaistos", 0.4), 104 | ("dolnik3", 0.5), 105 | ("dolnik2", 0.5), 106 | ("taktovik3", 6.0), 107 | ("taktovik2", 6.0) 108 | ]) 109 | self.sum_coef = OrderedDict( 110 | [("iambos", 0.0), 111 | ("choreios", 0.0), 112 | ("daktylos", 0.0), 113 | ("amphibrachys", 0.0), 114 | ("anapaistos", 0.0), 115 | ("dolnik3", 0.035), 116 | ("dolnik2", 0.035), 117 | ("taktovik3", 0.10), 118 | ("taktovik2", 0.10) 119 | ]) 120 | for metre_name in MetreClassifier.metres.keys(): 121 | self.data[metre_name] = [ErrorsTableRecord(0, 0, "") for _ in range(num_lines)] 122 | 123 | def add_record(self, metre_name, line_num, strong_errors, weak_errors, pattern, failed=False): 124 | self.data[metre_name][line_num] = ErrorsTableRecord(strong_errors, weak_errors, pattern, failed) 125 | 126 | def get_best_metre(self): 127 | for l in range(self.num_lines): 128 | strong_sum = 0 129 | weak_sum = 0 130 | for metre_name in self.data.keys(): 131 | strong_sum += self.data[metre_name][l].strong_errors 132 | weak_sum += self.data[metre_name][l].weak_errors 133 | for metre_name, column in self.data.items(): 134 | if strong_sum != 0: 135 | column[l].strong_errors = column[l].strong_errors / float(strong_sum) 136 | if weak_sum != 0: 137 | column[l].weak_errors = column[l].weak_errors / float(weak_sum) 138 | sums = dict() 139 | for metre_name in self.data.keys(): 140 | sums[metre_name] = (0, 0) 141 | for metre_name, column in self.data.items(): 142 | strong_sum = 0 143 | weak_sum = 0 144 | for l in range(self.num_lines): 145 | strong_sum += column[l].strong_errors 146 | weak_sum += column[l].weak_errors 147 | sums[metre_name] = (strong_sum, weak_sum) 148 | for metre_name, pair in sums.items(): 149 | sums[metre_name] = self.sum_coef[metre_name] + (pair[0] + pair[1] / 2.0) * self.coef[metre_name] / self.num_lines 150 | logging.debug(sums) 151 | 
return min(sums, key=sums.get) 152 | 153 | 154 | class MetreClassifier(object): 155 | """ 156 | Классификатор, считает отклонения от стандартных шаблонов ритма(метров). 157 | """ 158 | metres = OrderedDict( 159 | [("iambos", '(us)*(uS)(U)?(U)?'), 160 | ("choreios", '(su)*(S)(U)?(U)?'), 161 | ("daktylos", '(suu)*(S)(U)?(U)?'), 162 | ("amphibrachys", '(usu)*(uS)(U)?(U)?'), 163 | ("anapaistos", '(uus)*(uuS)(U)?(U)?'), 164 | ("dolnik3", '(u)?(u)?((su)(u)?)*(S)(U)?(U)?'), 165 | ("dolnik2", '(u)?(u)?((s)(u)?)*(S)(U)?(U)?'), 166 | ("taktovik3", '(u)?(u)?((su)(u)?(u)?)*(S)(U)?(U)?'), 167 | ("taktovik2", '(u)?(u)?((s)(u)?(u)?)*(S)(U)?(U)?') 168 | ]) 169 | 170 | border_syllables_count = 20 171 | 172 | @staticmethod 173 | @timeit 174 | def classify_metre(markup): 175 | """ 176 | Классифицируем стихотворный метр. 177 | 178 | :param markup: разметка. 179 | :return: результат классификации. 180 | """ 181 | result = ClassificationResult(len(markup.lines)) 182 | num_lines = len(markup.lines) 183 | errors_table = ErrorsTable(num_lines) 184 | for l, line in enumerate(markup.lines): 185 | for metre_name, metre_pattern in MetreClassifier.metres.items(): 186 | line_syllables_count = sum([len(word.syllables) for word in line.words]) 187 | 188 | # Строчки длиной больше border_syllables_count слогов не обрабатываем. 
189 | if line_syllables_count > MetreClassifier.border_syllables_count or line_syllables_count == 0: 190 | continue 191 | error_border = 7 192 | if metre_name == "dolnik2" or metre_name == "dolnik3": 193 | error_border = 3 194 | if metre_name == "taktovik2" or metre_name == "taktovik3": 195 | error_border = 2 196 | pattern, strong_errors, weak_errors, analysis_errored = \ 197 | PatternAnalyzer.count_errors(MetreClassifier.metres[metre_name], 198 | MetreClassifier.__get_line_pattern(line), 199 | error_border) 200 | if analysis_errored or len(pattern) == 0: 201 | errors_table.add_record(metre_name, l, strong_errors, weak_errors, pattern, True) 202 | continue 203 | corrections = MetreClassifier.__get_line_pattern_matching_corrections(line, l, pattern)[0] 204 | accentuation_errors = len(corrections) 205 | strong_errors += accentuation_errors 206 | errors_table.add_record(metre_name, l, strong_errors, weak_errors, pattern) 207 | result.metre = errors_table.get_best_metre() 208 | 209 | # Запомним все исправления. 210 | for l, line in enumerate(markup.lines): 211 | pattern = errors_table.data[result.metre][l].pattern 212 | failed = errors_table.data[result.metre][l].failed 213 | if failed or len(pattern) == 0: 214 | continue 215 | corrections, resolutions, additions =\ 216 | MetreClassifier.__get_line_pattern_matching_corrections(line, l, pattern) 217 | result.corrections[result.metre] += corrections 218 | result.resolutions[result.metre] += resolutions 219 | result.additions[result.metre] += additions 220 | result.errors_count[result.metre] += len(corrections) 221 | return result 222 | 223 | @staticmethod 224 | def __get_line_pattern(line: Line) -> str: 225 | """ 226 | Сопоставляем строку шаблону, считаем ошибки. 227 | 228 | :param line: строка. 
229 | :return: количество ошибок 230 | """ 231 | pattern = "" 232 | for w, word in enumerate(line.words): 233 | if len(word.syllables) == 0: 234 | pattern += "U" 235 | else: 236 | for syllable in word.syllables: 237 | if syllable.stress != -1: 238 | pattern += "S" 239 | else: 240 | pattern += "U" 241 | return pattern 242 | 243 | @staticmethod 244 | def __get_line_pattern_matching_corrections(line: Line, line_number: int, pattern: str) \ 245 | -> Tuple[List[StressCorrection], List[StressCorrection], List[StressCorrection]]: 246 | """ 247 | Ударения могут приходиться на слабое место, 248 | если безударный слог того же слова не попадает на икт. Иначе - ошибка. 249 | 250 | :param line: строка. 251 | :param line_number: номер строки. 252 | :param pattern: шаблон. 253 | :return: ошибки, дополнения и снятия 254 | """ 255 | corrections = [] 256 | resolutions = [] 257 | additions = [] 258 | number_in_pattern = 0 259 | for w, word in enumerate(line.words): 260 | # Игнорируем слова длиной меньше 2 слогов. 261 | if len(word.syllables) == 0: 262 | continue 263 | if len(word.syllables) == 1: 264 | if pattern[number_in_pattern].lower() == "s" and word.syllables[0].stress == -1: 265 | additions.append(StressCorrection(line_number, w, 0, word.text, word.syllables[0].vowel())) 266 | number_in_pattern += len(word.syllables) 267 | continue 268 | stress_count = word.count_stresses() 269 | for syllable in word.syllables: 270 | if stress_count == 0 and pattern[number_in_pattern].lower() == "s": 271 | # Ударений нет, ставим такое, какое подходит по метру. Возможно несколько. 272 | additions.append(StressCorrection(line_number, w, syllable.number, word.text, syllable.vowel())) 273 | elif pattern[number_in_pattern].lower() == "u" and syllable.stress != -1: 274 | # Ударение есть и оно падает на этот слог, при этом в шаблоне безударная позиция. 275 | # Найдём такой слог, у которого в шаблоне ударная позиция. Это и есть наше исправление. 
276 | for other_syllable in word.syllables: 277 | other_number_in_pattern = other_syllable.number - syllable.number + number_in_pattern 278 | if syllable.number == other_syllable.number or pattern[other_number_in_pattern].lower() != "s": 279 | continue 280 | ac = StressCorrection(line_number, w, other_syllable.number, word.text, other_syllable.vowel()) 281 | if stress_count == 1 and other_syllable.stress == -1: 282 | corrections.append(ac) 283 | else: 284 | resolutions.append(ac) 285 | number_in_pattern += 1 286 | return corrections, resolutions, additions 287 | 288 | @staticmethod 289 | def get_improved_markup(markup: Markup, result: ClassificationResult) -> Markup: 290 | """ 291 | Улучшаем разметку после классификации метра. 292 | 293 | :param markup: начальная разметка. 294 | :param result: результат классификации. 295 | :return: улучшенная разметка. 296 | """ 297 | for pos in result.corrections[result.metre] + result.resolutions[result.metre]: 298 | syllables = markup.lines[pos.line_number].words[pos.word_number].syllables 299 | for i, syllable in enumerate(syllables): 300 | syllable.stress = -1 301 | if syllable.number == pos.syllable_number: 302 | syllable.stress = syllable.begin + get_first_vowel_position(syllable.text) 303 | for pos in result.additions[result.metre]: 304 | syllable = markup.lines[pos.line_number].words[pos.word_number].syllables[pos.syllable_number] 305 | syllable.stress = syllable.begin + get_first_vowel_position(syllable.text) 306 | 307 | return markup 308 | 309 | @staticmethod 310 | def improve_markup(markup: Markup) -> \ 311 | Tuple[Markup, ClassificationResult]: 312 | """ 313 | Улучшение разметки метрическим классификатором. 314 | 315 | :param markup: начальная разметка. 
316 | """ 317 | result = MetreClassifier.classify_metre(markup) 318 | improved_markup = MetreClassifier.get_improved_markup(markup, result) 319 | return improved_markup, result 320 | -------------------------------------------------------------------------------- /rupo/data/examples/markup.json: -------------------------------------------------------------------------------- 1 | { 2 | "items": [ 3 | { 4 | "version": 2, 5 | "text": "Забывши волнения жизни мятежной,\nОдин жил в пустыне рыбак молодой.\nОднажды на скале прибрежной,\nНад тихой прозрачной рекой\nОн с удой беспечно\nСидел\nИ думой сердечной\nК прошедшему счастью летел.", 6 | "lines": [ 7 | { 8 | "words": [ 9 | { 10 | "syllables": [ 11 | { 12 | "begin": 0, 13 | "end": 2, 14 | "text": "За", 15 | "accent": -1, 16 | "number": 0 17 | }, 18 | { 19 | "begin": 2, 20 | "end": 4, 21 | "text": "бы", 22 | "accent": -1, 23 | "number": 1 24 | }, 25 | { 26 | "begin": 4, 27 | "end": 7, 28 | "text": "вши", 29 | "accent": 6, 30 | "number": 2 31 | } 32 | ], 33 | "end": 7, 34 | "text": "Забывши", 35 | "begin": 0 36 | }, 37 | { 38 | "syllables": [ 39 | { 40 | "begin": 0, 41 | "end": 2, 42 | "text": "во", 43 | "accent": -1, 44 | "number": 0 45 | }, 46 | { 47 | "begin": 2, 48 | "end": 5, 49 | "text": "лне", 50 | "accent": 4, 51 | "number": 1 52 | }, 53 | { 54 | "begin": 5, 55 | "end": 7, 56 | "text": "ни", 57 | "accent": -1, 58 | "number": 2 59 | }, 60 | { 61 | "begin": 7, 62 | "end": 8, 63 | "text": "я", 64 | "accent": -1, 65 | "number": 3 66 | } 67 | ], 68 | "end": 16, 69 | "text": "волнения", 70 | "begin": 8 71 | }, 72 | { 73 | "syllables": [ 74 | { 75 | "begin": 0, 76 | "end": 2, 77 | "text": "жи", 78 | "accent": 1, 79 | "number": 0 80 | }, 81 | { 82 | "begin": 2, 83 | "end": 5, 84 | "text": "зни", 85 | "accent": -1, 86 | "number": 1 87 | } 88 | ], 89 | "end": 22, 90 | "text": "жизни", 91 | "begin": 17 92 | }, 93 | { 94 | "syllables": [ 95 | { 96 | "begin": 0, 97 | "end": 2, 98 | "text": "мя", 99 | "accent": -1, 100 | 
"number": 0 101 | }, 102 | { 103 | "begin": 2, 104 | "end": 4, 105 | "text": "те", 106 | "accent": 3, 107 | "number": 1 108 | }, 109 | { 110 | "begin": 4, 111 | "end": 8, 112 | "text": "жной", 113 | "accent": -1, 114 | "number": 2 115 | } 116 | ], 117 | "end": 31, 118 | "text": "мятежной", 119 | "begin": 23 120 | } 121 | ], 122 | "end": 33, 123 | "text": "Забывши волнения жизни мятежной,", 124 | "begin": 0 125 | }, 126 | { 127 | "words": [ 128 | { 129 | "syllables": [ 130 | { 131 | "begin": 0, 132 | "end": 1, 133 | "text": "О", 134 | "accent": -1, 135 | "number": 0 136 | }, 137 | { 138 | "begin": 1, 139 | "end": 4, 140 | "text": "дин", 141 | "accent": 2, 142 | "number": 1 143 | } 144 | ], 145 | "end": 37, 146 | "text": "Один", 147 | "begin": 33 148 | }, 149 | { 150 | "syllables": [ 151 | { 152 | "begin": 0, 153 | "end": 3, 154 | "text": "жил", 155 | "accent": 1, 156 | "number": 0 157 | } 158 | ], 159 | "end": 41, 160 | "text": "жил", 161 | "begin": 38 162 | }, 163 | { 164 | "syllables": [], 165 | "end": 43, 166 | "text": "в", 167 | "begin": 42 168 | }, 169 | { 170 | "syllables": [ 171 | { 172 | "begin": 0, 173 | "end": 2, 174 | "text": "пу", 175 | "accent": -1, 176 | "number": 0 177 | }, 178 | { 179 | "begin": 2, 180 | "end": 5, 181 | "text": "сты", 182 | "accent": 4, 183 | "number": 1 184 | }, 185 | { 186 | "begin": 5, 187 | "end": 7, 188 | "text": "не", 189 | "accent": -1, 190 | "number": 2 191 | } 192 | ], 193 | "end": 51, 194 | "text": "пустыне", 195 | "begin": 44 196 | }, 197 | { 198 | "syllables": [ 199 | { 200 | "begin": 0, 201 | "end": 2, 202 | "text": "ры", 203 | "accent": -1, 204 | "number": 0 205 | }, 206 | { 207 | "begin": 2, 208 | "end": 5, 209 | "text": "бак", 210 | "accent": 3, 211 | "number": 1 212 | } 213 | ], 214 | "end": 57, 215 | "text": "рыбак", 216 | "begin": 52 217 | }, 218 | { 219 | "syllables": [ 220 | { 221 | "begin": 0, 222 | "end": 2, 223 | "text": "мо", 224 | "accent": -1, 225 | "number": 0 226 | }, 227 | { 228 | "begin": 2, 229 | 
"end": 4, 230 | "text": "ло", 231 | "accent": -1, 232 | "number": 1 233 | }, 234 | { 235 | "begin": 4, 236 | "end": 7, 237 | "text": "дой", 238 | "accent": 5, 239 | "number": 2 240 | } 241 | ], 242 | "end": 65, 243 | "text": "молодой", 244 | "begin": 58 245 | } 246 | ], 247 | "end": 67, 248 | "text": "Один жил в пустыне рыбак молодой.", 249 | "begin": 33 250 | }, 251 | { 252 | "words": [ 253 | { 254 | "syllables": [ 255 | { 256 | "begin": 0, 257 | "end": 1, 258 | "text": "О", 259 | "accent": -1, 260 | "number": 0 261 | }, 262 | { 263 | "begin": 1, 264 | "end": 4, 265 | "text": "дна", 266 | "accent": -1, 267 | "number": 1 268 | }, 269 | { 270 | "begin": 4, 271 | "end": 7, 272 | "text": "жды", 273 | "accent": 6, 274 | "number": 2 275 | } 276 | ], 277 | "end": 74, 278 | "text": "Однажды", 279 | "begin": 67 280 | }, 281 | { 282 | "syllables": [ 283 | { 284 | "begin": 0, 285 | "end": 2, 286 | "text": "на", 287 | "accent": 1, 288 | "number": 0 289 | } 290 | ], 291 | "end": 77, 292 | "text": "на", 293 | "begin": 75 294 | }, 295 | { 296 | "syllables": [ 297 | { 298 | "begin": 0, 299 | "end": 3, 300 | "text": "ска", 301 | "accent": 2, 302 | "number": 0 303 | }, 304 | { 305 | "begin": 3, 306 | "end": 5, 307 | "text": "ле", 308 | "accent": -1, 309 | "number": 1 310 | } 311 | ], 312 | "end": 83, 313 | "text": "скале", 314 | "begin": 78 315 | }, 316 | { 317 | "syllables": [ 318 | { 319 | "begin": 0, 320 | "end": 3, 321 | "text": "при", 322 | "accent": -1, 323 | "number": 0 324 | }, 325 | { 326 | "begin": 3, 327 | "end": 6, 328 | "text": "бре", 329 | "accent": 5, 330 | "number": 1 331 | }, 332 | { 333 | "begin": 6, 334 | "end": 10, 335 | "text": "жной", 336 | "accent": -1, 337 | "number": 2 338 | } 339 | ], 340 | "end": 94, 341 | "text": "прибрежной", 342 | "begin": 84 343 | } 344 | ], 345 | "end": 96, 346 | "text": "Однажды на скале прибрежной,", 347 | "begin": 67 348 | }, 349 | { 350 | "words": [ 351 | { 352 | "syllables": [ 353 | { 354 | "begin": 0, 355 | "end": 3, 356 | 
"text": "Над", 357 | "accent": 1, 358 | "number": 0 359 | } 360 | ], 361 | "end": 99, 362 | "text": "Над", 363 | "begin": 96 364 | }, 365 | { 366 | "syllables": [ 367 | { 368 | "begin": 0, 369 | "end": 2, 370 | "text": "ти", 371 | "accent": 1, 372 | "number": 0 373 | }, 374 | { 375 | "begin": 2, 376 | "end": 5, 377 | "text": "хой", 378 | "accent": -1, 379 | "number": 1 380 | } 381 | ], 382 | "end": 105, 383 | "text": "тихой", 384 | "begin": 100 385 | }, 386 | { 387 | "syllables": [ 388 | { 389 | "begin": 0, 390 | "end": 3, 391 | "text": "про", 392 | "accent": -1, 393 | "number": 0 394 | }, 395 | { 396 | "begin": 3, 397 | "end": 6, 398 | "text": "зра", 399 | "accent": 5, 400 | "number": 1 401 | }, 402 | { 403 | "begin": 6, 404 | "end": 10, 405 | "text": "чной", 406 | "accent": -1, 407 | "number": 2 408 | } 409 | ], 410 | "end": 116, 411 | "text": "прозрачной", 412 | "begin": 106 413 | }, 414 | { 415 | "syllables": [ 416 | { 417 | "begin": 0, 418 | "end": 2, 419 | "text": "ре", 420 | "accent": -1, 421 | "number": 0 422 | }, 423 | { 424 | "begin": 2, 425 | "end": 5, 426 | "text": "кой", 427 | "accent": 3, 428 | "number": 1 429 | } 430 | ], 431 | "end": 122, 432 | "text": "рекой", 433 | "begin": 117 434 | } 435 | ], 436 | "end": 123, 437 | "text": "Над тихой прозрачной рекой", 438 | "begin": 96 439 | }, 440 | { 441 | "words": [ 442 | { 443 | "syllables": [ 444 | { 445 | "begin": 0, 446 | "end": 2, 447 | "text": "Он", 448 | "accent": 0, 449 | "number": 0 450 | } 451 | ], 452 | "end": 125, 453 | "text": "Он", 454 | "begin": 123 455 | }, 456 | { 457 | "syllables": [], 458 | "end": 127, 459 | "text": "с", 460 | "begin": 126 461 | }, 462 | { 463 | "syllables": [ 464 | { 465 | "begin": 0, 466 | "end": 1, 467 | "text": "у", 468 | "accent": -1, 469 | "number": 0 470 | }, 471 | { 472 | "begin": 1, 473 | "end": 4, 474 | "text": "дой", 475 | "accent": 2, 476 | "number": 1 477 | } 478 | ], 479 | "end": 132, 480 | "text": "удой", 481 | "begin": 128 482 | }, 483 | { 484 | 
"syllables": [ 485 | { 486 | "begin": 0, 487 | "end": 2, 488 | "text": "бе", 489 | "accent": -1, 490 | "number": 0 491 | }, 492 | { 493 | "begin": 2, 494 | "end": 5, 495 | "text": "спе", 496 | "accent": 4, 497 | "number": 1 498 | }, 499 | { 500 | "begin": 5, 501 | "end": 8, 502 | "text": "чно", 503 | "accent": -1, 504 | "number": 2 505 | } 506 | ], 507 | "end": 141, 508 | "text": "беспечно", 509 | "begin": 133 510 | } 511 | ], 512 | "end": 142, 513 | "text": "Он с удой беспечно", 514 | "begin": 123 515 | }, 516 | { 517 | "words": [ 518 | { 519 | "syllables": [ 520 | { 521 | "begin": 0, 522 | "end": 2, 523 | "text": "Си", 524 | "accent": -1, 525 | "number": 0 526 | }, 527 | { 528 | "begin": 2, 529 | "end": 5, 530 | "text": "дел", 531 | "accent": 3, 532 | "number": 1 533 | } 534 | ], 535 | "end": 147, 536 | "text": "Сидел", 537 | "begin": 142 538 | } 539 | ], 540 | "end": 148, 541 | "text": "Сидел", 542 | "begin": 142 543 | }, 544 | { 545 | "words": [ 546 | { 547 | "syllables": [ 548 | { 549 | "begin": 0, 550 | "end": 1, 551 | "text": "И", 552 | "accent": 0, 553 | "number": 0 554 | } 555 | ], 556 | "end": 149, 557 | "text": "И", 558 | "begin": 148 559 | }, 560 | { 561 | "syllables": [ 562 | { 563 | "begin": 0, 564 | "end": 2, 565 | "text": "ду", 566 | "accent": 1, 567 | "number": 0 568 | }, 569 | { 570 | "begin": 2, 571 | "end": 5, 572 | "text": "мой", 573 | "accent": -1, 574 | "number": 1 575 | } 576 | ], 577 | "end": 155, 578 | "text": "думой", 579 | "begin": 150 580 | }, 581 | { 582 | "syllables": [ 583 | { 584 | "begin": 0, 585 | "end": 3, 586 | "text": "сер", 587 | "accent": -1, 588 | "number": 0 589 | }, 590 | { 591 | "begin": 3, 592 | "end": 5, 593 | "text": "де", 594 | "accent": 4, 595 | "number": 1 596 | }, 597 | { 598 | "begin": 5, 599 | "end": 9, 600 | "text": "чной", 601 | "accent": -1, 602 | "number": 2 603 | } 604 | ], 605 | "end": 165, 606 | "text": "сердечной", 607 | "begin": 156 608 | } 609 | ], 610 | "end": 166, 611 | "text": "И думой сердечной", 
612 | "begin": 148 613 | }, 614 | { 615 | "words": [ 616 | { 617 | "syllables": [], 618 | "end": 167, 619 | "text": "К", 620 | "begin": 166 621 | }, 622 | { 623 | "syllables": [ 624 | { 625 | "begin": 0, 626 | "end": 3, 627 | "text": "про", 628 | "accent": -1, 629 | "number": 0 630 | }, 631 | { 632 | "begin": 3, 633 | "end": 5, 634 | "text": "ше", 635 | "accent": 4, 636 | "number": 1 637 | }, 638 | { 639 | "begin": 5, 640 | "end": 8, 641 | "text": "дше", 642 | "accent": -1, 643 | "number": 2 644 | }, 645 | { 646 | "begin": 8, 647 | "end": 10, 648 | "text": "му", 649 | "accent": -1, 650 | "number": 3 651 | } 652 | ], 653 | "end": 178, 654 | "text": "прошедшему", 655 | "begin": 168 656 | }, 657 | { 658 | "syllables": [ 659 | { 660 | "begin": 0, 661 | "end": 3, 662 | "text": "сча", 663 | "accent": 2, 664 | "number": 0 665 | }, 666 | { 667 | "begin": 3, 668 | "end": 7, 669 | "text": "стью", 670 | "accent": -1, 671 | "number": 1 672 | } 673 | ], 674 | "end": 186, 675 | "text": "счастью", 676 | "begin": 179 677 | }, 678 | { 679 | "syllables": [ 680 | { 681 | "begin": 0, 682 | "end": 2, 683 | "text": "ле", 684 | "accent": -1, 685 | "number": 0 686 | }, 687 | { 688 | "begin": 2, 689 | "end": 5, 690 | "text": "тел", 691 | "accent": 3, 692 | "number": 1 693 | } 694 | ], 695 | "end": 192, 696 | "text": "летел", 697 | "begin": 187 698 | } 699 | ], 700 | "end": 193, 701 | "text": "К прошедшему счастью летел.", 702 | "begin": 166 703 | } 704 | ] 705 | } 706 | ] 707 | } --------------------------------------------------------------------------------