├── src ├── __init__.py ├── generate_transcriptions.py ├── model.py └── architectures.py ├── MANIFEST.in ├── model ├── vocabulary │ ├── non_padded_namespaces.txt │ ├── tokens.txt │ └── target_tokens.txt └── weights.th ├── bin └── generate_transcriptions ├── README.md ├── LICENSE ├── .gitignore └── setup.py /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include model/ * -------------------------------------------------------------------------------- /model/vocabulary/non_padded_namespaces.txt: -------------------------------------------------------------------------------- 1 | *labels 2 | *tags 3 | -------------------------------------------------------------------------------- /bin/generate_transcriptions: -------------------------------------------------------------------------------- 1 | python -m src.generate_transcriptions $1 $2 -------------------------------------------------------------------------------- /model/weights.th: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DinoTheDinosaur/russian_g2p_neuro/HEAD/model/weights.th -------------------------------------------------------------------------------- /model/vocabulary/tokens.txt: -------------------------------------------------------------------------------- 1 | @@UNKNOWN@@ 2 | @start@ 3 | @end@ 4 | о 5 | а 6 | е 7 | и 8 | н 9 | р 10 | т 11 | с 12 | л 13 | в 14 | к 15 | м 16 | п 17 | у 18 | д 19 | ы 20 | я 21 | г 22 | з 23 | б 24 | ь 25 | й 26 | ч 27 | х 28 | ш 29 | ю 30 | ж 31 | ц 32 | щ 33 | ф 34 | - 35 | ё 36 | э 37 | ъ 38 | 1 39 | ( 40 | ) 41 | 0 42 | 2 43 | -------------------------------------------------------------------------------- /model/vocabulary/target_tokens.txt: -------------------------------------------------------------------------------- 1 | @@UNKNOWN@@ 2 | @start@ 3 | @end@ 4 | o0 5 | a0 6 | e0 7 | i0 8 | n 9 | j 10 | r 11 | s 12 | a1 13 | t 14 | v 15 | k 16 | u0 17 | m 18 | p 19 | lj 20 | o1 21 | i1 22 | l 23 | tj 24 | rj 25 | nj 26 | e1 27 | d 28 | y0 29 | g 30 | z 31 | sj 32 | ch 33 | b 34 | mj 35 | sh 36 | h 37 | vj 38 | u1 39 | dj 40 | zh 41 | kj 42 | c 43 | sch 44 | pj 45 | bj 46 | y1 47 | f 48 | gj 49 | zj 50 | fj 51 | hj 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Russian G2P neuro 2 | G2P tool for Russian language with **vosk-model-ru** styled transcriptions, see the available models in https://alphacephei.com/vosk/models. Based on AllenNLP Seq2Seq architecture. 
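For use from Python (as opposed to the command-line tool described under Usage below), the model class in `src/model.py` can be loaded directly. A minimal sketch, assuming you run it from a clone of the repository so that the bundled `model/` directory resolves from the working directory; the example word is only an illustration:
```
from pathlib import Path

from src.model import Russian_G2P

# Point the class at the bundled model directory (vocabulary/ + weights.th).
g2p = Russian_G2P(Path('model'))

# predict() returns the transcription as a list of phone tokens
# (the phone inventory is listed in model/vocabulary/target_tokens.txt).
phones = g2p.predict('привет')
print(' '.join(phones))
```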
3 | ## Installation 4 | The easiest installation is on Python 3.6; the package also works on Python 3.8. 5 | ### Direct pip installation 6 | ``` 7 | pip install git+https://github.com/DinoTheDinosaur/russian_g2p_neuro.git 8 | ``` 9 | ### Installation through cloning + setup.py 10 | If some files are missing after the pip installation, try cloning the repository and installing from source: 11 | ``` 12 | git clone https://github.com/DinoTheDinosaur/russian_g2p_neuro.git 13 | cd russian_g2p_neuro/ 14 | python setup.py install 15 | ``` 16 | ## Usage 17 | To generate a Kaldi-style pronunciation dictionary, use the command-line tool: 18 | ``` 19 | generate_transcriptions input.txt output.dict 20 | ``` 21 | The tool accepts either a plain list of words or whole texts as input; it extracts the unique words and writes one transcription per line. 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 DinoTheDinosaur 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /src/generate_transcriptions.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import sys 4 | 5 | from pathlib import Path 6 | from src.model import Russian_G2P 7 | 8 | MODULE_PATH = Path(__file__).parents[1] 9 | 10 | g2p = Russian_G2P(MODULE_PATH / 'model') 11 | 12 | def write_result_to_file(dictionary, output_filename, sep=' '): 13 | lines = [] 14 | for word, transcription in dictionary.items(): 15 | lines += [sep.join([word, transcription])] 16 | with open(output_filename, 'w') as f: 17 | f.write('\n'.join(lines)) 18 | 19 | def transcribe_word_list(wordlist): 20 | result = {} 21 | for word in wordlist: 22 | prediction = g2p.predict(word) 23 | result[word] = ' '.join(prediction) 24 | return result 25 | 26 | def find_words_in_file(filename): 27 | with open(filename) as f: 28 | text = f.read().lower() 29 | words = re.findall( 30 | '[ёйцукенгшщзхъэждлорпавыфячсмитьбю\-]+', 31 | text 32 | ) 33 | words = sorted(list(set(words))) 34 | return words 35 | 36 | if __name__ == '__main__': 37 | input_filename = sys.argv[1] 38 | output_filename = sys.argv[2] 39 | wordlist = find_words_in_file(sys.argv[1]) 40 | dictionary = transcribe_word_list(wordlist) 41 | write_result_to_file(dictionary, output_filename) 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from allennlp.data.dataset_readers.seq2seq import Seq2SeqDatasetReader 5 | from allennlp.data.vocabulary import Vocabulary 6 | from allennlp.data.token_indexers import SingleIdTokenIndexer 7 | from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer 8 | from allennlp.modules.token_embedders import Embedding 9 | from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder 10 | from allennlp.modules.seq2seq_encoders import StackedSelfAttentionEncoder 11 | from allennlp.modules.attention import DotProductAttention 12 | from allennlp.predictors import SimpleSeq2SeqPredictor 13 | from src.architectures import G2PSeq2Seq, WhitespaceTokenizer 14 | 15 | 16 | 17 | class Russian_G2P: 18 | 19 | def __init__( 20 | self, model_path, LET_EMBEDDING_DIM=256, 21 | SOU_EMBEDDING_DIM=256, HIDDEN_DIM=256 22 | ): 23 | reader = Seq2SeqDatasetReader( 24 | source_tokenizer=CharacterTokenizer(), 25 | target_tokenizer=WhitespaceTokenizer(), 26 | source_token_indexers={'tokens': SingleIdTokenIndexer()}, 27 | target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')} 28 | ) 29 | # And here's how to reload the model. 
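        # (A note on the reload, which the lines below perform step by step:
        # restore the Vocabulary that was saved next to the weights, rebuild the
        # encoder/decoder with the same hyper-parameters, and finally load the
        # saved state dict onto the CPU.)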
30 | vocab = Vocabulary.from_files(os.path.join(model_path, 'vocabulary')) 31 | 32 | let_embedding = Embedding( 33 | num_embeddings=vocab.get_vocab_size('tokens'), 34 | embedding_dim=LET_EMBEDDING_DIM 35 | ) 36 | source_embedder = BasicTextFieldEmbedder({"tokens": let_embedding}) 37 | 38 | encoder = StackedSelfAttentionEncoder( 39 | input_dim=LET_EMBEDDING_DIM, 40 | hidden_dim=HIDDEN_DIM, 41 | projection_dim=128, 42 | feedforward_hidden_dim=128, 43 | num_layers=1, 44 | num_attention_heads=8 45 | ) 46 | 47 | attention = DotProductAttention() 48 | 49 | max_decoding_steps = 40 50 | model = G2PSeq2Seq( 51 | vocab, source_embedder, encoder, max_decoding_steps, 52 | target_embedding_dim=SOU_EMBEDDING_DIM, 53 | target_namespace='target_tokens', 54 | attention=attention, 55 | beam_size=5 56 | ) 57 | 58 | with open(os.path.join(model_path, 'weights.th'), 'rb') as f: 59 | model.load_state_dict(torch.load(f, map_location=torch.device('cpu'))) 60 | 61 | self.predictor = SimpleSeq2SeqPredictor(model, reader) 62 | 63 | def predict(self, word): 64 | return self.predictor.predict(word)['predicted_tokens'] 65 | -------------------------------------------------------------------------------- /src/architectures.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import numpy as np 4 | 5 | from overrides import overrides 6 | from torch.nn.modules.linear import Linear 7 | from torch.nn.modules.rnn import LSTMCell 8 | from allennlp.modules.similarity_functions import SimilarityFunction 9 | from allennlp.common.util import START_SYMBOL, END_SYMBOL 10 | from allennlp.modules import Attention, TextFieldEmbedder, Seq2SeqEncoder 11 | from allennlp.data.vocabulary import Vocabulary 12 | from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq 13 | from allennlp.modules.token_embedders import Embedding 14 | from allennlp.data.tokenizers.token import Token 15 | from allennlp.data.tokenizers.tokenizer import Tokenizer 16 | from allennlp.nn.beam_search import BeamSearch 17 | 18 | from typing import Dict, Optional, List, Tuple, Union, Iterable, Any 19 | 20 | Labels = List[Any] 21 | 22 | 23 | 24 | class G2PSeq2Seq(SimpleSeq2Seq): 25 | 26 | def __init__(self, 27 | vocab: Vocabulary, 28 | source_embedder: TextFieldEmbedder, 29 | encoder: Seq2SeqEncoder, 30 | max_decoding_steps: int, 31 | attention: Attention = None, 32 | attention_function: SimilarityFunction = None, 33 | beam_size: int = None, 34 | target_namespace: str = "tokens", 35 | target_embedding_dim: int = None, 36 | scheduled_sampling_ratio: float = 0.) -> None: 37 | super(SimpleSeq2Seq, self).__init__(vocab) 38 | self._target_namespace = target_namespace 39 | self._scheduled_sampling_ratio = scheduled_sampling_ratio 40 | 41 | # We need the start symbol to provide as the input at the first timestep of decoding, and 42 | # end symbol as a way to indicate the end of the decoded sequence. 43 | self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace) 44 | self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace) 45 | 46 | # At prediction time, we use a beam search to find the most likely sequence of target tokens. 47 | beam_size = beam_size or 1 48 | self._max_decoding_steps = max_decoding_steps 49 | self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size) 50 | 51 | # Dense embedding of source vocab tokens. 
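        # (In this G2P setup the source vocabulary is the grapheme inventory from
        # model/vocabulary/tokens.txt; the phone inventory lives in the separate
        # 'target_tokens' namespace, embedded further below.)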
52 | self._source_embedder = source_embedder 53 | 54 | # Encodes the sequence of source embeddings into a sequence of hidden states. 55 | self._encoder = encoder 56 | 57 | num_classes = self.vocab.get_vocab_size(self._target_namespace) 58 | 59 | # Attention mechanism applied to the encoder output for each step. 60 | if attention: 61 | if attention_function: 62 | raise ConfigurationError("You can only specify an attention module or an " 63 | "attention function, but not both.") 64 | self._attention = attention 65 | elif attention_function: 66 | self._attention = LegacyAttention(attention_function) 67 | else: 68 | self._attention = None 69 | 70 | # Dense embedding of vocab words in the target space. 71 | target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim() 72 | self._target_embedder = Embedding(num_classes, target_embedding_dim) 73 | 74 | # Decoder output dim needs to be the same as the encoder output dim since we initialize the 75 | # hidden state of the decoder with the final hidden state of the encoder. 76 | self._encoder_output_dim = self._encoder.get_output_dim() 77 | self._decoder_output_dim = self._encoder_output_dim 78 | 79 | if self._attention: 80 | # If using attention, a weighted average over encoder outputs will be concatenated 81 | # to the previous target embedding to form the input to the decoder at each 82 | # time step. 83 | self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim 84 | else: 85 | # Otherwise, the input to the decoder is just the previous target embedding. 86 | self._decoder_input_dim = target_embedding_dim 87 | 88 | # We'll use an LSTM cell as the recurrent cell that produces a hidden state 89 | # for the decoder at each time step. 90 | # TODO (pradeep): Do not hardcode decoder cell type. 91 | self._decoder_cell = LSTMCell( 92 | self._decoder_input_dim, 93 | self._decoder_output_dim, 94 | ) 95 | 96 | # We project the hidden state from the decoder into the output vocabulary space 97 | # in order to get log probabilities of each target token, at each time step. 98 | self._output_projection_layer = Linear(self._decoder_output_dim, num_classes) 99 | 100 | @overrides 101 | def forward(self, # type: ignore 102 | source_tokens: Dict[str, torch.LongTensor]) -> Dict[str, torch.Tensor]: 103 | # pylint: disable=arguments-differ 104 | state = self._encode(source_tokens) 105 | 106 | output_dict = {} 107 | state = self._init_decoder_state(state) 108 | predictions = self._forward_beam_search(state) 109 | output_dict.update(predictions) 110 | 111 | return output_dict 112 | 113 | 114 | 115 | @Tokenizer.register("whitespace_") 116 | @Tokenizer.register("just_spaces_") 117 | class WhitespaceTokenizer(Tokenizer): 118 | """ 119 | A `Tokenizer` that assumes you've already done your own tokenization somehow and have 120 | separated the tokens by spaces. We just split the input string on whitespace and return the 121 | resulting list. 122 | 123 | Note that we use `text.split()`, which means that the amount of whitespace between the 124 | tokens does not matter. This will never result in spaces being included as tokens. 125 | 126 | Registered as a `Tokenizer` with name "whitespace" and "just_spaces". 
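    (Note: in this project the decorators above register the class under the
    names "whitespace_" and "just_spaces_", presumably to avoid clashing with
    tokenizers already registered in AllenNLP; it is used in `src/model.py` as
    the target-side tokenizer that splits space-separated phone strings.)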
127 | """ 128 | 129 | @overrides 130 | def tokenize(self, text: str) -> List[Token]: 131 | return [Token(t) for t in text.split()] -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='russian_g2p_neuro', 5 | 6 | version='1.0.0', 7 | 8 | description='G2P tool for Russian language with vosk-model-ru styled transcriptions', 9 | 10 | # The project's main homepage. 11 | url='https://github.com/DinoTheDinosaur/russian_g2p_neuro', 12 | 13 | # Author details 14 | author='DinoTheDinosaur', 15 | 16 | # Choose your license 17 | license='MIT', 18 | 19 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 20 | classifiers=[ 21 | # How mature is this project? Common values are 22 | # 3 - Alpha 23 | # 4 - Beta 24 | # 5 - Production/Stable 25 | 'Development Status :: 3 - Alpha', 26 | 27 | # Indicate who your project is intended for 28 | 'Intended Audience :: Developers', 29 | 'Intended Audience :: Science/Research', 30 | 'Topic :: System :: Networking', 31 | 'Topic :: Scientific/Engineering', 32 | 33 | # Pick your license as you wish (should match "license" above) 34 | 'License :: OSI Approved :: MIT License', 35 | 36 | 'Operating System :: POSIX :: Linux', 37 | 38 | # Specify the Python versions you support here. In particular, ensure 39 | # that you indicate whether you support Python 2, Python 3 or both. 40 | 'Programming Language :: Python :: 3.8', 41 | ], 42 | 43 | # What does your project relate to? 44 | keywords='ASR speech recognition g2p russian grapheme-to-phoneme', 45 | 46 | # You can just specify the packages manually here if your project is 47 | # simple. Or you can use find_packages(). 48 | packages=find_packages(), 49 | scripts=['bin/generate_transcriptions'], 50 | 51 | # List run-time dependencies here. These will be installed by pip when 52 | # your project is installed. 
For an analysis of "install_requires" vs pip's 53 | # requirements files see: 54 | # https://packaging.python.org/en/latest/requirements.html 55 | install_requires=[ 56 | 'alabaster==0.7.12', 57 | 'allennlp==0.9.0', 58 | 'argon2-cffi==20.1.0', 59 | 'async-generator==1.10', 60 | 'attrs==20.3.0', 61 | 'Babel==2.9.0', 62 | 'backcall==0.2.0', 63 | 'bleach==3.3.0', 64 | 'blis==0.2.4', 65 | 'boto3==1.17.57', 66 | 'botocore==1.20.57', 67 | 'certifi==2020.12.5', 68 | 'cffi==1.14.5', 69 | 'chardet==4.0.0', 70 | 'click==7.1.2', 71 | 'conllu==1.3.1', 72 | 'cycler==0.10.0', 73 | 'cymem==2.0.5', 74 | 'decorator==5.0.7', 75 | 'defusedxml==0.7.1', 76 | 'docutils==0.16', 77 | 'editdistance==0.5.3', 78 | 'entrypoints==0.3', 79 | 'flaky==3.7.0', 80 | 'Flask==1.1.2', 81 | 'Flask-Cors==3.0.10', 82 | 'ftfy==6.0.1', 83 | 'future==0.18.2', 84 | 'gevent==21.1.2', 85 | 'greenlet==1.0.0', 86 | 'h5py==3.2.1', 87 | 'idna==2.10', 88 | 'imagesize==1.2.0', 89 | 'iniconfig==1.1.1', 90 | 'ipykernel==5.5.3', 91 | 'ipython==7.22.0', 92 | 'ipython-genutils==0.2.0', 93 | 'ipywidgets==7.6.3', 94 | 'itsdangerous==1.1.0', 95 | 'jedi==0.18.0', 96 | 'Jinja2==2.11.3', 97 | 'jmespath==0.10.0', 98 | 'joblib==1.0.1', 99 | 'jsonnet==0.17.0', 100 | 'jsonpickle==2.0.0', 101 | 'jsonschema==3.2.0', 102 | 'jupyter==1.0.0', 103 | 'jupyter-client==6.1.12', 104 | 'jupyter-console==6.4.0', 105 | 'jupyter-core==4.7.1', 106 | 'jupyterlab-pygments==0.1.2', 107 | 'jupyterlab-widgets==1.0.0', 108 | 'kiwisolver==1.3.1', 109 | 'MarkupSafe==1.1.1', 110 | 'matplotlib==3.4.1', 111 | 'mistune==0.8.4', 112 | 'murmurhash==1.0.5', 113 | 'nbclient==0.5.3', 114 | 'nbconvert==6.0.7', 115 | 'nbformat==5.1.3', 116 | 'nest-asyncio==1.5.1', 117 | 'nltk==3.6.2', 118 | 'notebook==6.3.0', 119 | 'numpy==1.20.2', 120 | 'numpydoc==1.1.0', 121 | 'overrides==4.1.2', 122 | 'packaging==20.9', 123 | 'pandas==1.2.4', 124 | 'pandocfilters==1.4.3', 125 | 'parsimonious==0.8.1', 126 | 'parso==0.8.2', 127 | 'pexpect==4.8.0', 128 | 'pickleshare==0.7.5', 129 | 'Pillow==8.2.0', 130 | 'plac==0.9.6', 131 | 'pluggy==0.13.1', 132 | 'preshed==2.0.1', 133 | 'prometheus-client==0.10.1', 134 | 'prompt-toolkit==3.0.18', 135 | 'protobuf==3.15.8', 136 | 'ptyprocess==0.7.0', 137 | 'py==1.10.0', 138 | 'pycparser==2.20', 139 | 'Pygments==2.8.1', 140 | 'pyparsing==2.4.7', 141 | 'pyrsistent==0.17.3', 142 | 'pytest==6.2.3', 143 | 'python-dateutil==2.8.1', 144 | 'pytorch-pretrained-bert==0.6.2', 145 | 'pytorch-transformers==1.1.0', 146 | 'pytz==2021.1', 147 | 'pyzmq==22.0.3', 148 | 'qtconsole==5.0.3', 149 | 'QtPy==1.9.0', 150 | 'regex==2021.4.4', 151 | 'requests==2.25.1', 152 | 'responses==0.13.2', 153 | 'russian-g2p-neuro==1.0.0', 154 | 's3transfer==0.4.2', 155 | 'scikit-learn==0.24.1', 156 | 'scipy==1.6.3', 157 | 'Send2Trash==1.5.0', 158 | 'sentencepiece==0.1.95', 159 | 'six==1.15.0', 160 | 'snowballstemmer==2.1.0', 161 | 'spacy==2.1.9', 162 | 'Sphinx==3.5.4', 163 | 'sphinxcontrib-applehelp==1.0.2', 164 | 'sphinxcontrib-devhelp==1.0.2', 165 | 'sphinxcontrib-htmlhelp==1.0.3', 166 | 'sphinxcontrib-jsmath==1.0.1', 167 | 'sphinxcontrib-qthelp==1.0.3', 168 | 'sphinxcontrib-serializinghtml==1.1.4', 169 | 'sqlparse==0.4.1', 170 | 'srsly==1.0.5', 171 | 'tensorboardX==2.2', 172 | 'terminado==0.9.4', 173 | 'testpath==0.4.4', 174 | 'thinc==7.0.8', 175 | 'threadpoolctl==2.1.0', 176 | 'toml==0.10.2', 177 | 'torch==1.5.0', 178 | 'tornado==6.1', 179 | 'tqdm==4.60.0', 180 | 'traitlets==5.0.5', 181 | 'typing-utils==0.0.3', 182 | 'Unidecode==1.2.0', 183 | 'urllib3==1.26.4', 184 | 'wasabi==0.8.2', 185 | 
'wcwidth==0.2.5', 186 | 'webencodings==0.5.1', 187 | 'Werkzeug==1.0.1', 188 | 'widgetsnbextension==3.5.1', 189 | 'word2number==1.1', 190 | 'zope.event==4.5.0', 191 | 'zope.interface==5.4.0' 192 | ], 193 | 194 | # List additional groups of dependencies here (e.g. development 195 | # dependencies). You can install these using the following syntax, 196 | # for example: 197 | # $ pip install -e .[dev,test] 198 | extras_require={}, 199 | 200 | # If there are data files included in your packages that need to be 201 | # installed, specify them here. If using Python 2.6 or less, then these 202 | # have to be included in MANIFEST.in as well. 203 | package_data={ 204 | 'model': [ 205 | 'weights.th', 'tokens.txt', 206 | 'non_padded_namespaces.txt', 207 | 'target_tokens.txt' 208 | ] 209 | }, 210 | data_files=[ 211 | ('model', ['model/weights.th']), 212 | ('model/vocabulary/', ['model/vocabulary/tokens.txt']), 213 | ('model/vocabulary/', ['model/vocabulary/target_tokens.txt']), 214 | ('model/vocabulary/', ['model/vocabulary/non_padded_namespaces.txt']), 215 | ], 216 | include_package_data=True, 217 | 218 | # To provide executable scripts, use entry points in preference to the 219 | # "scripts" keyword. Entry points provide cross-platform support and allow 220 | # pip to create the appropriate form of executable for the target platform. 221 | entry_points={}, 222 | ) 223 | --------------------------------------------------------------------------------
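For reference, a sketch of the round trip performed by the `generate_transcriptions` script (the words and transcriptions below are illustrative examples only, not guaranteed model output):
```
# input.txt — any Russian text or word list
Привет мир

# output.dict — one "word phone phone ..." line per unique lower-cased word, sorted
мир m i1 r
привет p rj i0 vj e1 t
```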