├── src ├── __init__.py ├── generate_transcriptions.py ├── model.py └── architectures.py ├── MANIFEST.in ├── model ├── vocabulary │ ├── non_padded_namespaces.txt │ ├── tokens.txt │ └── target_tokens.txt └── weights.th ├── bin └── generate_transcriptions ├── README.md ├── LICENSE ├── .gitignore └── setup.py /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include model/ * -------------------------------------------------------------------------------- /model/vocabulary/non_padded_namespaces.txt: -------------------------------------------------------------------------------- 1 | *labels 2 | *tags 3 | -------------------------------------------------------------------------------- /bin/generate_transcriptions: -------------------------------------------------------------------------------- 1 | python -m src.generate_transcriptions $1 $2 -------------------------------------------------------------------------------- /model/weights.th: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DinoTheDinosaur/russian_g2p_neuro/HEAD/model/weights.th -------------------------------------------------------------------------------- /model/vocabulary/tokens.txt: -------------------------------------------------------------------------------- 1 | @@UNKNOWN@@ 2 | @start@ 3 | @end@ 4 | о 5 | а 6 | е 7 | и 8 | н 9 | р 10 | т 11 | с 12 | л 13 | в 14 | к 15 | м 16 | п 17 | у 18 | д 19 | ы 20 | я 21 | г 22 | з 23 | б 24 | ь 25 | й 26 | ч 27 | х 28 | ш 29 | ю 30 | ж 31 | ц 32 | щ 33 | ф 34 | - 35 | ё 36 | э 37 | ъ 38 | 1 39 | ( 40 | ) 41 | 0 42 | 2 43 | -------------------------------------------------------------------------------- /model/vocabulary/target_tokens.txt: -------------------------------------------------------------------------------- 1 | @@UNKNOWN@@ 2 | @start@ 3 | @end@ 4 | o0 5 | a0 6 | e0 7 | i0 8 | n 9 | j 10 | r 11 | s 12 | a1 13 | t 14 | v 15 | k 16 | u0 17 | m 18 | p 19 | lj 20 | o1 21 | i1 22 | l 23 | tj 24 | rj 25 | nj 26 | e1 27 | d 28 | y0 29 | g 30 | z 31 | sj 32 | ch 33 | b 34 | mj 35 | sh 36 | h 37 | vj 38 | u1 39 | dj 40 | zh 41 | kj 42 | c 43 | sch 44 | pj 45 | bj 46 | y1 47 | f 48 | gj 49 | zj 50 | fj 51 | hj 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Russian G2P neuro 2 | G2P tool for Russian language with **vosk-model-ru** styled transcriptions, see the available models in https://alphacephei.com/vosk/models. Based on AllenNLP Seq2Seq architecture. 
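For use from Python (as opposed to the command-line tool described under Usage below), the model class in `src/model.py` can be loaded directly. A minimal sketch, assuming you run it from a clone of the repository so that the bundled `model/` directory resolves from the working directory; the example word is only an illustration:
```
from pathlib import Path

from src.model import Russian_G2P

# Point the class at the bundled model directory (vocabulary/ + weights.th).
g2p = Russian_G2P(Path('model'))

# predict() returns the transcription as a list of phone tokens
# (the phone inventory is listed in model/vocabulary/target_tokens.txt).
phones = g2p.predict('привет')
print(' '.join(phones))
```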
3 | ## Installation 4 | The easiest installation is on Python 3.6; the package also works on Python 3.8. 5 | ### Direct pip installation 6 | ``` 7 | pip install git+https://github.com/DinoTheDinosaur/russian_g2p_neuro.git 8 | ``` 9 | ### Installation through cloning + setup.py 10 | If some files are missing after the pip installation, try cloning the repository and installing from source: 11 | ``` 12 | git clone https://github.com/DinoTheDinosaur/russian_g2p_neuro.git 13 | cd russian_g2p_neuro/ 14 | python setup.py install 15 | ``` 16 | ## Usage 17 | To generate a Kaldi-style pronunciation dictionary, use the command-line tool: 18 | ``` 19 | generate_transcriptions input.txt output.dict 20 | ``` 21 | The tool accepts either a plain list of words or whole texts as input; it extracts the unique words and writes one transcription per line. 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 DinoTheDinosaur 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /src/generate_transcriptions.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import sys 4 | 5 | from pathlib import Path 6 | from src.model import Russian_G2P 7 | 8 | MODULE_PATH = Path(__file__).parents[1] 9 | 10 | g2p = Russian_G2P(MODULE_PATH / 'model') 11 | 12 | def write_result_to_file(dictionary, output_filename, sep=' '): 13 | lines = [] 14 | for word, transcription in dictionary.items(): 15 | lines += [sep.join([word, transcription])] 16 | with open(output_filename, 'w') as f: 17 | f.write('\n'.join(lines)) 18 | 19 | def transcribe_word_list(wordlist): 20 | result = {} 21 | for word in wordlist: 22 | prediction = g2p.predict(word) 23 | result[word] = ' '.join(prediction) 24 | return result 25 | 26 | def find_words_in_file(filename): 27 | with open(filename) as f: 28 | text = f.read().lower() 29 | words = re.findall( 30 | '[ёйцукенгшщзхъэждлорпавыфячсмитьбю\-]+', 31 | text 32 | ) 33 | words = sorted(list(set(words))) 34 | return words 35 | 36 | if __name__ == '__main__': 37 | input_filename = sys.argv[1] 38 | output_filename = sys.argv[2] 39 | wordlist = find_words_in_file(sys.argv[1]) 40 | dictionary = transcribe_word_list(wordlist) 41 | write_result_to_file(dictionary, output_filename) 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from allennlp.data.dataset_readers.seq2seq import Seq2SeqDatasetReader 5 | from allennlp.data.vocabulary import Vocabulary 6 | from allennlp.data.token_indexers import SingleIdTokenIndexer 7 | from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer 8 | from allennlp.modules.token_embedders import Embedding 9 | from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder 10 | from allennlp.modules.seq2seq_encoders import StackedSelfAttentionEncoder 11 | from allennlp.modules.attention import DotProductAttention 12 | from allennlp.predictors import SimpleSeq2SeqPredictor 13 | from src.architectures import G2PSeq2Seq, WhitespaceTokenizer 14 | 15 | 16 | 17 | class Russian_G2P: 18 | 19 | def __init__( 20 | self, model_path, LET_EMBEDDING_DIM=256, 21 | SOU_EMBEDDING_DIM=256, HIDDEN_DIM=256 22 | ): 23 | reader = Seq2SeqDatasetReader( 24 | source_tokenizer=CharacterTokenizer(), 25 | target_tokenizer=WhitespaceTokenizer(), 26 | source_token_indexers={'tokens': SingleIdTokenIndexer()}, 27 | target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')} 28 | ) 29 | # And here's how to reload the model. 
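        # (A note on the reload, which the lines below perform step by step:
        # restore the Vocabulary that was saved next to the weights, rebuild the
        # encoder/decoder with the same hyper-parameters, and finally load the
        # saved state dict onto the CPU.)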
30 | vocab = Vocabulary.from_files(os.path.join(model_path, 'vocabulary')) 31 | 32 | let_embedding = Embedding( 33 | num_embeddings=vocab.get_vocab_size('tokens'), 34 | embedding_dim=LET_EMBEDDING_DIM 35 | ) 36 | source_embedder = BasicTextFieldEmbedder({"tokens": let_embedding}) 37 | 38 | encoder = StackedSelfAttentionEncoder( 39 | input_dim=LET_EMBEDDING_DIM, 40 | hidden_dim=HIDDEN_DIM, 41 | projection_dim=128, 42 | feedforward_hidden_dim=128, 43 | num_layers=1, 44 | num_attention_heads=8 45 | ) 46 | 47 | attention = DotProductAttention() 48 | 49 | max_decoding_steps = 40 50 | model = G2PSeq2Seq( 51 | vocab, source_embedder, encoder, max_decoding_steps, 52 | target_embedding_dim=SOU_EMBEDDING_DIM, 53 | target_namespace='target_tokens', 54 | attention=attention, 55 | beam_size=5 56 | ) 57 | 58 | with open(os.path.join(model_path, 'weights.th'), 'rb') as f: 59 | model.load_state_dict(torch.load(f, map_location=torch.device('cpu'))) 60 | 61 | self.predictor = SimpleSeq2SeqPredictor(model, reader) 62 | 63 | def predict(self, word): 64 | return self.predictor.predict(word)['predicted_tokens'] 65 | -------------------------------------------------------------------------------- /src/architectures.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import numpy as np 4 | 5 | from overrides import overrides 6 | from torch.nn.modules.linear import Linear 7 | from torch.nn.modules.rnn import LSTMCell 8 | from allennlp.modules.similarity_functions import SimilarityFunction 9 | from allennlp.common.util import START_SYMBOL, END_SYMBOL 10 | from allennlp.modules import Attention, TextFieldEmbedder, Seq2SeqEncoder 11 | from allennlp.data.vocabulary import Vocabulary 12 | from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq 13 | from allennlp.modules.token_embedders import Embedding 14 | from allennlp.data.tokenizers.token import Token 15 | from allennlp.data.tokenizers.tokenizer import Tokenizer 16 | from allennlp.nn.beam_search import BeamSearch 17 | 18 | from typing import Dict, Optional, List, Tuple, Union, Iterable, Any 19 | 20 | Labels = List[Any] 21 | 22 | 23 | 24 | class G2PSeq2Seq(SimpleSeq2Seq): 25 | 26 | def __init__(self, 27 | vocab: Vocabulary, 28 | source_embedder: TextFieldEmbedder, 29 | encoder: Seq2SeqEncoder, 30 | max_decoding_steps: int, 31 | attention: Attention = None, 32 | attention_function: SimilarityFunction = None, 33 | beam_size: int = None, 34 | target_namespace: str = "tokens", 35 | target_embedding_dim: int = None, 36 | scheduled_sampling_ratio: float = 0.) -> None: 37 | super(SimpleSeq2Seq, self).__init__(vocab) 38 | self._target_namespace = target_namespace 39 | self._scheduled_sampling_ratio = scheduled_sampling_ratio 40 | 41 | # We need the start symbol to provide as the input at the first timestep of decoding, and 42 | # end symbol as a way to indicate the end of the decoded sequence. 43 | self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace) 44 | self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace) 45 | 46 | # At prediction time, we use a beam search to find the most likely sequence of target tokens. 47 | beam_size = beam_size or 1 48 | self._max_decoding_steps = max_decoding_steps 49 | self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size) 50 | 51 | # Dense embedding of source vocab tokens. 
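        # (In this G2P setup the source vocabulary is the grapheme inventory from
        # model/vocabulary/tokens.txt; the phone inventory lives in the separate
        # 'target_tokens' namespace, embedded further below.)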
52 | self._source_embedder = source_embedder 53 | 54 | # Encodes the sequence of source embeddings into a sequence of hidden states. 55 | self._encoder = encoder 56 | 57 | num_classes = self.vocab.get_vocab_size(self._target_namespace) 58 | 59 | # Attention mechanism applied to the encoder output for each step. 60 | if attention: 61 | if attention_function: 62 | raise ConfigurationError("You can only specify an attention module or an " 63 | "attention function, but not both.") 64 | self._attention = attention 65 | elif attention_function: 66 | self._attention = LegacyAttention(attention_function) 67 | else: 68 | self._attention = None 69 | 70 | # Dense embedding of vocab words in the target space. 71 | target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim() 72 | self._target_embedder = Embedding(num_classes, target_embedding_dim) 73 | 74 | # Decoder output dim needs to be the same as the encoder output dim since we initialize the 75 | # hidden state of the decoder with the final hidden state of the encoder. 76 | self._encoder_output_dim = self._encoder.get_output_dim() 77 | self._decoder_output_dim = self._encoder_output_dim 78 | 79 | if self._attention: 80 | # If using attention, a weighted average over encoder outputs will be concatenated 81 | # to the previous target embedding to form the input to the decoder at each 82 | # time step. 83 | self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim 84 | else: 85 | # Otherwise, the input to the decoder is just the previous target embedding. 86 | self._decoder_input_dim = target_embedding_dim 87 | 88 | # We'll use an LSTM cell as the recurrent cell that produces a hidden state 89 | # for the decoder at each time step. 90 | # TODO (pradeep): Do not hardcode decoder cell type. 91 | self._decoder_cell = LSTMCell( 92 | self._decoder_input_dim, 93 | self._decoder_output_dim, 94 | ) 95 | 96 | # We project the hidden state from the decoder into the output vocabulary space 97 | # in order to get log probabilities of each target token, at each time step. 98 | self._output_projection_layer = Linear(self._decoder_output_dim, num_classes) 99 | 100 | @overrides 101 | def forward(self, # type: ignore 102 | source_tokens: Dict[str, torch.LongTensor]) -> Dict[str, torch.Tensor]: 103 | # pylint: disable=arguments-differ 104 | state = self._encode(source_tokens) 105 | 106 | output_dict = {} 107 | state = self._init_decoder_state(state) 108 | predictions = self._forward_beam_search(state) 109 | output_dict.update(predictions) 110 | 111 | return output_dict 112 | 113 | 114 | 115 | @Tokenizer.register("whitespace_") 116 | @Tokenizer.register("just_spaces_") 117 | class WhitespaceTokenizer(Tokenizer): 118 | """ 119 | A `Tokenizer` that assumes you've already done your own tokenization somehow and have 120 | separated the tokens by spaces. We just split the input string on whitespace and return the 121 | resulting list. 122 | 123 | Note that we use `text.split()`, which means that the amount of whitespace between the 124 | tokens does not matter. This will never result in spaces being included as tokens. 125 | 126 | Registered as a `Tokenizer` with name "whitespace" and "just_spaces". 
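    (Note: in this project the decorators above register the class under the
    names "whitespace_" and "just_spaces_", presumably to avoid clashing with
    tokenizers already registered in AllenNLP; it is used in `src/model.py` as
    the target-side tokenizer that splits space-separated phone strings.)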
127 | """ 128 | 129 | @overrides 130 | def tokenize(self, text: str) -> List[Token]: 131 | return [Token(t) for t in text.split()] -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='russian_g2p_neuro', 5 | 6 | version='1.0.0', 7 | 8 | description='G2P tool for Russian language with vosk-model-ru styled transcriptions', 9 | 10 | # The project's main homepage. 11 | url='https://github.com/DinoTheDinosaur/russian_g2p_neuro', 12 | 13 | # Author details 14 | author='DinoTheDinosaur', 15 | 16 | # Choose your license 17 | license='MIT', 18 | 19 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 20 | classifiers=[ 21 | # How mature is this project? Common values are 22 | # 3 - Alpha 23 | # 4 - Beta 24 | # 5 - Production/Stable 25 | 'Development Status :: 3 - Alpha', 26 | 27 | # Indicate who your project is intended for 28 | 'Intended Audience :: Developers', 29 | 'Intended Audience :: Science/Research', 30 | 'Topic :: System :: Networking', 31 | 'Topic :: Scientific/Engineering', 32 | 33 | # Pick your license as you wish (should match "license" above) 34 | 'License :: OSI Approved :: MIT License', 35 | 36 | 'Operating System :: POSIX :: Linux', 37 | 38 | # Specify the Python versions you support here. In particular, ensure 39 | # that you indicate whether you support Python 2, Python 3 or both. 40 | 'Programming Language :: Python :: 3.8', 41 | ], 42 | 43 | # What does your project relate to? 44 | keywords='ASR speech recognition g2p russian grapheme-to-phoneme', 45 | 46 | # You can just specify the packages manually here if your project is 47 | # simple. Or you can use find_packages(). 48 | packages=find_packages(), 49 | scripts=['bin/generate_transcriptions'], 50 | 51 | # List run-time dependencies here. These will be installed by pip when 52 | # your project is installed. 
For an analysis of "install_requires" vs pip's 53 | # requirements files see: 54 | # https://packaging.python.org/en/latest/requirements.html 55 | install_requires=[ 56 | 'alabaster==0.7.12', 57 | 'allennlp==0.9.0', 58 | 'argon2-cffi==20.1.0', 59 | 'async-generator==1.10', 60 | 'attrs==20.3.0', 61 | 'Babel==2.9.0', 62 | 'backcall==0.2.0', 63 | 'bleach==3.3.0', 64 | 'blis==0.2.4', 65 | 'boto3==1.17.57', 66 | 'botocore==1.20.57', 67 | 'certifi==2020.12.5', 68 | 'cffi==1.14.5', 69 | 'chardet==4.0.0', 70 | 'click==7.1.2', 71 | 'conllu==1.3.1', 72 | 'cycler==0.10.0', 73 | 'cymem==2.0.5', 74 | 'decorator==5.0.7', 75 | 'defusedxml==0.7.1', 76 | 'docutils==0.16', 77 | 'editdistance==0.5.3', 78 | 'entrypoints==0.3', 79 | 'flaky==3.7.0', 80 | 'Flask==1.1.2', 81 | 'Flask-Cors==3.0.10', 82 | 'ftfy==6.0.1', 83 | 'future==0.18.2', 84 | 'gevent==21.1.2', 85 | 'greenlet==1.0.0', 86 | 'h5py==3.2.1', 87 | 'idna==2.10', 88 | 'imagesize==1.2.0', 89 | 'iniconfig==1.1.1', 90 | 'ipykernel==5.5.3', 91 | 'ipython==7.22.0', 92 | 'ipython-genutils==0.2.0', 93 | 'ipywidgets==7.6.3', 94 | 'itsdangerous==1.1.0', 95 | 'jedi==0.18.0', 96 | 'Jinja2==2.11.3', 97 | 'jmespath==0.10.0', 98 | 'joblib==1.0.1', 99 | 'jsonnet==0.17.0', 100 | 'jsonpickle==2.0.0', 101 | 'jsonschema==3.2.0', 102 | 'jupyter==1.0.0', 103 | 'jupyter-client==6.1.12', 104 | 'jupyter-console==6.4.0', 105 | 'jupyter-core==4.7.1', 106 | 'jupyterlab-pygments==0.1.2', 107 | 'jupyterlab-widgets==1.0.0', 108 | 'kiwisolver==1.3.1', 109 | 'MarkupSafe==1.1.1', 110 | 'matplotlib==3.4.1', 111 | 'mistune==0.8.4', 112 | 'murmurhash==1.0.5', 113 | 'nbclient==0.5.3', 114 | 'nbconvert==6.0.7', 115 | 'nbformat==5.1.3', 116 | 'nest-asyncio==1.5.1', 117 | 'nltk==3.6.2', 118 | 'notebook==6.3.0', 119 | 'numpy==1.20.2', 120 | 'numpydoc==1.1.0', 121 | 'overrides==4.1.2', 122 | 'packaging==20.9', 123 | 'pandas==1.2.4', 124 | 'pandocfilters==1.4.3', 125 | 'parsimonious==0.8.1', 126 | 'parso==0.8.2', 127 | 'pexpect==4.8.0', 128 | 'pickleshare==0.7.5', 129 | 'Pillow==8.2.0', 130 | 'plac==0.9.6', 131 | 'pluggy==0.13.1', 132 | 'preshed==2.0.1', 133 | 'prometheus-client==0.10.1', 134 | 'prompt-toolkit==3.0.18', 135 | 'protobuf==3.15.8', 136 | 'ptyprocess==0.7.0', 137 | 'py==1.10.0', 138 | 'pycparser==2.20', 139 | 'Pygments==2.8.1', 140 | 'pyparsing==2.4.7', 141 | 'pyrsistent==0.17.3', 142 | 'pytest==6.2.3', 143 | 'python-dateutil==2.8.1', 144 | 'pytorch-pretrained-bert==0.6.2', 145 | 'pytorch-transformers==1.1.0', 146 | 'pytz==2021.1', 147 | 'pyzmq==22.0.3', 148 | 'qtconsole==5.0.3', 149 | 'QtPy==1.9.0', 150 | 'regex==2021.4.4', 151 | 'requests==2.25.1', 152 | 'responses==0.13.2', 153 | 'russian-g2p-neuro==1.0.0', 154 | 's3transfer==0.4.2', 155 | 'scikit-learn==0.24.1', 156 | 'scipy==1.6.3', 157 | 'Send2Trash==1.5.0', 158 | 'sentencepiece==0.1.95', 159 | 'six==1.15.0', 160 | 'snowballstemmer==2.1.0', 161 | 'spacy==2.1.9', 162 | 'Sphinx==3.5.4', 163 | 'sphinxcontrib-applehelp==1.0.2', 164 | 'sphinxcontrib-devhelp==1.0.2', 165 | 'sphinxcontrib-htmlhelp==1.0.3', 166 | 'sphinxcontrib-jsmath==1.0.1', 167 | 'sphinxcontrib-qthelp==1.0.3', 168 | 'sphinxcontrib-serializinghtml==1.1.4', 169 | 'sqlparse==0.4.1', 170 | 'srsly==1.0.5', 171 | 'tensorboardX==2.2', 172 | 'terminado==0.9.4', 173 | 'testpath==0.4.4', 174 | 'thinc==7.0.8', 175 | 'threadpoolctl==2.1.0', 176 | 'toml==0.10.2', 177 | 'torch==1.5.0', 178 | 'tornado==6.1', 179 | 'tqdm==4.60.0', 180 | 'traitlets==5.0.5', 181 | 'typing-utils==0.0.3', 182 | 'Unidecode==1.2.0', 183 | 'urllib3==1.26.4', 184 | 'wasabi==0.8.2', 185 | 
'wcwidth==0.2.5', 186 | 'webencodings==0.5.1', 187 | 'Werkzeug==1.0.1', 188 | 'widgetsnbextension==3.5.1', 189 | 'word2number==1.1', 190 | 'zope.event==4.5.0', 191 | 'zope.interface==5.4.0' 192 | ], 193 | 194 | # List additional groups of dependencies here (e.g. development 195 | # dependencies). You can install these using the following syntax, 196 | # for example: 197 | # $ pip install -e .[dev,test] 198 | extras_require={}, 199 | 200 | # If there are data files included in your packages that need to be 201 | # installed, specify them here. If using Python 2.6 or less, then these 202 | # have to be included in MANIFEST.in as well. 203 | package_data={ 204 | 'model': [ 205 | 'weights.th', 'tokens.txt', 206 | 'non_padded_namespaces.txt', 207 | 'target_tokens.txt' 208 | ] 209 | }, 210 | data_files=[ 211 | ('model', ['model/weights.th']), 212 | ('model/vocabulary/', ['model/vocabulary/tokens.txt']), 213 | ('model/vocabulary/', ['model/vocabulary/target_tokens.txt']), 214 | ('model/vocabulary/', ['model/vocabulary/non_padded_namespaces.txt']), 215 | ], 216 | include_package_data=True, 217 | 218 | # To provide executable scripts, use entry points in preference to the 219 | # "scripts" keyword. Entry points provide cross-platform support and allow 220 | # pip to create the appropriate form of executable for the target platform. 221 | entry_points={}, 222 | ) 223 | --------------------------------------------------------------------------------
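For reference, a sketch of the round trip performed by the `generate_transcriptions` script (the words and transcriptions below are illustrative examples only, not guaranteed model output):
```
# input.txt — any Russian text or word list
Привет мир

# output.dict — one "word phone phone ..." line per unique lower-cased word, sorted
мир m i1 r
привет p rj i0 vj e1 t
```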