├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── elmoformanylangs ├── __init__.py ├── __main__.py ├── biLM.py ├── configs │ ├── cnn_0_100_512_4096_sample.json │ └── cnn_50_100_512_4096_sample.json ├── dataloader.py ├── elmo.py ├── frontend.py ├── modules │ ├── __init__.py │ ├── classify_layer.py │ ├── elmo.py │ ├── embedding_layer.py │ ├── encoder_base.py │ ├── highway.py │ ├── lstm.py │ ├── lstm_cell_with_projection.py │ ├── token_embedder.py │ └── util.py └── utils.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | .DS_Store 106 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 哈工大社会计算与信息检索研究中心 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include elmoformanylangs/configs/*.json 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Pre-trained ELMo Representations for Many Languages 2 | =================================================== 3 | 4 | We release our ELMo representations trained on many languages, 5 | which helped us win the [CoNLL 2018 shared task on Universal Dependencies Parsing](http://universaldependencies.org/conll18/results.html) 6 | according to LAS. 7 | 8 | ## Technical Details 9 | 10 | We use the same hyperparameter settings as [Peters et al. (2018)](https://arxiv.org/abs/1802.05365) for the biLM 11 | and the character CNN. 12 | For each language, we train these parameters 13 | on 20 million words of data randomly 14 | sampled from the raw text released by the shared task (wikidump + common crawl). 15 | Our implementation is largely based on the [AllenNLP](https://allennlp.org/) code, with the following changes: 16 | 17 | * We support Unicode characters; 18 | * We use the *sampled softmax* technique 19 | to make training with a large vocabulary feasible ([Jean et al., 2015](https://arxiv.org/abs/1412.2007)). 20 | However, we use a window of words surrounding the target word 21 | as negative samples, which showed better performance in our preliminary experiments. 22 | 23 | Training ELMo on one language takes roughly 3 days on an NVIDIA P100 GPU.
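To make the window-based negative sampling idea concrete, here is a minimal, illustrative sketch. It is not the package's actual implementation (that lives in `SampledSoftmaxLayer` in `elmoformanylangs/modules/classify_layer.py`); all names below are made up for the example.

```python
import torch
import torch.nn.functional as F

def window_sampled_softmax_loss(hidden, targets, embedding, window=3):
    """Cross-entropy over a reduced candidate set: for each position, the
    candidates are the gold word plus the words appearing within `window`
    positions of it in the same sentence (the "window" negatives).

    hidden:    (seq_len, dim) tensor of biLM outputs for one sentence
    targets:   (seq_len,) tensor of gold word ids
    embedding: nn.Embedding holding the output word embeddings (vocab, dim)
    """
    losses = []
    seq_len = targets.size(0)
    for t in range(seq_len):
        lo, hi = max(0, t - window), min(seq_len, t + window + 1)
        candidates = torch.unique(targets[lo:hi])        # gold word + neighbours
        logits = hidden[t] @ embedding(candidates).t()   # scores over candidates only
        gold = (candidates == targets[t]).nonzero()[0]   # index of the gold word
        losses.append(F.cross_entropy(logits.unsqueeze(0), gold))
    return torch.stack(losses).mean()
```

Restricting the candidate set this way keeps the softmax cheap while still contrasting the gold word with the words it actually co-occurs with, which is the intuition behind using window words as negatives.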
24 | 25 | 26 | ## Downloads 27 | 28 | | | | | | 29 | |---|---|---|---| 30 | | [Arabic](http://vectors.nlpl.eu/repository/11/136.zip) | [Bulgarian](http://vectors.nlpl.eu/repository/11/137.zip) | [Catalan](http://vectors.nlpl.eu/repository/11/138.zip) | [Czech](http://vectors.nlpl.eu/repository/11/139.zip) | 31 | | [Old Church Slavonic](http://vectors.nlpl.eu/repository/11/140.zip) | [Danish](http://vectors.nlpl.eu/repository/11/141.zip) | [German](http://vectors.nlpl.eu/repository/11/142.zip) | [Greek](http://vectors.nlpl.eu/repository/11/143.zip) | 32 | | [English](http://vectors.nlpl.eu/repository/11/144.zip) | [Spanish](http://vectors.nlpl.eu/repository/11/145.zip) | [Estonian](http://vectors.nlpl.eu/repository/11/146.zip) | [Basque](http://vectors.nlpl.eu/repository/11/147.zip) | 33 | | [Persian](http://vectors.nlpl.eu/repository/11/148.zip) | [Finnish](http://vectors.nlpl.eu/repository/11/149.zip) | [French](http://vectors.nlpl.eu/repository/11/150.zip) | [Irish](http://vectors.nlpl.eu/repository/11/151.zip) | 34 | | [Galician](http://vectors.nlpl.eu/repository/11/152.zip) | [Ancient Greek](http://vectors.nlpl.eu/repository/11/153.zip) | [Hebrew](http://vectors.nlpl.eu/repository/11/154.zip) | [Hindi](http://vectors.nlpl.eu/repository/11/155.zip) | 35 | | [Croatian](http://vectors.nlpl.eu/repository/11/156.zip) | [Hungarian](http://vectors.nlpl.eu/repository/11/157.zip) | [Indonesian](http://vectors.nlpl.eu/repository/11/158.zip) | [Italian](http://vectors.nlpl.eu/repository/11/159.zip) | 36 | | [Japanese](http://vectors.nlpl.eu/repository/11/160.zip) | [Korean](http://vectors.nlpl.eu/repository/11/161.zip) | [Latin](http://vectors.nlpl.eu/repository/11/162.zip) | [Latvian](http://vectors.nlpl.eu/repository/11/163.zip) | 37 | | [Norwegian Bokmål](http://vectors.nlpl.eu/repository/11/165.zip) | [Dutch](http://vectors.nlpl.eu/repository/11/164.zip) | [Norwegian Nynorsk](http://vectors.nlpl.eu/repository/11/166.zip) | [Polish](http://vectors.nlpl.eu/repository/11/167.zip) | 38 | | [Portuguese](http://vectors.nlpl.eu/repository/11/168.zip) | [Romanian](http://vectors.nlpl.eu/repository/11/169.zip) | [Russian](http://vectors.nlpl.eu/repository/11/170.zip) | [Slovak](http://vectors.nlpl.eu/repository/11/171.zip) | 39 | | [Slovene](http://vectors.nlpl.eu/repository/11/172.zip) | [Swedish](http://vectors.nlpl.eu/repository/11/173.zip) | [Turkish](http://vectors.nlpl.eu/repository/11/174.zip) | [Uyghur](http://vectors.nlpl.eu/repository/11/175.zip) | 40 | | [Ukrainian](http://vectors.nlpl.eu/repository/11/176.zip) | [Urdu](http://vectors.nlpl.eu/repository/11/177.zip) | [Vietnamese](http://vectors.nlpl.eu/repository/11/178.zip) | [Chinese](http://vectors.nlpl.eu/repository/11/179.zip) | 41 | 42 | The models are hosted on the [NLPL Vectors Repository](http://wiki.nlpl.eu/index.php/Vectors/home). 43 | 44 | **ELMo for Simplified Chinese** 45 | 46 | We also provided [simplified-Chinese ELMo](http://39.96.43.154/zhs.model.tar.bz2). 47 | It was trained on xinhua proportion of [Chinese gigawords-v5](https://catalog.ldc.upenn.edu/ldc2011t13), 48 | which is different from the Wikipedia for traditional Chinese ELMo. 
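If you prefer to script the download step, a minimal sketch using only the Python standard library is shown below. The URL is the Chinese entry from the table above; substitute the language you need, and note that the local file and directory names are just examples.

```python
import urllib.request
import zipfile

url = "http://vectors.nlpl.eu/repository/11/179.zip"   # Chinese model, from the table above
archive, _ = urllib.request.urlretrieve(url, "zht.model.zip")
with zipfile.ZipFile(archive) as zf:
    zf.extractall("zht.model")                          # unpack into a ${lang}.model directory
```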
49 | 50 | ## Prerequisites 51 | 52 | * Python >= 3.6 is **required** (with Python 3.5 you will run into this issue: https://github.com/HIT-SCIR/ELMoForManyLangs/issues/8) 53 | * PyTorch 0.4 54 | * the other requirements of AllenNLP 55 | 56 | ## Usage 57 | 58 | 59 | ### Install the package 60 | 61 | To use the embeddings, install the package with the following command: 62 | ``` 63 | python setup.py install 64 | ``` 65 | 66 | ### Set up the `config_path` 67 | After unzipping the model, you will find a JSON file `${lang}.model/config.json`. 68 | Please change the `"config_path"` field to the relative path to 69 | the model configuration `cnn_50_100_512_4096_sample.json`. 70 | For example, if your ELMo model is `zht.model/config.json` and your model configuration 71 | is `zht.model/cnn_50_100_512_4096_sample.json`, you need to change `"config_path"` 72 | in `zht.model/config.json` to `cnn_50_100_512_4096_sample.json`. 73 | 74 | If there is no configuration `cnn_50_100_512_4096_sample.json` under `${lang}.model`, 75 | you can copy `elmoformanylangs/configs/cnn_50_100_512_4096_sample.json` into `${lang}.model`, 76 | or change `"config_path"` to `elmoformanylangs/configs/cnn_50_100_512_4096_sample.json`. 77 | 78 | See [issue 27](https://github.com/HIT-SCIR/ELMoForManyLangs/issues/27) for more details. 79 | 80 | 81 | ### Use ELMoForManyLangs in command line 82 | 83 | Prepare your input file in the [CoNLL-U format](http://universaldependencies.org/format.html), like 84 | ``` 85 | 1 Sue Sue _ _ _ _ _ _ _ 86 | 2 likes like _ _ _ _ _ _ _ 87 | 3 coffee coffee _ _ _ _ _ _ _ 88 | 4 and and _ _ _ _ _ _ _ 89 | 5 Bill Bill _ _ _ _ _ _ _ 90 | 6 tea tea _ _ _ _ _ _ _ 91 | ``` 92 | Fields should be separated by `'\t'`. Only the second column is used, and spaces (`' '`) are allowed in 93 | this field (in Vietnamese, a word can contain spaces). 94 | Remember to tokenize your input! 95 | 96 | When it's all set, run 97 | 98 | ``` 99 | $ python -m elmoformanylangs test \ 100 | --input_format conll \ 101 | --input /path/to/your/input \ 102 | --model /path/to/your/model \ 103 | --output_prefix /path/to/your/output \ 104 | --output_format hdf5 \ 105 | --output_layer -1 106 | ``` 107 | 108 | It will dump an HDF5-encoded `dict` onto the disk, where the key is the `'\t'`-separated 109 | words of the sentence and the value is its 3-layer averaged ELMo representation. 110 | You can also dump the CNN-encoded words with `--output_layer 0`, 111 | the first layer of the LSTM with `--output_layer 1` and the second layer 112 | of the LSTM with `--output_layer 2`. 113 | We are actively changing the interface to bring it closer to the 114 | AllenNLP ELMo and make it friendlier for programmatic use. 115 | 116 | ### Use ELMoForManyLangs programmatically 117 | 118 | Thanks to @voidism for contributing the API. 119 | Using the `Embedder` Python object, you can use ELMo in your own code like this: 120 | 121 | ```python 122 | from elmoformanylangs import Embedder 123 | 124 | e = Embedder('/path/to/your/model/') 125 | 126 | sents = [['今', '天', '天氣', '真', '好', '阿'], 127 | ['潮水', '退', '了', '就', '知道', '誰', '沒', '穿', '褲子']] 128 | # the list of lists which stores the sentences, 129 | # segmented beforehand if necessary.
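# note: sentences are batched internally before being fed to the model
# (see the batch_size parameter below), so one call can embed many sentences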
130 | 131 | e.sents2elmo(sents) 132 | # will return a list of numpy arrays, 133 | # each with shape=(seq_len, embedding_size) 134 | ``` 135 | 136 | #### The parameters to initialize `Embedder`: 137 | ```python 138 | class Embedder(model_dir='/path/to/your/model/', batch_size=64): 139 | ``` 140 | - **model_dir**: the absolute path from the repo top dir to your model dir. 141 | - **batch_size**: the batch size used during inference; set it according to your GPU/CPU RAM size. (default: 64) 142 | 143 | #### The parameters of the function `sents2elmo`: 144 | ```python 145 | def sents2elmo(sents, output_layer=-1): 146 | ``` 147 | - **sents**: the list of lists which stores the sentences, segmented beforehand if necessary. 148 | - **output_layer**: the target layer to output. 149 | - 0 for the word encoder 150 | - 1 for the first LSTM hidden layer 151 | - 2 for the second LSTM hidden layer 152 | - -1 for the average of the 3 layers (default) 153 | - -2 for all 3 layers 154 | 155 | ## Training Your Own ELMo 156 | 157 | Please run 158 | ``` 159 | $ python -m elmoformanylangs.biLM train -h 160 | ``` 161 | to get more details about ELMo training. 162 | 163 | Here is an example of training an English ELMo. 164 | ``` 165 | $ less data/en.raw 166 | ... (snip) ... 167 | Notable alumni 168 | Aris Kalafatis ( Acting ) 169 | Labour Party 170 | They build an open nest in a tree hole , or man - made nest - boxes . 171 | Legacy 172 | ... (snip) ... 173 | 174 | $ python -m elmoformanylangs.biLM train \ 175 | --train_path data/en.raw \ 176 | --config_path elmoformanylangs/configs/cnn_50_100_512_4096_sample.json \ 177 | --model output/en \ 178 | --optimizer adam \ 179 | --lr 0.001 \ 180 | --lr_decay 0.8 \ 181 | --max_epoch 10 \ 182 | --max_sent_len 20 \ 183 | --max_vocab_size 150000 \ 184 | --min_count 3 185 | ``` 186 | However, we 187 | need to add that the training process is not very stable. 188 | In some cases, it ends up with a loss of `nan`. We are actively working on this and hope to 189 | improve it in the future. 190 | 191 | ## Citation 192 | 193 | If our ELMo gives you nice improvements, please cite us. 194 | 195 | ``` 196 | @InProceedings{che-EtAl:2018:K18-2, 197 | author = {Che, Wanxiang and Liu, Yijia and Wang, Yuxuan and Zheng, Bo and Liu, Ting}, 198 | title = {Towards Better {UD} Parsing: Deep Contextualized Word Embeddings, Ensemble, and Treebank Concatenation}, 199 | booktitle = {Proceedings of the {CoNLL} 2018 Shared Task: Multilingual Parsing from Raw Text to Universal Dependencies}, 200 | month = {October}, 201 | year = {2018}, 202 | address = {Brussels, Belgium}, 203 | publisher = {Association for Computational Linguistics}, 204 | pages = {55--64}, 205 | url = {http://www.aclweb.org/anthology/K18-2005} 206 | } 207 | ``` 208 | 209 | Please also cite the 210 | [NLPL Vectors Repository](http://wiki.nlpl.eu/index.php/Vectors/home) 211 | for hosting the models.
212 | ``` 213 | @InProceedings{fares-EtAl:2017:NoDaLiDa, 214 | author = {Fares, Murhaf and Kutuzov, Andrey and Oepen, Stephan and Velldal, Erik}, 215 | title = {Word vectors, reuse, and replicability: Towards a community repository of large-text resources}, 216 | booktitle = {Proceedings of the 21st Nordic Conference on Computational Linguistics}, 217 | month = {May}, 218 | year = {2017}, 219 | address = {Gothenburg, Sweden}, 220 | publisher = {Association for Computational Linguistics}, 221 | pages = {271--276}, 222 | url = {http://www.aclweb.org/anthology/W17-0237} 223 | } 224 | ``` 225 | -------------------------------------------------------------------------------- /elmoformanylangs/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from .elmo import Embedder 3 | 4 | 5 | import logging 6 | logger = logging.getLogger('elmoformanylangs') 7 | 8 | # if the client application hasn't set the log level, we set it 9 | # ourselves to INFO 10 | if logger.level == 0: 11 | logger.setLevel(logging.INFO) 12 | 13 | log_handler = logging.StreamHandler() 14 | log_formatter = logging.Formatter(fmt="%(asctime)-15s %(levelname)s: %(message)s") 15 | log_handler.setFormatter(log_formatter) 16 | 17 | # also, if the client hasn't added any handlers for this logger 18 | # (or a default handler), we add a handler of our own 19 | # 20 | # client can later do 21 | # logger.removeHandler(stanza.log_handler) 22 | if not logger.hasHandlers(): 23 | logger.addHandler(log_handler) 24 | -------------------------------------------------------------------------------- /elmoformanylangs/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | import os 5 | import sys 6 | import codecs 7 | import argparse 8 | import logging 9 | import json 10 | import torch 11 | from .modules.embedding_layer import EmbeddingLayer 12 | from .utils import dict2namedtuple 13 | from .frontend import Model 14 | from .frontend import create_batches 15 | import numpy as np 16 | import h5py 17 | 18 | logger = logging.getLogger('elmoformanylangs') 19 | 20 | 21 | def read_corpus(path, max_chars=None): 22 | """ 23 | read raw text file. The format of the input is like, one sentence per line 24 | words are separated by '\t' 25 | 26 | :param path: 27 | :param max_chars: int, the number of maximum characters in a word, this 28 | parameter is used when the model is configured with CNN word encoder. 29 | :return: 30 | """ 31 | dataset = [] 32 | textset = [] 33 | with codecs.open(path, 'r', encoding='utf-8') as fin: 34 | for line in fin.read().strip().split('\n'): 35 | data = [''] 36 | text = [] 37 | for token in line.split('\t'): 38 | text.append(token) 39 | if max_chars is not None and len(token) + 2 > max_chars: 40 | token = token[:max_chars - 2] 41 | data.append(token) 42 | data.append('') 43 | dataset.append(data) 44 | textset.append(text) 45 | return dataset, textset 46 | 47 | 48 | def read_conll_corpus(path, max_chars=None): 49 | """ 50 | read text in CoNLL-U format. 
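Comment lines starting with '#' are skipped, as are multi-word token
    ranges (ids containing '-') and empty nodes (ids containing '.');
    only the FORM column (the second field) of the remaining lines is used.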
51 | 52 | :param path: 53 | :param max_chars: 54 | :return: 55 | """ 56 | dataset = [] 57 | textset = [] 58 | with codecs.open(path, 'r', encoding='utf-8') as fin: 59 | for payload in fin.read().strip().split('\n\n'): 60 | data = [''] 61 | text = [] 62 | lines = payload.splitlines() 63 | body = [line for line in lines if not line.startswith('#')] 64 | for line in body: 65 | fields = line.split('\t') 66 | num, token = fields[0], fields[1] 67 | if '-' in num or '.' in num: 68 | continue 69 | text.append(token) 70 | if max_chars is not None and len(token) + 2 > max_chars: 71 | token = token[:max_chars - 2] 72 | data.append(token) 73 | data.append('') 74 | dataset.append(data) 75 | textset.append(text) 76 | return dataset, textset 77 | 78 | 79 | def read_conll_char_corpus(path, max_chars=None): 80 | """ 81 | 82 | :param path: 83 | :param max_chars: 84 | :return: 85 | """ 86 | dataset = [] 87 | textset = [] 88 | with codecs.open(path, 'r', encoding='utf-8') as fin: 89 | for payload in fin.read().strip().split('\n\n'): 90 | data = [''] 91 | text = [] 92 | lines = payload.splitlines() 93 | body = [line for line in lines if not line.startswith('#')] 94 | for line in body: 95 | fields = line.split('\t') 96 | num, token = fields[0], fields[1] 97 | if '-' in num or '.' in num: 98 | continue 99 | for ch in token: 100 | text.append(ch) 101 | if max_chars is not None and len(ch) + 2 > max_chars: 102 | ch = ch[:max_chars - 2] 103 | data.append(ch) 104 | data.append('') 105 | dataset.append(data) 106 | textset.append(text) 107 | return dataset, textset 108 | 109 | 110 | def read_conll_char_vi_corpus(path, max_chars=None): 111 | """ 112 | 113 | :param path: 114 | :param max_chars: 115 | :return: 116 | """ 117 | dataset = [] 118 | textset = [] 119 | with codecs.open(path, 'r', encoding='utf-8') as fin: 120 | for payload in fin.read().strip().split('\n\n'): 121 | data = [''] 122 | text = [] 123 | lines = payload.splitlines() 124 | body = [line for line in lines if not line.startswith('#')] 125 | for line in body: 126 | fields = line.split('\t') 127 | num, token = fields[0], fields[1] 128 | if '-' in num or '.' in num: 129 | continue 130 | for ch in token.split(): 131 | text.append(ch) 132 | if max_chars is not None and len(ch) + 2 > max_chars: 133 | ch = ch[:max_chars - 2] 134 | data.append(ch) 135 | data.append('') 136 | dataset.append(data) 137 | textset.append(text) 138 | return dataset, textset 139 | 140 | 141 | def test_main(): 142 | # Configurations 143 | cmd = argparse.ArgumentParser('The testing components of') 144 | cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.') 145 | cmd.add_argument('--input_format', default='plain', choices=('plain', 'conll', 'conll_char', 'conll_char_vi'), 146 | help='the input format.') 147 | cmd.add_argument("--input", help="the path to the raw text file.") 148 | cmd.add_argument("--output_format", default='hdf5', help='the output format. Supported format includes (hdf5, txt).' 149 | ' Use comma to separate the format identifiers,' 150 | ' like \'--output_format=hdf5,plain\'') 151 | cmd.add_argument("--output_prefix", help='the prefix of the output file. The output file is in the format of ' 152 | '..') 153 | cmd.add_argument("--output_layer", help='the target layer to output. 
0 for the word encoder, 1 for the first LSTM ' 154 | 'hidden layer, 2 for the second LSTM hidden layer, -1 for an average' 155 | 'of 3 layers.') 156 | cmd.add_argument("--model", required=True, help="the path to the model.") 157 | cmd.add_argument("--batch_size", "--batch", type=int, default=1, help='the batch size.') 158 | args = cmd.parse_args(sys.argv[2:]) 159 | 160 | if args.gpu >= 0: 161 | torch.cuda.set_device(args.gpu) 162 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 163 | # load the model configurations 164 | args2 = dict2namedtuple(json.load(codecs.open(os.path.join(args.model, 'config.json'), 'r', encoding='utf-8'))) 165 | 166 | with open(os.path.join(args.model, args2.config_path), 'r') as fin: 167 | config = json.load(fin) 168 | 169 | # For the model trained with character-based word encoder. 170 | if config['token_embedder']['char_dim'] > 0: 171 | char_lexicon = {} 172 | with codecs.open(os.path.join(args.model, 'char.dic'), 'r', encoding='utf-8') as fpi: 173 | for line in fpi: 174 | tokens = line.strip().split('\t') 175 | if len(tokens) == 1: 176 | tokens.insert(0, '\u3000') 177 | token, i = tokens 178 | char_lexicon[token] = int(i) 179 | char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False, embs=None) 180 | logger.info('char embedding size: ' + str(len(char_emb_layer.word2id))) 181 | else: 182 | char_lexicon = None 183 | char_emb_layer = None 184 | 185 | # For the model trained with word form word encoder. 186 | if config['token_embedder']['word_dim'] > 0: 187 | word_lexicon = {} 188 | with codecs.open(os.path.join(args.model, 'word.dic'), 'r', encoding='utf-8') as fpi: 189 | for line in fpi: 190 | tokens = line.strip().split('\t') 191 | if len(tokens) == 1: 192 | tokens.insert(0, '\u3000') 193 | token, i = tokens 194 | word_lexicon[token] = int(i) 195 | word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=None) 196 | logger.info('word embedding size: ' + str(len(word_emb_layer.word2id))) 197 | else: 198 | word_lexicon = None 199 | word_emb_layer = None 200 | 201 | # instantiate the model 202 | model = Model(config, word_emb_layer, char_emb_layer, use_cuda) 203 | 204 | if use_cuda: 205 | model.cuda() 206 | 207 | logger.info(str(model)) 208 | model.load_model(args.model) 209 | 210 | # read test data according to input format 211 | read_function = read_corpus if args.input_format == 'plain' else ( 212 | read_conll_corpus if args.input_format == 'conll' else ( 213 | read_conll_char_corpus if args.input_format == 'conll_char' else read_conll_char_vi_corpus)) 214 | 215 | if config['token_embedder']['name'].lower() == 'cnn': 216 | test, text = read_function(args.input, config['token_embedder']['max_characters_per_token']) 217 | else: 218 | test, text = read_function(args.input) 219 | 220 | # create test batches from the input data. 221 | test_w, test_c, test_lens, test_masks, test_text = create_batches( 222 | test, args.batch_size, word_lexicon, char_lexicon, config, text=text) 223 | 224 | # configure the model to evaluation mode. 
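    # (this disables dropout, so the dumped ELMo vectors are deterministic across runs)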
225 | model.eval() 226 | 227 | sent_set = set() 228 | cnt = 0 229 | 230 | output_formats = args.output_format.split(',') 231 | output_layers = map(int, args.output_layer.split(',')) 232 | 233 | handlers = {} 234 | for output_format in output_formats: 235 | if output_format not in ('hdf5', 'txt'): 236 | print('Unknown output_format: {0}'.format(output_format)) 237 | continue 238 | for output_layer in output_layers: 239 | filename = '{0}.ly{1}.{2}'.format(args.output_prefix, output_layer, output_format) 240 | handlers[output_format, output_layer] = \ 241 | h5py.File(filename, 'w') if output_format == 'hdf5' else open(filename, 'w') 242 | 243 | for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks, test_text): 244 | output = model.forward(w, c, masks) 245 | for i, text in enumerate(texts): 246 | sent = '\t'.join(text) 247 | sent = sent.replace('.', '$period$') 248 | sent = sent.replace('/', '$backslash$') 249 | if sent in sent_set: 250 | continue 251 | sent_set.add(sent) 252 | if config['encoder']['name'].lower() == 'lstm': 253 | data = output[i, 1:lens[i]-1, :].data 254 | if use_cuda: 255 | data = data.cpu() 256 | data = data.numpy() 257 | elif config['encoder']['name'].lower() == 'elmo': 258 | data = output[:, i, 1:lens[i]-1, :].data 259 | if use_cuda: 260 | data = data.cpu() 261 | data = data.numpy() 262 | 263 | for (output_format, output_layer) in handlers: 264 | fout = handlers[output_format, output_layer] 265 | if output_layer == -1: 266 | payload = np.average(data, axis=0) 267 | else: 268 | payload = data[output_layer] 269 | if output_format == 'hdf5': 270 | fout.create_dataset(sent, payload.shape, dtype='float32', data=payload) 271 | else: 272 | for word, row in zip(text, payload): 273 | print('{0}\t{1}'.format(word, '\t'.join(['{0:.8f}'.format(elem) for elem in row])), file=fout) 274 | print('', file=fout) 275 | 276 | cnt += 1 277 | if cnt % 1000 == 0: 278 | logger.info('Finished {0} sentences.'.format(cnt)) 279 | for _, handler in handlers.items(): 280 | handler.close() 281 | 282 | 283 | if __name__ == "__main__": 284 | if len(sys.argv) > 1 and sys.argv[1] == 'test': 285 | test_main() 286 | else: 287 | print('Usage: {0} [test] [options]'.format(sys.argv[0]), file=sys.stderr) 288 | -------------------------------------------------------------------------------- /elmoformanylangs/biLM.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | import os 5 | import errno 6 | import sys 7 | import codecs 8 | import argparse 9 | import time 10 | import random 11 | import logging 12 | import json 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | import torch.optim as optim 17 | from torch.autograd import Variable 18 | from .modules.elmo import ElmobiLm 19 | from .modules.lstm import LstmbiLm 20 | from .modules.token_embedder import ConvTokenEmbedder, LstmTokenEmbedder 21 | from .modules.embedding_layer import EmbeddingLayer 22 | from .modules.classify_layer import SoftmaxLayer, CNNSoftmaxLayer, SampledSoftmaxLayer 23 | from .dataloader import load_embedding 24 | from .utils import dict2namedtuple 25 | from collections import Counter 26 | import numpy as np 27 | 28 | logger = logging.getLogger('elmoformanylangs') 29 | 30 | 31 | def divide(data, valid_size): 32 | valid_size = min(valid_size, len(data) // 10) 33 | random.shuffle(data) 34 | return data[valid_size:], data[:valid_size] 35 | 36 | 37 | 
def break_sentence(sentence, max_sent_len): 38 | """ 39 | For example, for a sentence with 70 words, supposing the the `max_sent_len' 40 | is 30, break it into 3 sentences. 41 | 42 | :param sentence: list[str] the sentence 43 | :param max_sent_len: 44 | :return: 45 | """ 46 | ret = [] 47 | cur = 0 48 | length = len(sentence) 49 | while cur < length: 50 | if cur + max_sent_len + 5 >= length: 51 | ret.append(sentence[cur: length]) 52 | break 53 | ret.append(sentence[cur: min(length, cur + max_sent_len)]) 54 | cur += max_sent_len 55 | return ret 56 | 57 | 58 | def read_corpus(path, max_chars=None, max_sent_len=20): 59 | """ 60 | read raw text file 61 | :param path: str 62 | :param max_chars: int 63 | :param max_sent_len: int 64 | :return: 65 | """ 66 | data = [] 67 | with codecs.open(path, 'r', encoding='utf-8') as fin: 68 | for line in fin: 69 | data.append('') 70 | for token in line.strip().split(): 71 | if max_chars is not None and len(token) + 2 > max_chars: 72 | token = token[:max_chars - 2] 73 | data.append(token) 74 | data.append('') 75 | dataset = break_sentence(data, max_sent_len) 76 | return dataset 77 | 78 | 79 | def create_one_batch(x, word2id, char2id, config, oov='', pad='', sort=True): 80 | """ 81 | 82 | :param x: 83 | :param word2id: dict 84 | :param char2id: dict 85 | :param config: 86 | :param oov: 87 | :param pad: 88 | :param sort: 89 | :return: 90 | """ 91 | batch_size = len(x) 92 | lst = list(range(batch_size)) 93 | if sort: 94 | lst.sort(key=lambda l: -len(x[l])) 95 | 96 | x = [x[i] for i in lst] 97 | lens = [len(x[i]) for i in lst] 98 | max_len = max(lens) 99 | 100 | if word2id is not None: 101 | oov_id, pad_id = word2id.get(oov, None), word2id.get(pad, None) 102 | assert oov_id is not None and pad_id is not None 103 | batch_w = torch.LongTensor(batch_size, max_len).fill_(pad_id) 104 | for i, x_i in enumerate(x): 105 | for j, x_ij in enumerate(x_i): 106 | batch_w[i][j] = word2id.get(x_ij, oov_id) 107 | else: 108 | batch_w = None 109 | 110 | if char2id is not None: 111 | bow_id, eow_id, oov_id, pad_id = char2id.get('', None), char2id.get('', None), char2id.get(oov, None), char2id.get(pad, None) 112 | 113 | assert bow_id is not None and eow_id is not None and oov_id is not None and pad_id is not None 114 | 115 | if config['token_embedder']['name'].lower() == 'cnn': 116 | max_chars = config['token_embedder']['max_characters_per_token'] 117 | assert max([len(w) for i in lst for w in x[i]]) + 2 <= max_chars 118 | elif config['token_embedder']['name'].lower() == 'lstm': 119 | max_chars = max([len(w) for i in lst for w in x[i]]) + 2 # counting the and 120 | 121 | batch_c = torch.LongTensor(batch_size, max_len, max_chars).fill_(pad_id) 122 | 123 | for i, x_i in enumerate(x): 124 | for j, x_ij in enumerate(x_i): 125 | batch_c[i][j][0] = bow_id 126 | if x_ij == '' or x_ij == '': 127 | batch_c[i][j][1] = char2id.get(x_ij) 128 | batch_c[i][j][2] = eow_id 129 | else: 130 | for k, c in enumerate(x_ij): 131 | batch_c[i][j][k + 1] = char2id.get(c, oov_id) 132 | batch_c[i][j][len(x_ij) + 1] = eow_id 133 | else: 134 | batch_c = None 135 | 136 | masks = [torch.LongTensor(batch_size, max_len).fill_(0), [], []] 137 | 138 | for i, x_i in enumerate(x): 139 | for j in range(len(x_i)): 140 | masks[0][i][j] = 1 141 | if j + 1 < len(x_i): 142 | masks[1].append(i * max_len + j) 143 | if j > 0: 144 | masks[2].append(i * max_len + j) 145 | 146 | assert len(masks[1]) <= batch_size * max_len 147 | assert len(masks[2]) <= batch_size * max_len 148 | 149 | masks[1] = torch.LongTensor(masks[1]) 150 | 
masks[2] = torch.LongTensor(masks[2]) 151 | 152 | return batch_w, batch_c, lens, masks 153 | 154 | 155 | # shuffle training examples and create mini-batches 156 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=True, sort=True, use_cuda=False): 157 | """ 158 | 159 | :param x: 160 | :param batch_size: 161 | :param word2id: 162 | :param char2id: 163 | :param config: 164 | :param perm: 165 | :param shuffle: 166 | :param sort: 167 | :param use_cuda: 168 | :return: 169 | """ 170 | lst = perm or list(range(len(x))) 171 | if shuffle: 172 | random.shuffle(lst) 173 | 174 | if sort: 175 | lst.sort(key=lambda l: -len(x[l])) 176 | 177 | x = [x[i] for i in lst] 178 | 179 | sum_len = 0.0 180 | batches_w, batches_c, batches_lens, batches_masks = [], [], [], [] 181 | size = batch_size 182 | nbatch = (len(x) - 1) // size + 1 183 | for i in range(nbatch): 184 | start_id, end_id = i * size, (i + 1) * size 185 | bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort) 186 | sum_len += sum(blens) 187 | batches_w.append(bw) 188 | batches_c.append(bc) 189 | batches_lens.append(blens) 190 | batches_masks.append(bmasks) 191 | 192 | if sort: 193 | perm = list(range(nbatch)) 194 | random.shuffle(perm) 195 | batches_w = [batches_w[i] for i in perm] 196 | batches_c = [batches_c[i] for i in perm] 197 | batches_lens = [batches_lens[i] for i in perm] 198 | batches_masks = [batches_masks[i] for i in perm] 199 | 200 | logger.info("{} batches, avg len: {:.1f}".format(nbatch, sum_len / len(x))) 201 | return batches_w, batches_c, batches_lens, batches_masks 202 | 203 | 204 | class Model(nn.Module): 205 | def __init__(self, config, word_emb_layer, char_emb_layer, n_class, use_cuda=False): 206 | super(Model, self).__init__() 207 | self.use_cuda = use_cuda 208 | self.config = config 209 | 210 | if config['token_embedder']['name'].lower() == 'cnn': 211 | self.token_embedder = ConvTokenEmbedder(config, word_emb_layer, char_emb_layer, use_cuda) 212 | elif config['token_embedder']['name'].lower() == 'lstm': 213 | self.token_embedder = LstmTokenEmbedder(config, word_emb_layer, char_emb_layer, use_cuda) 214 | 215 | if config['encoder']['name'].lower() == 'elmo': 216 | self.encoder = ElmobiLm(config, use_cuda) 217 | elif config['encoder']['name'].lower() == 'lstm': 218 | self.encoder = LstmbiLm(config, use_cuda) 219 | 220 | self.output_dim = config['encoder']['projection_dim'] 221 | if config['classifier']['name'].lower() == 'softmax': 222 | self.classify_layer = SoftmaxLayer(self.output_dim, n_class) 223 | elif config['classifier']['name'].lower() == 'cnn_softmax': 224 | self.classify_layer = CNNSoftmaxLayer(self.token_embedder, self.output_dim, n_class, 225 | config['classifier']['n_samples'], config['classifier']['corr_dim'], 226 | use_cuda) 227 | elif config['classifier']['name'].lower() == 'sampled_softmax': 228 | self.classify_layer = SampledSoftmaxLayer(self.output_dim, n_class, config['classifier']['n_samples'], use_cuda) 229 | 230 | def forward(self, word_inp, chars_inp, mask_package): 231 | """ 232 | 233 | :param word_inp: 234 | :param chars_inp: 235 | :param mask_package: Tuple[] 236 | :return: 237 | """ 238 | classifier_name = self.config['classifier']['name'].lower() 239 | 240 | if self.training and classifier_name == 'cnn_softmax' or classifier_name == 'sampled_softmax': 241 | self.classify_layer.update_negative_samples(word_inp, chars_inp, mask_package[0]) 242 | self.classify_layer.update_embedding_matrix() 243 | 244 | token_embedding = 
self.token_embedder(word_inp, chars_inp, (mask_package[0].size(0), mask_package[0].size(1))) 245 | token_embedding = F.dropout(token_embedding, self.config['dropout'], self.training) 246 | 247 | encoder_name = self.config['encoder']['name'].lower() 248 | if encoder_name == 'elmo': 249 | mask = Variable(mask_package[0].cuda()).cuda() if self.use_cuda else Variable(mask_package[0]) 250 | encoder_output = self.encoder(token_embedding, mask) 251 | encoder_output = encoder_output[1] 252 | # [batch_size, len, hidden_size] 253 | elif encoder_name == 'lstm': 254 | encoder_output = self.encoder(token_embedding) 255 | else: 256 | raise ValueError('') 257 | 258 | encoder_output = F.dropout(encoder_output, self.config['dropout'], self.training) 259 | forward, backward = encoder_output.split(self.output_dim, 2) 260 | 261 | word_inp = Variable(word_inp) 262 | if self.use_cuda: 263 | word_inp = word_inp.cuda() 264 | 265 | mask1 = Variable(mask_package[1].cuda()).cuda() if self.use_cuda else Variable(mask_package[1]) 266 | mask2 = Variable(mask_package[2].cuda()).cuda() if self.use_cuda else Variable(mask_package[2]) 267 | 268 | forward_x = forward.contiguous().view(-1, self.output_dim).index_select(0, mask1) 269 | forward_y = word_inp.contiguous().view(-1).index_select(0, mask2) 270 | 271 | backward_x = backward.contiguous().view(-1, self.output_dim).index_select(0, mask2) 272 | backward_y = word_inp.contiguous().view(-1).index_select(0, mask1) 273 | 274 | return self.classify_layer(forward_x, forward_y), self.classify_layer(backward_x, backward_y) 275 | 276 | def save_model(self, path, save_classify_layer): 277 | torch.save(self.token_embedder.state_dict(), os.path.join(path, 'token_embedder.pkl')) 278 | torch.save(self.encoder.state_dict(), os.path.join(path, 'encoder.pkl')) 279 | if save_classify_layer: 280 | torch.save(self.classify_layer.state_dict(), os.path.join(path, 'classifier.pkl')) 281 | 282 | def load_model(self, path): 283 | self.token_embedder.load_state_dict(torch.load(os.path.join(path, 'token_embedder.pkl'))) 284 | self.encoder.load_state_dict(torch.load(os.path.join(path, 'encoder.pkl'))) 285 | self.classify_layer.load_state_dict(torch.load(os.path.join(path, 'classifier.pkl'))) 286 | 287 | 288 | def eval_model(model, valid): 289 | model.eval() 290 | if model.config['classifier']['name'].lower() == 'cnn_softmax' or \ 291 | model.config['classifier']['name'].lower() == 'sampled_softmax': 292 | model.classify_layer.update_embedding_matrix() 293 | total_loss, total_tag = 0.0, 0 294 | valid_w, valid_c, valid_lens, valid_masks = valid 295 | for w, c, lens, masks in zip(valid_w, valid_c, valid_lens, valid_masks): 296 | loss_forward, loss_backward = model.forward(w, c, masks) 297 | total_loss += loss_forward.data[0] 298 | n_tags = sum(lens) 299 | total_tag += n_tags 300 | model.train() 301 | return np.exp(total_loss / total_tag) 302 | 303 | 304 | def train_model(epoch, opt, model, optimizer, 305 | train, valid, test, best_train, best_valid, test_result): 306 | """ 307 | Training model for one epoch 308 | 309 | :param epoch: 310 | :param opt: 311 | :param model: 312 | :param optimizer: 313 | :param train: 314 | :param best_train: 315 | :param valid: 316 | :param best_valid: 317 | :param test: 318 | :param test_result: 319 | :return: 320 | """ 321 | model.train() 322 | 323 | total_loss, total_tag = 0.0, 0 324 | cnt = 0 325 | start_time = time.time() 326 | 327 | train_w, train_c, train_lens, train_masks = train 328 | 329 | lst = list(range(len(train_w))) 330 | random.shuffle(lst) 331 | 332 | 
train_w = [train_w[l] for l in lst] 333 | train_c = [train_c[l] for l in lst] 334 | train_lens = [train_lens[l] for l in lst] 335 | train_masks = [train_masks[l] for l in lst] 336 | 337 | for w, c, lens, masks in zip(train_w, train_c, train_lens, train_masks): 338 | cnt += 1 339 | model.zero_grad() 340 | loss_forward, loss_backward = model.forward(w, c, masks) 341 | 342 | loss = (loss_forward + loss_backward) / 2.0 343 | total_loss += loss_forward.data[0] 344 | n_tags = sum(lens) 345 | total_tag += n_tags 346 | loss.backward() 347 | 348 | torch.nn.utils.clip_grad_norm(model.parameters(), opt.clip_grad) 349 | optimizer.step() 350 | if cnt * opt.batch_size % 1024 == 0: 351 | logger.info("Epoch={} iter={} lr={:.6f} train_ppl={:.6f} time={:.2f}s".format( 352 | epoch, cnt, optimizer.param_groups[0]['lr'], 353 | np.exp(total_loss / total_tag), time.time() - start_time 354 | )) 355 | start_time = time.time() 356 | 357 | if cnt % opt.eval_steps == 0 or cnt % len(train_w) == 0: 358 | if valid is None: 359 | train_ppl = np.exp(total_loss / total_tag) 360 | logger.info("Epoch={} iter={} lr={:.6f} train_ppl={:.6f}".format( 361 | epoch, cnt, optimizer.param_groups[0]['lr'], train_ppl)) 362 | if train_ppl < best_train: 363 | best_train = train_ppl 364 | logger.info("New record achieved on training dataset!") 365 | model.save_model(opt.model, opt.save_classify_layer) 366 | else: 367 | valid_ppl = eval_model(model, valid) 368 | logger.info("Epoch={} iter={} lr={:.6f} valid_ppl={:.6f}".format( 369 | epoch, cnt, optimizer.param_groups[0]['lr'], valid_ppl)) 370 | 371 | if valid_ppl < best_valid: 372 | model.save_model(opt.model, opt.save_classify_layer) 373 | best_valid = valid_ppl 374 | logger.info("New record achieved!") 375 | 376 | if test is not None: 377 | test_result = eval_model(model, test) 378 | logger.info("Epoch={} iter={} lr={:.6f} test_ppl={:.6f}".format( 379 | epoch, cnt, optimizer.param_groups[0]['lr'], test_result)) 380 | return best_train, best_valid, test_result 381 | 382 | 383 | def get_truncated_vocab(dataset, min_count): 384 | """ 385 | 386 | :param dataset: 387 | :param min_count: int 388 | :return: 389 | """ 390 | word_count = Counter() 391 | for sentence in dataset: 392 | word_count.update(sentence) 393 | 394 | word_count = list(word_count.items()) 395 | word_count.sort(key=lambda x: x[1], reverse=True) 396 | 397 | i = 0 398 | for word, count in word_count: 399 | if count < min_count: 400 | break 401 | i += 1 402 | 403 | logger.info('Truncated word count: {0}.'.format(sum([count for word, count in word_count[i:]]))) 404 | logger.info('Original vocabulary size: {0}.'.format(len(word_count))) 405 | return word_count[:i] 406 | 407 | 408 | def train(): 409 | cmd = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve') 410 | cmd.add_argument('--seed', default=1, type=int, help='The random seed.') 411 | cmd.add_argument('--gpu', default=-1, type=int, help='Use id of gpu, -1 if cpu.') 412 | 413 | cmd.add_argument('--train_path', required=True, help='The path to the training file.') 414 | cmd.add_argument('--valid_path', help='The path to the development file.') 415 | cmd.add_argument('--test_path', help='The path to the testing file.') 416 | 417 | cmd.add_argument('--config_path', required=True, help='the path to the config file.') 418 | cmd.add_argument("--word_embedding", help="The path to word vectors.") 419 | 420 | cmd.add_argument('--optimizer', default='sgd', choices=['sgd', 'adam', 'adagrad'], 421 | help='the type of optimizer: valid options=[sgd, adam, adagrad]') 422 | 
cmd.add_argument("--lr", type=float, default=0.01, help='the learning rate.') 423 | cmd.add_argument("--lr_decay", type=float, default=0, help='the learning rate decay.') 424 | 425 | cmd.add_argument("--model", required=True, help="path to save model") 426 | 427 | cmd.add_argument("--batch_size", "--batch", type=int, default=32, help='the batch size.') 428 | cmd.add_argument("--max_epoch", type=int, default=100, help='the maximum number of iteration.') 429 | 430 | cmd.add_argument("--clip_grad", type=float, default=5, help='the tense of clipped grad.') 431 | 432 | cmd.add_argument('--max_sent_len', type=int, default=20, help='maximum sentence length.') 433 | 434 | cmd.add_argument('--min_count', type=int, default=5, help='minimum word count.') 435 | 436 | cmd.add_argument('--max_vocab_size', type=int, default=150000, help='maximum vocabulary size.') 437 | 438 | cmd.add_argument('--save_classify_layer', default=False, action='store_true', 439 | help="whether to save the classify layer") 440 | 441 | cmd.add_argument('--valid_size', type=int, default=0, help="size of validation dataset when there's no valid.") 442 | cmd.add_argument('--eval_steps', required=False, type=int, help='report every xx batches.') 443 | 444 | opt = cmd.parse_args(sys.argv[2:]) 445 | 446 | with open(opt.config_path, 'r') as fin: 447 | config = json.load(fin) 448 | 449 | # Dump configurations 450 | print(opt) 451 | print(config) 452 | 453 | # set seed. 454 | torch.manual_seed(opt.seed) 455 | random.seed(opt.seed) 456 | if opt.gpu >= 0: 457 | torch.cuda.set_device(opt.gpu) 458 | if opt.seed > 0: 459 | torch.cuda.manual_seed(opt.seed) 460 | 461 | use_cuda = opt.gpu >= 0 and torch.cuda.is_available() 462 | 463 | token_embedder_name = config['token_embedder']['name'].lower() 464 | token_embedder_max_chars = config['token_embedder'].get('max_characters_per_token', None) 465 | if token_embedder_name == 'cnn': 466 | train_data = read_corpus(opt.train_path, token_embedder_max_chars, opt.max_sent_len) 467 | elif token_embedder_name == 'lstm': 468 | train_data = read_corpus(opt.train_path, opt.max_sent_len) 469 | else: 470 | raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name)) 471 | 472 | logger.info('training instance: {}, training tokens: {}.'.format(len(train_data), 473 | sum([len(s) - 1 for s in train_data]))) 474 | 475 | if opt.valid_path is not None: 476 | if token_embedder_name == 'cnn': 477 | valid_data = read_corpus(opt.valid_path, token_embedder_max_chars, opt.max_sent_len) 478 | elif token_embedder_name == 'lstm': 479 | valid_data = read_corpus(opt.valid_path, opt.max_sent_len) 480 | else: 481 | raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name)) 482 | logger.info('valid instance: {}, valid tokens: {}.'.format(len(valid_data), 483 | sum([len(s) - 1 for s in valid_data]))) 484 | elif opt.valid_size > 0: 485 | train_data, valid_data = divide(train_data, opt.valid_size) 486 | logger.info('training instance: {}, training tokens after division: {}.'.format( 487 | len(train_data), sum([len(s) - 1 for s in train_data]))) 488 | logger.info('valid instance: {}, valid tokens: {}.'.format( 489 | len(valid_data), sum([len(s) - 1 for s in valid_data]))) 490 | else: 491 | valid_data = None 492 | 493 | if opt.test_path is not None: 494 | if token_embedder_name == 'cnn': 495 | test_data = read_corpus(opt.test_path, token_embedder_max_chars, opt.max_sent_len) 496 | elif token_embedder_name == 'lstm': 497 | test_data = read_corpus(opt.test_path, opt.max_sent_len) 498 | else: 499 | 
raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name)) 500 | logger.info('testing instance: {}, testing tokens: {}.'.format( 501 | len(test_data), sum([len(s) - 1 for s in test_data]))) 502 | else: 503 | test_data = None 504 | 505 | if opt.word_embedding is not None: 506 | embs = load_embedding(opt.word_embedding) 507 | word_lexicon = {word: i for i, word in enumerate(embs[0])} 508 | else: 509 | embs = None 510 | word_lexicon = {} 511 | 512 | # Maintain the vocabulary. vocabulary is used in either WordEmbeddingInput or softmax classification 513 | vocab = get_truncated_vocab(train_data, opt.min_count) 514 | 515 | # Ensure index of '' is 0 516 | for special_word in ['', '', '', '']: 517 | if special_word not in word_lexicon: 518 | word_lexicon[special_word] = len(word_lexicon) 519 | 520 | for word, _ in vocab: 521 | if word not in word_lexicon: 522 | word_lexicon[word] = len(word_lexicon) 523 | 524 | # Word Embedding 525 | if config['token_embedder']['word_dim'] > 0: 526 | word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=embs) 527 | logger.info('Word embedding size: {0}'.format(len(word_emb_layer.word2id))) 528 | else: 529 | word_emb_layer = None 530 | logger.info('Vocabulary size: {0}'.format(len(word_lexicon))) 531 | 532 | # Character Lexicon 533 | if config['token_embedder']['char_dim'] > 0: 534 | char_lexicon = {} 535 | for sentence in train_data: 536 | for word in sentence: 537 | for ch in word: 538 | if ch not in char_lexicon: 539 | char_lexicon[ch] = len(char_lexicon) 540 | 541 | for special_char in ['', '', '', '', '', '']: 542 | if special_char not in char_lexicon: 543 | char_lexicon[special_char] = len(char_lexicon) 544 | 545 | char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False) 546 | logger.info('Char embedding size: {0}'.format(len(char_emb_layer.word2id))) 547 | else: 548 | char_lexicon = None 549 | char_emb_layer = None 550 | 551 | train = create_batches( 552 | train_data, opt.batch_size, word_lexicon, char_lexicon, config, use_cuda=use_cuda) 553 | 554 | if opt.eval_steps is None: 555 | opt.eval_steps = len(train[0]) 556 | logger.info('Evaluate every {0} batches.'.format(opt.eval_steps)) 557 | 558 | if valid_data is not None: 559 | valid = create_batches( 560 | valid_data, opt.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda) 561 | else: 562 | valid = None 563 | 564 | if test_data is not None: 565 | test = create_batches( 566 | test_data, opt.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda) 567 | else: 568 | test = None 569 | 570 | label_to_ix = word_lexicon 571 | logger.info('vocab size: {0}'.format(len(label_to_ix))) 572 | 573 | nclasses = len(label_to_ix) 574 | 575 | model = Model(config, word_emb_layer, char_emb_layer, nclasses, use_cuda) 576 | logger.info(str(model)) 577 | if use_cuda: 578 | model = model.cuda() 579 | 580 | need_grad = lambda x: x.requires_grad 581 | if opt.optimizer.lower() == 'adam': 582 | optimizer = optim.Adam(filter(need_grad, model.parameters()), lr=opt.lr) 583 | elif opt.optimizer.lower() == 'sgd': 584 | optimizer = optim.SGD(filter(need_grad, model.parameters()), lr=opt.lr) 585 | elif opt.optimizer.lower() == 'adagrad': 586 | optimizer = optim.Adagrad(filter(need_grad, model.parameters()), lr=opt.lr) 587 | else: 588 | raise ValueError('Unknown optimizer {}'.format(opt.optimizer.lower())) 589 | 590 | try: 591 | os.makedirs(opt.model) 
592 | except OSError as exception: 593 | if exception.errno != errno.EEXIST: 594 | raise 595 | 596 | if config['token_embedder']['char_dim'] > 0: 597 | with codecs.open(os.path.join(opt.model, 'char.dic'), 'w', encoding='utf-8') as fpo: 598 | for ch, i in char_emb_layer.word2id.items(): 599 | print('{0}\t{1}'.format(ch, i), file=fpo) 600 | 601 | with codecs.open(os.path.join(opt.model, 'word.dic'), 'w', encoding='utf-8') as fpo: 602 | for w, i in word_lexicon.items(): 603 | print('{0}\t{1}'.format(w, i), file=fpo) 604 | 605 | json.dump(vars(opt), codecs.open(os.path.join(opt.model, 'config.json'), 'w', encoding='utf-8')) 606 | 607 | best_train = 1e+8 608 | best_valid = 1e+8 609 | test_result = 1e+8 610 | 611 | for epoch in range(opt.max_epoch): 612 | best_train, best_valid, test_result = train_model(epoch, opt, model, optimizer, 613 | train, valid, test, best_train, best_valid, test_result) 614 | if opt.lr_decay > 0: 615 | optimizer.param_groups[0]['lr'] *= opt.lr_decay 616 | 617 | if valid_data is None: 618 | logger.info("best train ppl: {:.6f}.".format(best_train)) 619 | elif test_data is None: 620 | logger.info("best train ppl: {:.6f}, best valid ppl: {:.6f}.".format(best_train, best_valid)) 621 | else: 622 | logger.info("best train ppl: {:.6f}, best valid ppl: {:.6f}, test ppl: {:.6f}.".format(best_train, best_valid, test_result)) 623 | 624 | 625 | def test(): 626 | cmd = argparse.ArgumentParser('The testing components of') 627 | cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.') 628 | cmd.add_argument("--input", help="the path to the raw text file.") 629 | cmd.add_argument("--model", required=True, help="path to save model") 630 | cmd.add_argument("--batch_size", "--batch", type=int, default=1, help='the batch size.') 631 | args = cmd.parse_args(sys.argv[2:]) 632 | 633 | if args.gpu >= 0: 634 | torch.cuda.set_device(args.gpu) 635 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 636 | 637 | args2 = dict2namedtuple(json.load(codecs.open(os.path.join(args.model, 'config.json'), 'r', encoding='utf-8'))) 638 | 639 | with open(args2.config_path, 'r') as fin: 640 | config = json.load(fin) 641 | 642 | if config['token_embedder']['char_dim'] > 0: 643 | char_lexicon = {} 644 | with codecs.open(os.path.join(args.model, 'char.dic'), 'r', encoding='utf-8') as fpi: 645 | for line in fpi: 646 | tokens = line.strip().split('\t') 647 | if len(tokens) == 1: 648 | tokens.insert(0, '\u3000') 649 | token, i = tokens 650 | char_lexicon[token] = int(i) 651 | char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False) 652 | logger.info('char embedding size: ' + str(len(char_emb_layer.word2id))) 653 | else: 654 | char_lexicon = None 655 | char_emb_layer = None 656 | 657 | word_lexicon = {} 658 | with codecs.open(os.path.join(args.model, 'word.dic'), 'r', encoding='utf-8') as fpi: 659 | for line in fpi: 660 | tokens = line.strip().split('\t') 661 | if len(tokens) == 1: 662 | tokens.insert(0, '\u3000') 663 | token, i = tokens 664 | word_lexicon[token] = int(i) 665 | 666 | if config['token_embedder']['word_dim'] > 0: 667 | word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=None) 668 | logger.info('word embedding size: ' + str(len(word_emb_layer.word2id))) 669 | else: 670 | word_emb_layer = None 671 | 672 | model = Model(config, word_emb_layer, char_emb_layer, len(word_lexicon), use_cuda) 673 | 674 | if use_cuda: 675 | model.cuda() 676 | 677 | logger.info(str(model)) 678 | 
model.load_model(args.model) 679 | if config['token_embedder']['name'].lower() == 'cnn': 680 | test = read_corpus(args.input, config['token_embedder']['max_characters_per_token'], max_sent_len=10000) 681 | elif config['token_embedder']['name'].lower() == 'lstm': 682 | test = read_corpus(args.input, max_sent_len=10000) 683 | else: 684 | raise ValueError('') 685 | 686 | test_w, test_c, test_lens, test_masks = create_batches( 687 | test, args.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda) 688 | 689 | test_result = eval_model(model, (test_w, test_c, test_lens, test_masks)) 690 | 691 | logger.info("test_ppl={:.6f}".format(test_result)) 692 | 693 | 694 | if __name__ == "__main__": 695 | if len(sys.argv) > 1 and sys.argv[1] == 'train': 696 | train() 697 | elif len(sys.argv) > 1 and sys.argv[1] == 'test': 698 | test() 699 | else: 700 | print('Usage: {0} [train|test] [options]'.format(sys.argv[0]), file=sys.stderr) 701 | -------------------------------------------------------------------------------- /elmoformanylangs/configs/cnn_0_100_512_4096_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "encoder": { 3 | "name": "elmo", 4 | "projection_dim": 512, 5 | "cell_clip": 3, 6 | "proj_clip": 3, 7 | "dim": 4096, 8 | "n_layers": 2 9 | }, 10 | 11 | "token_embedder": { 12 | "name": "cnn", 13 | "activation": "relu", 14 | "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]], 15 | "n_highway": 2, 16 | "word_dim": 100, 17 | "char_dim": 50, 18 | "max_characters_per_token": 50 19 | }, 20 | 21 | "classifier": { 22 | "name": "sampled_softmax", 23 | "n_samples": 8192 24 | }, 25 | "dropout": 0.1 26 | } 27 | -------------------------------------------------------------------------------- /elmoformanylangs/configs/cnn_50_100_512_4096_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "encoder": { 3 | "name": "elmo", 4 | "projection_dim": 512, 5 | "cell_clip": 3, 6 | "proj_clip": 3, 7 | "dim": 4096, 8 | "n_layers": 2 9 | }, 10 | 11 | "token_embedder": { 12 | "name": "cnn", 13 | "activation": "relu", 14 | "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]], 15 | "n_highway": 2, 16 | "word_dim": 100, 17 | "char_dim": 50, 18 | "max_characters_per_token": 50 19 | }, 20 | 21 | "classifier": { 22 | "name": "sampled_softmax", 23 | "n_samples": 8192 24 | }, 25 | "dropout": 0.1 26 | } 27 | -------------------------------------------------------------------------------- /elmoformanylangs/dataloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import unicode_literals 3 | import codecs 4 | import numpy as np 5 | 6 | 7 | def pad(sequences, pad_token='', pad_left=False): 8 | """ 9 | input sequences is a list of text sequence [[str]] 10 | pad each text sequence to the length of the longest 11 | 12 | :param sequences: 13 | :param pad_token: 14 | :param pad_left: 15 | :return: 16 | """ 17 | # max_len = max(5,max(len(seq) for seq in sequences)) 18 | max_len = max(len(seq) for seq in sequences) 19 | if pad_left: 20 | return [[pad_token]*(max_len-len(seq)) + seq for seq in sequences] 21 | return [seq + [pad_token]*(max_len-len(seq)) for seq in sequences] 22 | 23 | 24 | def load_embedding_npz(path): 25 | data = np.load(path) 26 | return [str(w) for w in data['words']], data['vals'] 27 | 28 | 29 | def load_embedding_txt(path): 30 | words = [] 31 | 
vals = [] 32 | with codecs.open(path, 'r', encoding='utf-8') as fin: 33 | fin.readline() 34 | for line in fin: 35 | line = line.strip() 36 | if line: 37 | parts = line.split() 38 | words.append(parts[0]) 39 | vals += [float(x) for x in parts[1:]] # equal to append 40 | return words, np.asarray(vals).reshape(len(words), -1) # reshape 41 | 42 | 43 | def load_embedding(path): 44 | if path.endswith(".npz"): 45 | return load_embedding_npz(path) 46 | else: 47 | return load_embedding_txt(path) 48 | -------------------------------------------------------------------------------- /elmoformanylangs/elmo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | import os 5 | import codecs 6 | import random 7 | import logging 8 | import json 9 | import torch 10 | from .modules.embedding_layer import EmbeddingLayer 11 | from .utils import dict2namedtuple 12 | from .frontend import create_one_batch 13 | from .frontend import Model 14 | import numpy as np 15 | 16 | logger = logging.getLogger('elmoformanylangs') 17 | 18 | 19 | def read_list(sents, max_chars=None): 20 | """ 21 | read raw text file. The format of the input is like, one sentence per line 22 | words are separated by '\t' 23 | 24 | :param path: 25 | :param max_chars: int, the number of maximum characters in a word, this 26 | parameter is used when the model is configured with CNN word encoder. 27 | :return: 28 | """ 29 | dataset = [] 30 | textset = [] 31 | for sent in sents: 32 | data = [''] 33 | text = [] 34 | for token in sent: 35 | text.append(token) 36 | if max_chars is not None and len(token) + 2 > max_chars: 37 | token = token[:max_chars - 2] 38 | data.append(token) 39 | data.append('') 40 | dataset.append(data) 41 | textset.append(text) 42 | return dataset, textset 43 | 44 | 45 | def recover(li, ind): 46 | # li[piv], ind = torch.sort(li[piv], dim=0, descending=(not unsort)) 47 | dummy = list(range(len(ind))) 48 | dummy.sort(key=lambda l: ind[l]) 49 | li = [li[i] for i in dummy] 50 | return li 51 | 52 | 53 | # shuffle training examples and create mini-batches 54 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=False, sort=True, text=None): 55 | ind = list(range(len(x))) 56 | lst = perm or list(range(len(x))) 57 | if shuffle: 58 | random.shuffle(lst) 59 | 60 | if sort: 61 | lst.sort(key=lambda l: -len(x[l])) 62 | 63 | x = [x[i] for i in lst] 64 | ind = [ind[i] for i in lst] 65 | if text is not None: 66 | text = [text[i] for i in lst] 67 | 68 | sum_len = 0.0 69 | batches_w, batches_c, batches_lens, batches_masks, batches_text, batches_ind = [], [], [], [], [], [] 70 | size = batch_size 71 | nbatch = (len(x) - 1) // size + 1 72 | for i in range(nbatch): 73 | start_id, end_id = i * size, (i + 1) * size 74 | bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort) 75 | sum_len += sum(blens) 76 | batches_w.append(bw) 77 | batches_c.append(bc) 78 | batches_lens.append(blens) 79 | batches_masks.append(bmasks) 80 | batches_ind.append(ind[start_id: end_id]) 81 | if text is not None: 82 | batches_text.append(text[start_id: end_id]) 83 | 84 | if sort: 85 | perm = list(range(nbatch)) 86 | random.shuffle(perm) 87 | batches_w = [batches_w[i] for i in perm] 88 | batches_c = [batches_c[i] for i in perm] 89 | batches_lens = [batches_lens[i] for i in perm] 90 | batches_masks = [batches_masks[i] for i in perm] 91 | batches_ind = [batches_ind[i] 
for i in perm] 92 | if text is not None: 93 | batches_text = [batches_text[i] for i in perm] 94 | 95 | logger.info("{} batches, avg len: {:.1f}".format( 96 | nbatch, sum_len / len(x))) 97 | recover_ind = [item for sublist in batches_ind for item in sublist] 98 | if text is not None: 99 | return batches_w, batches_c, batches_lens, batches_masks, batches_text, recover_ind 100 | return batches_w, batches_c, batches_lens, batches_masks, recover_ind 101 | 102 | 103 | class Embedder(object): 104 | def __init__(self, model_dir, batch_size=64): 105 | self.model_dir = model_dir 106 | self.model, self.config = self.get_model() 107 | self.batch_size = batch_size 108 | 109 | def get_model(self): 110 | # torch.cuda.set_device(1) 111 | self.use_cuda = torch.cuda.is_available() 112 | # load the model configurations 113 | args2 = dict2namedtuple(json.load(codecs.open( 114 | os.path.join(self.model_dir, 'config.json'), 'r', encoding='utf-8'))) 115 | 116 | config_path = os.path.join(self.model_dir, args2.config_path) 117 | # Some of the available models may have the config in the 118 | # model dir, but the path given in the config directory was an 119 | # absolute path. 120 | if not os.path.exists(config_path): 121 | config_path = os.path.join(self.model_dir, 122 | os.path.split(config_path)[1]) 123 | logger.warning("Could not find config. Trying " + config_path) 124 | # In many cases, such as the publicly available English model, 125 | # the config is one of the default provided configs in 126 | # elmoformanylangs/configs 127 | if not os.path.exists(config_path): 128 | config_path = os.path.join(os.path.split(__file__)[0], "configs", 129 | os.path.split(config_path)[1]) 130 | logger.warning("Could not find config. Trying " + config_path) 131 | 132 | if not os.path.exists(config_path): 133 | raise FileNotFoundError("Could not find the model config in either the model directory " 134 | "or the default configs. Path in config file: %s" % args2.config_path) 135 | 136 | with open(config_path, 'r') as fin: 137 | config = json.load(fin) 138 | 139 | # For the model trained with character-based word encoder. 140 | if config['token_embedder']['char_dim'] > 0: 141 | self.char_lexicon = {} 142 | with codecs.open(os.path.join(self.model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi: 143 | for line in fpi: 144 | tokens = line.strip().split('\t') 145 | if len(tokens) == 1: 146 | tokens.insert(0, '\u3000') 147 | token, i = tokens 148 | self.char_lexicon[token] = int(i) 149 | char_emb_layer = EmbeddingLayer( 150 | config['token_embedder']['char_dim'], self.char_lexicon, fix_emb=False, embs=None) 151 | logger.info('char embedding size: ' + 152 | str(len(char_emb_layer.word2id))) 153 | else: 154 | self.char_lexicon = None 155 | char_emb_layer = None 156 | 157 | # For the model trained with word form word encoder. 
158 | if config['token_embedder']['word_dim'] > 0: 159 | self.word_lexicon = {} 160 | with codecs.open(os.path.join(self.model_dir, 'word.dic'), 'r', encoding='utf-8') as fpi: 161 | for line in fpi: 162 | tokens = line.strip().split('\t') 163 | if len(tokens) == 1: 164 | tokens.insert(0, '\u3000') 165 | token, i = tokens 166 | self.word_lexicon[token] = int(i) 167 | word_emb_layer = EmbeddingLayer( 168 | config['token_embedder']['word_dim'], self.word_lexicon, fix_emb=False, embs=None) 169 | logger.info('word embedding size: ' + 170 | str(len(word_emb_layer.word2id))) 171 | else: 172 | self.word_lexicon = None 173 | word_emb_layer = None 174 | 175 | # instantiate the model 176 | model = Model(config, word_emb_layer, char_emb_layer, self.use_cuda) 177 | 178 | if self.use_cuda: 179 | model.cuda() 180 | 181 | logger.info(str(model)) 182 | model.load_model(self.model_dir) 183 | 184 | # read test data according to input format 185 | 186 | # configure the model to evaluation mode. 187 | model.eval() 188 | return model, config 189 | 190 | def sents2elmo(self, sents, output_layer=-1): 191 | read_function = read_list 192 | 193 | if self.config['token_embedder']['name'].lower() == 'cnn': 194 | test, text = read_function(sents, self.config['token_embedder']['max_characters_per_token']) 195 | else: 196 | test, text = read_function(sents) 197 | 198 | # create test batches from the input data. 199 | test_w, test_c, test_lens, test_masks, test_text, recover_ind = create_batches( 200 | test, self.batch_size, self.word_lexicon, self.char_lexicon, self.config, text=text) 201 | 202 | cnt = 0 203 | 204 | after_elmo = [] 205 | for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks, test_text): 206 | output = self.model.forward(w, c, masks) 207 | for i, text in enumerate(texts): 208 | 209 | if self.config['encoder']['name'].lower() == 'lstm': 210 | data = output[i, 1:lens[i]-1, :].data 211 | if self.use_cuda: 212 | data = data.cpu() 213 | data = data.numpy() 214 | elif self.config['encoder']['name'].lower() == 'elmo': 215 | data = output[:, i, 1:lens[i]-1, :].data 216 | if self.use_cuda: 217 | data = data.cpu() 218 | data = data.numpy() 219 | 220 | if output_layer == -1: 221 | payload = np.average(data, axis=0) 222 | elif output_layer == -2: 223 | payload = data 224 | else: 225 | payload = data[output_layer] 226 | after_elmo.append(payload) 227 | 228 | cnt += 1 229 | if cnt % 1000 == 0: 230 | logger.info('Finished {0} sentences.'.format(cnt)) 231 | 232 | after_elmo = recover(after_elmo, recover_ind) 233 | return after_elmo 234 | -------------------------------------------------------------------------------- /elmoformanylangs/frontend.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import logging 7 | from torch.autograd import Variable 8 | from .modules.elmo import ElmobiLm 9 | from .modules.lstm import LstmbiLm 10 | from .modules.token_embedder import ConvTokenEmbedder, LstmTokenEmbedder 11 | 12 | logger = logging.getLogger('elmoformanylangs') 13 | 14 | def create_one_batch(x, word2id, char2id, config, oov='', pad='', sort=True): 15 | """ 16 | Create one batch of input. 17 | 18 | :param x: List[List[str]] 19 | :param word2id: Dict | None 20 | :param char2id: Dict | None 21 | :param config: Dict 22 | :param oov: str, the form of OOV token. 23 | :param pad: str, the form of padding token. 
24 |     :param sort: bool, specify whether to sort the sentences by their lengths.
25 |     :return:
26 |     """
27 |     batch_size = len(x)
28 |     # lst represents the order of sentences
29 |     lst = list(range(batch_size))
30 |     if sort:
31 |         lst.sort(key=lambda l: -len(x[l]))
32 | 
33 |     # reorder the sentences according to lst
34 |     x = [x[i] for i in lst]
35 |     lens = [len(x[i]) for i in lst]
36 |     max_len = max(lens)
37 | 
38 |     # get a batch of word id whose size is (batch x max_len)
39 |     if word2id is not None:
40 |         oov_id, pad_id = word2id.get(oov, None), word2id.get(pad, None)
41 |         assert oov_id is not None and pad_id is not None
42 |         batch_w = torch.LongTensor(batch_size, max_len).fill_(pad_id)
43 |         for i, x_i in enumerate(x):
44 |             for j, x_ij in enumerate(x_i):
45 |                 batch_w[i][j] = word2id.get(x_ij, oov_id)
46 |     else:
47 |         batch_w = None
48 | 
49 |     # get a batch of character id whose size is (batch x max_len x max_chars)
50 |     if char2id is not None:
51 |         bow_id, eow_id, oov_id, pad_id = [char2id.get(key, None) for key in ('<eow>', '<bow>', oov, pad)]
52 | 
53 |         assert bow_id is not None and eow_id is not None and oov_id is not None and pad_id is not None
54 | 
55 |         if config['token_embedder']['name'].lower() == 'cnn':
56 |             max_chars = config['token_embedder']['max_characters_per_token']
57 |             assert max([len(w) for i in lst for w in x[i]]) + 2 <= max_chars
58 |         elif config['token_embedder']['name'].lower() == 'lstm':
59 |             # counting the <bow> and <eow>
60 |             max_chars = max([len(w) for i in lst for w in x[i]]) + 2
61 |         else:
62 |             raise ValueError('Unknown token_embedder: {0}'.format(config['token_embedder']['name']))
63 | 
64 |         batch_c = torch.LongTensor(batch_size, max_len, max_chars).fill_(pad_id)
65 | 
66 |         for i, x_i in enumerate(x):
67 |             for j, x_ij in enumerate(x_i):
68 |                 batch_c[i][j][0] = bow_id
69 |                 if x_ij == '<bos>' or x_ij == '<eos>':
70 |                     batch_c[i][j][1] = char2id.get(x_ij)
71 |                     batch_c[i][j][2] = eow_id
72 |                 else:
73 |                     for k, c in enumerate(x_ij):
74 |                         batch_c[i][j][k + 1] = char2id.get(c, oov_id)
75 |                     batch_c[i][j][len(x_ij) + 1] = eow_id
76 |     else:
77 |         batch_c = None
78 | 
79 |     # mask[0] is the (batch x max_len) matrix indicating whether the id at
80 |     # each position is valid (i.e. not a padding) in this batch.
81 | # mask[1] stores the flattened ids indicating whether there is a valid 82 | # previous token 83 | # mask[2] stores the flattened ids indicating whether there is a valid 84 | # next token 85 | masks = [torch.LongTensor(batch_size, max_len).fill_(0), [], []] 86 | 87 | for i, x_i in enumerate(x): 88 | for j in range(len(x_i)): 89 | masks[0][i][j] = 1 90 | if j + 1 < len(x_i): 91 | masks[1].append(i * max_len + j) 92 | if j > 0: 93 | masks[2].append(i * max_len + j) 94 | 95 | assert len(masks[1]) <= batch_size * max_len 96 | assert len(masks[2]) <= batch_size * max_len 97 | 98 | masks[1] = torch.LongTensor(masks[1]) 99 | masks[2] = torch.LongTensor(masks[2]) 100 | 101 | return batch_w, batch_c, lens, masks 102 | 103 | 104 | # shuffle training examples and create mini-batches 105 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=True, sort=True, text=None): 106 | """ 107 | 108 | :param x: List[List[str]] 109 | :param batch_size: 110 | :param word2id: 111 | :param char2id: 112 | :param config: 113 | :param perm: 114 | :param shuffle: 115 | :param sort: 116 | :param text: 117 | :return: 118 | """ 119 | lst = perm or list(range(len(x))) 120 | if shuffle: 121 | random.shuffle(lst) 122 | 123 | if sort: 124 | lst.sort(key=lambda l: -len(x[l])) 125 | 126 | x = [x[i] for i in lst] 127 | if text is not None: 128 | text = [text[i] for i in lst] 129 | 130 | sum_len = 0.0 131 | batches_w, batches_c, batches_lens, batches_masks, batches_text = [], [], [], [], [] 132 | size = batch_size 133 | nbatch = (len(x) - 1) // size + 1 134 | for i in range(nbatch): 135 | start_id, end_id = i * size, (i + 1) * size 136 | bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort) 137 | sum_len += sum(blens) 138 | batches_w.append(bw) 139 | batches_c.append(bc) 140 | batches_lens.append(blens) 141 | batches_masks.append(bmasks) 142 | if text is not None: 143 | batches_text.append(text[start_id: end_id]) 144 | 145 | if sort: 146 | perm = list(range(nbatch)) 147 | random.shuffle(perm) 148 | batches_w = [batches_w[i] for i in perm] 149 | batches_c = [batches_c[i] for i in perm] 150 | batches_lens = [batches_lens[i] for i in perm] 151 | batches_masks = [batches_masks[i] for i in perm] 152 | if text is not None: 153 | batches_text = [batches_text[i] for i in perm] 154 | 155 | logger.info("{} batches, avg len: {:.1f}".format(nbatch, sum_len / len(x))) 156 | if text is not None: 157 | return batches_w, batches_c, batches_lens, batches_masks, batches_text 158 | return batches_w, batches_c, batches_lens, batches_masks 159 | 160 | 161 | class Model(nn.Module): 162 | def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda=False): 163 | super(Model, self).__init__() 164 | self.use_cuda = use_cuda 165 | self.config = config 166 | 167 | if config['token_embedder']['name'].lower() == 'cnn': 168 | self.token_embedder = ConvTokenEmbedder( 169 | config, word_emb_layer, char_emb_layer, use_cuda) 170 | elif config['token_embedder']['name'].lower() == 'lstm': 171 | self.token_embedder = LstmTokenEmbedder( 172 | config, word_emb_layer, char_emb_layer, use_cuda) 173 | 174 | if config['encoder']['name'].lower() == 'elmo': 175 | self.encoder = ElmobiLm(config, use_cuda) 176 | elif config['encoder']['name'].lower() == 'lstm': 177 | self.encoder = LstmbiLm(config, use_cuda) 178 | 179 | self.output_dim = config['encoder']['projection_dim'] 180 | 181 | def forward(self, word_inp, chars_package, mask_package): 182 | """ 183 | 184 | :param word_inp: 185 | :param 
chars_package: 186 | :param mask_package: 187 | :return: 188 | """ 189 | token_embedding = self.token_embedder(word_inp, chars_package, (mask_package[0].size(0), mask_package[0].size(1))) 190 | if self.config['encoder']['name'] == 'elmo': 191 | mask = Variable(mask_package[0]).cuda() if self.use_cuda else Variable(mask_package[0]) 192 | encoder_output = self.encoder(token_embedding, mask) 193 | sz = encoder_output.size() 194 | token_embedding = torch.cat( 195 | [token_embedding, token_embedding], dim=2).view(1, sz[1], sz[2], sz[3]) 196 | encoder_output = torch.cat( 197 | [token_embedding, encoder_output], dim=0) 198 | elif self.config['encoder']['name'] == 'lstm': 199 | encoder_output = self.encoder(token_embedding) 200 | else: 201 | raise ValueError('Unknown encoder: {0}'.format(self.config['encoder']['name'])) 202 | 203 | return encoder_output 204 | 205 | def load_model(self, path): 206 | self.token_embedder.load_state_dict(torch.load(os.path.join(path, 'token_embedder.pkl'), 207 | map_location=lambda storage, loc: storage)) 208 | self.encoder.load_state_dict(torch.load(os.path.join(path, 'encoder.pkl'), 209 | map_location=lambda storage, loc: storage)) 210 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/ELMoForManyLangs/b3de5f1dc1ac13638a930b49c41e1f1e0e185ca1/elmoformanylangs/modules/__init__.py -------------------------------------------------------------------------------- /elmoformanylangs/modules/classify_layer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | 8 | class SoftmaxLayer(nn.Module): 9 | """ Naive softmax-layer """ 10 | def __init__(self, output_dim, n_class): 11 | """ 12 | 13 | :param output_dim: int 14 | :param n_class: int 15 | """ 16 | super(SoftmaxLayer, self).__init__() 17 | self.hidden2tag = nn.Linear(output_dim, n_class) 18 | self.criterion = nn.CrossEntropyLoss(size_average=False) 19 | 20 | def forward(self, x, y): 21 | """ 22 | 23 | :param x: torch.Tensor 24 | :param y: torch.Tensor 25 | :return: 26 | """ 27 | tag_scores = self.hidden2tag(x) 28 | return self.criterion(tag_scores, y) 29 | 30 | 31 | class SampledSoftmaxLayer(nn.Module): 32 | """ 33 | 34 | """ 35 | def __init__(self, output_dim, n_class, n_samples, use_cuda): 36 | """ 37 | 38 | :param output_dim: 39 | :param n_class: 40 | :param n_samples: 41 | :param use_cuda: 42 | """ 43 | super(SampledSoftmaxLayer, self).__init__() 44 | self.n_samples = n_samples 45 | self.n_class = n_class 46 | self.use_cuda = use_cuda 47 | self.criterion = nn.CrossEntropyLoss(size_average=False) 48 | self.negative_samples = [] 49 | self.word_to_column = {0: 0} 50 | 51 | self.all_word = [] 52 | self.all_word_to_column = {0: 0} 53 | 54 | self.column_emb = nn.Embedding(n_class, output_dim) 55 | self.column_emb.weight.data.uniform_(-0.25, 0.25) 56 | 57 | self.column_bias = nn.Embedding(n_class, 1) 58 | self.column_bias.weight.data.uniform_(-0.25, 0.25) 59 | 60 | self.oov_column = nn.Parameter(torch.Tensor(output_dim, 1)) 61 | self.oov_column.data.uniform_(-0.25, 0.25) 62 | 63 | def forward(self, x, y): 64 | if self.training: 65 | for i in range(y.size(0)): 66 | y[i] = self.word_to_column.get(y[i].tolist()) 67 | samples = 
torch.LongTensor(len(self.word_to_column)).fill_(0) 68 | for word in self.negative_samples: 69 | samples[self.word_to_column[word]] = word 70 | else: 71 | for i in range(y.size(0)): 72 | y[i] = self.all_word_to_column.get(y[i].tolist(), 0) 73 | samples = torch.LongTensor(len(self.all_word_to_column)).fill_(0) 74 | for word in self.all_word: 75 | samples[self.all_word_to_column[word]] = word 76 | 77 | if self.use_cuda: 78 | samples = samples.cuda() 79 | 80 | tag_scores = (x.matmul(self.embedding_matrix)).view(y.size(0), -1) + \ 81 | (self.column_bias.forward(samples)).view(1, -1) 82 | return self.criterion(tag_scores, y) 83 | 84 | def update_embedding_matrix(self): 85 | word_inp, chars_inp = [], [] 86 | if self.training: 87 | columns = torch.LongTensor(len(self.negative_samples) + 1) 88 | samples = self.negative_samples 89 | for i, word in enumerate(samples): 90 | columns[self.word_to_column[word]] = word 91 | columns[0] = 0 92 | else: 93 | columns = torch.LongTensor(len(self.all_word) + 1) 94 | samples = self.all_word 95 | for i, word in enumerate(samples): 96 | columns[self.all_word_to_column[word]] = word 97 | columns[0] = 0 98 | 99 | if self.use_cuda: 100 | columns = columns.cuda() 101 | self.embedding_matrix = self.column_emb.forward(columns).transpose(0, 1) 102 | 103 | def update_negative_samples(self, word_inp, chars_inp, mask): 104 | batch_size, seq_len = word_inp.size(0), word_inp.size(1) 105 | in_batch = set() 106 | for i in range(batch_size): 107 | for j in range(seq_len): 108 | if mask[i][j] == 0: 109 | continue 110 | word = word_inp[i][j].tolist() 111 | in_batch.add(word) 112 | for i in range(batch_size): 113 | for j in range(seq_len): 114 | if mask[i][j] == 0: 115 | continue 116 | word = word_inp[i][j].tolist() 117 | if word not in self.all_word_to_column: 118 | self.all_word.append(word) 119 | self.all_word_to_column[word] = len(self.all_word_to_column) 120 | 121 | if word not in self.word_to_column: 122 | if len(self.negative_samples) < self.n_samples: 123 | self.negative_samples.append(word) 124 | self.word_to_column[word] = len(self.word_to_column) 125 | else: 126 | while self.negative_samples[0] in in_batch: 127 | self.negative_samples = self.negative_samples[1:] + [self.negative_samples[0]] 128 | self.word_to_column[word] = self.word_to_column.pop(self.negative_samples[0]) 129 | self.negative_samples = self.negative_samples[1:] + [word] 130 | 131 | 132 | class CNNSoftmaxLayer(nn.Module): 133 | def __init__(self, token_embedder, output_dim, n_class, n_samples, corr_dim, use_cuda): 134 | super(CNNSoftmaxLayer, self).__init__() 135 | self.token_embedder = token_embedder 136 | self.n_samples = n_samples 137 | self.use_cuda = use_cuda 138 | self.criterion = nn.CrossEntropyLoss(size_average=False) 139 | self.negative_samples = [] 140 | self.word_to_column = {0: 0} 141 | 142 | self.all_word = [] 143 | self.all_word_to_column = {0: 0} 144 | 145 | self.M = nn.Parameter(torch.Tensor(output_dim, corr_dim)) 146 | stdv = 1. 
/ math.sqrt(self.M.size(1)) 147 | self.M.data.uniform_(-stdv, stdv) 148 | 149 | self.corr = nn.Embedding(n_class, corr_dim) 150 | self.corr.weight.data.uniform_(-0.25, 0.25) 151 | 152 | self.oov_column = nn.Parameter(torch.Tensor(output_dim, 1)) 153 | self.oov_column.data.uniform_(-0.25, 0.25) 154 | 155 | def forward(self, x, y): 156 | if self.training: 157 | for i in range(y.size(0)): 158 | y[i] = self.word_to_column.get(y[i].tolist()) 159 | samples = torch.LongTensor(len(self.word_to_column)).fill_(0) 160 | for package in self.negative_samples: 161 | samples[self.word_to_column[package[0]]] = package[0] 162 | else: 163 | for i in range(y.size(0)): 164 | y[i] = self.all_word_to_column.get(y[i].tolist(), 0) 165 | samples = torch.LongTensor(len(self.all_word_to_column)).fill_(0) 166 | for package in self.all_word: 167 | samples[self.all_word_to_column[package[0]]] = package[0] 168 | 169 | if self.use_cuda: 170 | samples = samples.cuda() 171 | 172 | tag_scores = (x.matmul(self.embedding_matrix)).view(y.size(0), -1) + \ 173 | (x.matmul(self.M).matmul(self.corr.forward(samples).transpose(0, 1))).view(y.size(0), -1) 174 | return self.criterion(tag_scores, y) 175 | 176 | def update_embedding_matrix(self): 177 | batch_size = 2048 178 | word_inp, chars_inp = [], [] 179 | if self.training: 180 | sub_matrices = [self.oov_column] 181 | samples = self.negative_samples 182 | id2pack = {} 183 | for i, package in enumerate(samples): 184 | id2pack[self.word_to_column[package[0]]] = i 185 | else: 186 | sub_matrices = [self.oov_column] 187 | samples = self.all_word 188 | id2pack = {} 189 | for i, package in enumerate(samples): 190 | id2pack[self.all_word_to_column[package[0]]] = i 191 | 192 | for i in range(len(samples)): 193 | # [n_samples, 1], [n_samples, 1, x], [n_samples, 1] 194 | word_inp.append(samples[id2pack[i + 1]][0]) 195 | chars_inp.append(samples[id2pack[i + 1]][1]) 196 | if len(word_inp) == batch_size or i == len(samples) - 1: 197 | sub_matrices.append(self.token_embedder.forward(torch.LongTensor(word_inp).view(len(word_inp), 1), 198 | None if chars_inp[0] is None else torch.LongTensor(chars_inp).view(len(word_inp), 1, len(package[1])), 199 | (len(word_inp), 1)).squeeze(1).transpose(0, 1)) 200 | if not self.training: 201 | sub_matrices[-1] = sub_matrices[-1].detach() 202 | word_inp, chars_inp = [], [] 203 | 204 | sum = 0 205 | for mat in sub_matrices: 206 | sum += mat.size(1) 207 | #print(sum, len(self.word_to_column)) 208 | self.embedding_matrix = torch.cat(sub_matrices, dim=1) 209 | 210 | def update_negative_samples(self, word_inp, chars_inp, mask): 211 | batch_size, seq_len = word_inp.size(0), word_inp.size(1) 212 | in_batch = set() 213 | for i in range(batch_size): 214 | for j in range(seq_len): 215 | if mask[i][j] == 0: 216 | continue 217 | word = word_inp[i][j].tolist() 218 | in_batch.add(word) 219 | for i in range(batch_size): 220 | for j in range(seq_len): 221 | if mask[i][j] == 0: 222 | continue 223 | package = (word_inp[i][j].tolist(), None if chars_inp is None else chars_inp[i][j].tolist()) 224 | if package[0] not in self.all_word_to_column: 225 | self.all_word.append(package) 226 | self.all_word_to_column[package[0]] = len(self.all_word_to_column) 227 | 228 | if package[0] not in self.word_to_column: 229 | if len(self.negative_samples) < self.n_samples: 230 | self.negative_samples.append(package) 231 | self.word_to_column[package[0]] = len(self.word_to_column) 232 | else: 233 | while self.negative_samples[0][0] in in_batch: 234 | self.negative_samples = self.negative_samples[1:] + 
[self.negative_samples[0]] 235 | self.word_to_column[package[0]] = self.word_to_column.pop(self.negative_samples[0][0]) 236 | self.negative_samples = self.negative_samples[1:] + [package] 237 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/elmo.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple, List, Callable, Union 2 | 3 | import h5py 4 | import numpy 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence, pack_padded_sequence 9 | from torch.autograd import Variable 10 | 11 | from .encoder_base import _EncoderBase 12 | from .lstm_cell_with_projection import LstmCellWithProjection 13 | 14 | RnnState = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] # pylint: disable=invalid-name 15 | RnnStateStorage = Tuple[torch.Tensor, ...] # pylint: disable=invalid-name 16 | 17 | 18 | class ElmobiLm(_EncoderBase): 19 | def __init__(self, config, use_cuda=False): 20 | super(ElmobiLm, self).__init__(stateful=True) 21 | self.config = config 22 | self.use_cuda = use_cuda 23 | input_size = config['encoder']['projection_dim'] 24 | hidden_size = config['encoder']['projection_dim'] 25 | cell_size = config['encoder']['dim'] 26 | num_layers = config['encoder']['n_layers'] 27 | memory_cell_clip_value = config['encoder']['cell_clip'] 28 | state_projection_clip_value = config['encoder']['proj_clip'] 29 | recurrent_dropout_probability = config['dropout'] 30 | 31 | self.input_size = input_size 32 | self.hidden_size = hidden_size 33 | self.num_layers = num_layers 34 | self.cell_size = cell_size 35 | 36 | forward_layers = [] 37 | backward_layers = [] 38 | 39 | lstm_input_size = input_size 40 | go_forward = True 41 | for layer_index in range(num_layers): 42 | forward_layer = LstmCellWithProjection(lstm_input_size, 43 | hidden_size, 44 | cell_size, 45 | go_forward, 46 | recurrent_dropout_probability, 47 | memory_cell_clip_value, 48 | state_projection_clip_value) 49 | backward_layer = LstmCellWithProjection(lstm_input_size, 50 | hidden_size, 51 | cell_size, 52 | not go_forward, 53 | recurrent_dropout_probability, 54 | memory_cell_clip_value, 55 | state_projection_clip_value) 56 | lstm_input_size = hidden_size 57 | 58 | self.add_module('forward_layer_{}'.format(layer_index), forward_layer) 59 | self.add_module('backward_layer_{}'.format(layer_index), backward_layer) 60 | forward_layers.append(forward_layer) 61 | backward_layers.append(backward_layer) 62 | self.forward_layers = forward_layers 63 | self.backward_layers = backward_layers 64 | 65 | def forward(self, inputs, mask): 66 | batch_size, total_sequence_length = mask.size() 67 | stacked_sequence_output, final_states, restoration_indices = \ 68 | self.sort_and_run_forward(self._lstm_forward, inputs, mask) 69 | 70 | num_layers, num_valid, returned_timesteps, encoder_dim = stacked_sequence_output.size() 71 | # Add back invalid rows which were removed in the call to sort_and_run_forward. 72 | if num_valid < batch_size: 73 | zeros = stacked_sequence_output.data.new(num_layers, 74 | batch_size - num_valid, 75 | returned_timesteps, 76 | encoder_dim).fill_(0) 77 | zeros = Variable(zeros) 78 | stacked_sequence_output = torch.cat([stacked_sequence_output, zeros], 1) 79 | 80 | # The states also need to have invalid rows added back. 
81 | new_states = [] 82 | for state in final_states: 83 | state_dim = state.size(-1) 84 | zeros = state.data.new(num_layers, batch_size - num_valid, state_dim).fill_(0) 85 | zeros = Variable(zeros) 86 | new_states.append(torch.cat([state, zeros], 1)) 87 | final_states = new_states 88 | 89 | # It's possible to need to pass sequences which are padded to longer than the 90 | # max length of the sequence to a Seq2StackEncoder. However, packing and unpacking 91 | # the sequences mean that the returned tensor won't include these dimensions, because 92 | # the RNN did not need to process them. We add them back on in the form of zeros here. 93 | sequence_length_difference = total_sequence_length - returned_timesteps 94 | if sequence_length_difference > 0: 95 | zeros = stacked_sequence_output.data.new(num_layers, 96 | batch_size, 97 | sequence_length_difference, 98 | stacked_sequence_output[0].size(-1)).fill_(0) 99 | zeros = Variable(zeros) 100 | stacked_sequence_output = torch.cat([stacked_sequence_output, zeros], 2) 101 | 102 | self._update_states(final_states, restoration_indices) 103 | 104 | # Restore the original indices and return the sequence. 105 | # Has shape (num_layers, batch_size, sequence_length, hidden_size) 106 | return stacked_sequence_output.index_select(1, restoration_indices) 107 | 108 | 109 | def _lstm_forward(self, 110 | inputs: PackedSequence, 111 | initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> \ 112 | Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 113 | """ 114 | Parameters 115 | ---------- 116 | inputs : ``PackedSequence``, required. 117 | A batch first ``PackedSequence`` to run the stacked LSTM over. 118 | initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None) 119 | A tuple (state, memory) representing the initial hidden state and memory 120 | of the LSTM, with shape (num_layers, batch_size, 2 * hidden_size) and 121 | (num_layers, batch_size, 2 * cell_size) respectively. 122 | Returns 123 | ------- 124 | output_sequence : ``torch.FloatTensor`` 125 | The encoded sequence of shape (num_layers, batch_size, sequence_length, hidden_size) 126 | final_states: ``Tuple[torch.FloatTensor, torch.FloatTensor]`` 127 | The per-layer final (state, memory) states of the LSTM, with shape 128 | (num_layers, batch_size, 2 * hidden_size) and (num_layers, batch_size, 2 * cell_size) 129 | respectively. The last dimension is duplicated because it contains the state/memory 130 | for both the forward and backward layers. 
131 | """ 132 | 133 | if initial_state is None: 134 | hidden_states: List[Optional[Tuple[torch.Tensor, 135 | torch.Tensor]]] = [None] * len(self.forward_layers) 136 | elif initial_state[0].size()[0] != len(self.forward_layers): 137 | raise Exception("Initial states were passed to forward() but the number of " 138 | "initial states does not match the number of layers.") 139 | else: 140 | hidden_states = list(zip(initial_state[0].split(1, 0), initial_state[1].split(1, 0))) 141 | 142 | inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True) 143 | forward_output_sequence = inputs 144 | backward_output_sequence = inputs 145 | 146 | final_states = [] 147 | sequence_outputs = [] 148 | for layer_index, state in enumerate(hidden_states): 149 | forward_layer = getattr(self, 'forward_layer_{}'.format(layer_index)) 150 | backward_layer = getattr(self, 'backward_layer_{}'.format(layer_index)) 151 | 152 | forward_cache = forward_output_sequence 153 | backward_cache = backward_output_sequence 154 | 155 | if state is not None: 156 | forward_hidden_state, backward_hidden_state = state[0].split(self.hidden_size, 2) 157 | forward_memory_state, backward_memory_state = state[1].split(self.cell_size, 2) 158 | forward_state = (forward_hidden_state, forward_memory_state) 159 | backward_state = (backward_hidden_state, backward_memory_state) 160 | else: 161 | forward_state = None 162 | backward_state = None 163 | 164 | forward_output_sequence, forward_state = forward_layer(forward_output_sequence, 165 | batch_lengths, 166 | forward_state) 167 | backward_output_sequence, backward_state = backward_layer(backward_output_sequence, 168 | batch_lengths, 169 | backward_state) 170 | # Skip connections, just adding the input to the output. 171 | if layer_index != 0: 172 | forward_output_sequence += forward_cache 173 | backward_output_sequence += backward_cache 174 | 175 | sequence_outputs.append(torch.cat([forward_output_sequence, 176 | backward_output_sequence], -1)) 177 | # Append the state tuples in a list, so that we can return 178 | # the final states for all the layers. 179 | final_states.append((torch.cat([forward_state[0], backward_state[0]], -1), 180 | torch.cat([forward_state[1], backward_state[1]], -1))) 181 | 182 | stacked_sequence_outputs: torch.FloatTensor = torch.stack(sequence_outputs) 183 | # Stack the hidden state and memory for each layer into 2 tensors of shape 184 | # (num_layers, batch_size, hidden_size) and (num_layers, batch_size, cell_size) 185 | # respectively. 
186 | final_hidden_states, final_memory_states = zip(*final_states) 187 | final_state_tuple: Tuple[torch.FloatTensor, 188 | torch.FloatTensor] = (torch.cat(final_hidden_states, 0), 189 | torch.cat(final_memory_states, 0)) 190 | return stacked_sequence_outputs, final_state_tuple 191 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/embedding_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import logging 6 | 7 | logger = logging.getLogger('elmoformanylangs') 8 | 9 | 10 | class EmbeddingLayer(nn.Module): 11 | def __init__(self, n_d, word2id, embs=None, fix_emb=True, oov='', pad='', normalize=True): 12 | super(EmbeddingLayer, self).__init__() 13 | if embs is not None: 14 | embwords, embvecs = embs 15 | # for word in embwords: 16 | # assert word not in word2id, "Duplicate words in pre-trained embeddings" 17 | # word2id[word] = len(word2id) 18 | 19 | logger.info("{} pre-trained word embeddings loaded.".format(len(word2id))) 20 | if n_d != len(embvecs[0]): 21 | logger.warning("[WARNING] n_d ({}) != word vector size ({}). Use {} for embeddings.".format( 22 | n_d, len(embvecs[0]), len(embvecs[0]))) 23 | n_d = len(embvecs[0]) 24 | 25 | self.word2id = word2id 26 | self.id2word = {i: word for word, i in word2id.items()} 27 | self.n_V, self.n_d = len(word2id), n_d 28 | self.oovid = word2id[oov] 29 | self.padid = word2id[pad] 30 | self.embedding = nn.Embedding(self.n_V, n_d, padding_idx=self.padid) 31 | self.embedding.weight.data.uniform_(-0.25, 0.25) 32 | 33 | if embs is not None: 34 | weight = self.embedding.weight 35 | weight.data[:len(embwords)].copy_(torch.from_numpy(embvecs)) 36 | logger.info("embedding shape: {}".format(weight.size())) 37 | 38 | if normalize: 39 | weight = self.embedding.weight 40 | norms = weight.data.norm(2, 1) 41 | if norms.dim() == 1: 42 | norms = norms.unsqueeze(1) 43 | weight.data.div_(norms.expand_as(weight.data)) 44 | 45 | if fix_emb: 46 | self.embedding.weight.requires_grad = False 47 | 48 | def forward(self, input_): 49 | return self.embedding(input_) 50 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/encoder_base.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Union, Optional, Callable 2 | import torch 3 | from torch.autograd import Variable 4 | from torch.nn.utils.rnn import pack_padded_sequence, PackedSequence 5 | 6 | from .util import get_lengths_from_binary_sequence_mask, sort_batch_by_length 7 | 8 | # We have two types here for the state, because storing the state in something 9 | # which is Iterable (like a tuple, below), is helpful for internal manipulation 10 | # - however, the states are consumed as either Tensors or a Tuple of Tensors, so 11 | # returning them in this format is unhelpful. 12 | RnnState = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] # pylint: disable=invalid-name 13 | RnnStateStorage = Tuple[torch.Tensor, ...] # pylint: disable=invalid-name 14 | 15 | 16 | class _EncoderBase(torch.nn.Module): 17 | # pylint: disable=abstract-method 18 | """ 19 | This abstract class serves as a base for the 3 ``Encoder`` abstractions in AllenNLP. 
20 | - :class:`~allennlp.modules.seq2seq_encoders.Seq2SeqEncoders` 21 | - :class:`~allennlp.modules.seq2vec_encoders.Seq2VecEncoders` 22 | Additionally, this class provides functionality for sorting sequences by length 23 | so they can be consumed by Pytorch RNN classes, which require their inputs to be 24 | sorted by length. Finally, it also provides optional statefulness to all of it's 25 | subclasses by allowing the caching and retrieving of the hidden states of RNNs. 26 | """ 27 | def __init__(self, stateful: bool = False) -> None: 28 | super(_EncoderBase, self).__init__() 29 | self.stateful = stateful 30 | self._states: Optional[RnnStateStorage] = None 31 | 32 | def sort_and_run_forward(self, 33 | module: Callable[[PackedSequence, Optional[RnnState]], 34 | Tuple[Union[PackedSequence, torch.Tensor], RnnState]], 35 | inputs: torch.Tensor, 36 | mask: torch.Tensor, 37 | hidden_state: Optional[RnnState] = None): 38 | """ 39 | This function exists because Pytorch RNNs require that their inputs be sorted 40 | before being passed as input. As all of our Seq2xxxEncoders use this functionality, 41 | it is provided in a base class. This method can be called on any module which 42 | takes as input a ``PackedSequence`` and some ``hidden_state``, which can either be a 43 | tuple of tensors or a tensor. 44 | As all of our Seq2xxxEncoders have different return types, we return `sorted` 45 | outputs from the module, which is called directly. Additionally, we return the 46 | indices into the batch dimension required to restore the tensor to it's correct, 47 | unsorted order and the number of valid batch elements (i.e the number of elements 48 | in the batch which are not completely masked). This un-sorting and re-padding 49 | of the module outputs is left to the subclasses because their outputs have different 50 | types and handling them smoothly here is difficult. 51 | Parameters 52 | ---------- 53 | module : ``Callable[[PackedSequence, Optional[RnnState]], 54 | Tuple[Union[PackedSequence, torch.Tensor], RnnState]]``, required. 55 | A function to run on the inputs. In most cases, this is a ``torch.nn.Module``. 56 | inputs : ``torch.Tensor``, required. 57 | A tensor of shape ``(batch_size, sequence_length, embedding_size)`` representing 58 | the inputs to the Encoder. 59 | mask : ``torch.Tensor``, required. 60 | A tensor of shape ``(batch_size, sequence_length)``, representing masked and 61 | non-masked elements of the sequence for each element in the batch. 62 | hidden_state : ``Optional[RnnState]``, (default = None). 63 | A single tensor of shape (num_layers, batch_size, hidden_size) representing the 64 | state of an RNN with or a tuple of 65 | tensors of shapes (num_layers, batch_size, hidden_size) and 66 | (num_layers, batch_size, memory_size), representing the hidden state and memory 67 | state of an LSTM-like RNN. 68 | Returns 69 | ------- 70 | module_output : ``Union[torch.Tensor, PackedSequence]``. 71 | A Tensor or PackedSequence representing the output of the Pytorch Module. 72 | The batch size dimension will be equal to ``num_valid``, as sequences of zero 73 | length are clipped off before the module is called, as Pytorch cannot handle 74 | zero length sequences. 75 | final_states : ``Optional[RnnState]`` 76 | A Tensor representing the hidden state of the Pytorch Module. This can either 77 | be a single tensor of shape (num_layers, num_valid, hidden_size), for instance in 78 | the case of a GRU, or a tuple of tensors, such as those required for an LSTM. 
79 | restoration_indices : ``torch.LongTensor`` 80 | A tensor of shape ``(batch_size,)``, describing the re-indexing required to transform 81 | the outputs back to their original batch order. 82 | """ 83 | # In some circumstances you may have sequences of zero length. ``pack_padded_sequence`` 84 | # requires all sequence lengths to be > 0, so remove sequences of zero length before 85 | # calling self._module, then fill with zeros. 86 | 87 | # First count how many sequences are empty. 88 | batch_size = mask.size(0) 89 | num_valid = torch.sum(mask[:, 0]).int().item() 90 | 91 | sequence_lengths = get_lengths_from_binary_sequence_mask(mask) 92 | sorted_inputs, sorted_sequence_lengths, restoration_indices, sorting_indices =\ 93 | sort_batch_by_length(inputs, sequence_lengths) 94 | 95 | # Now create a PackedSequence with only the non-empty, sorted sequences. 96 | packed_sequence_input = pack_padded_sequence(sorted_inputs[:num_valid, :, :], 97 | sorted_sequence_lengths[:num_valid].data.tolist(), 98 | batch_first=True) 99 | # Prepare the initial states. 100 | if not self.stateful: 101 | if hidden_state is None: 102 | initial_states = hidden_state 103 | elif isinstance(hidden_state, tuple): 104 | initial_states = [state.index_select(1, sorting_indices)[:, :num_valid, :] 105 | for state in hidden_state] 106 | else: 107 | initial_states = hidden_state.index_select(1, sorting_indices)[:, :num_valid, :] 108 | 109 | else: 110 | initial_states = self._get_initial_states(batch_size, num_valid, sorting_indices) 111 | 112 | # Actually call the module on the sorted PackedSequence. 113 | module_output, final_states = module(packed_sequence_input, initial_states) 114 | 115 | return module_output, final_states, restoration_indices 116 | 117 | def _get_initial_states(self, 118 | batch_size: int, 119 | num_valid: int, 120 | sorting_indices: torch.LongTensor) -> Optional[RnnState]: 121 | """ 122 | Returns an initial state for use in an RNN. Additionally, this method handles 123 | the batch size changing across calls by mutating the state to append initial states 124 | for new elements in the batch. Finally, it also handles sorting the states 125 | with respect to the sequence lengths of elements in the batch and removing rows 126 | which are completely padded. Importantly, this `mutates` the state if the 127 | current batch size is larger than when it was previously called. 128 | Parameters 129 | ---------- 130 | batch_size : ``int``, required. 131 | The batch size can change size across calls to stateful RNNs, so we need 132 | to know if we need to expand or shrink the states before returning them. 133 | Expanded states will be set to zero. 134 | num_valid : ``int``, required. 135 | The batch may contain completely padded sequences which get removed before 136 | the sequence is passed through the encoder. We also need to clip these off 137 | of the state too. 138 | sorting_indices ``torch.LongTensor``, required. 139 | Pytorch RNNs take sequences sorted by length. When we return the states to be 140 | used for a given call to ``module.forward``, we need the states to match up to 141 | the sorted sequences, so before returning them, we sort the states using the 142 | same indices used to sort the sequences. 143 | Returns 144 | ------- 145 | This method has a complex return type because it has to deal with the first time it 146 | is called, when it has no state, and the fact that types of RNN have heterogeneous 147 | states. 
148 | If it is the first time the module has been called, it returns ``None``, regardless 149 | of the type of the ``Module``. 150 | Otherwise, for LSTMs, it returns a tuple of ``torch.Tensors`` with shape 151 | ``(num_layers, num_valid, state_size)`` and ``(num_layers, num_valid, memory_size)`` 152 | respectively, or for GRUs, it returns a single ``torch.Tensor`` of shape 153 | ``(num_layers, num_valid, state_size)``. 154 | """ 155 | # We don't know the state sizes the first time calling forward, 156 | # so we let the module define what it's initial hidden state looks like. 157 | if self._states is None: 158 | return None 159 | 160 | # Otherwise, we have some previous states. 161 | if batch_size > self._states[0].size(1): 162 | # This batch is larger than the all previous states. 163 | # If so, resize the states. 164 | num_states_to_concat = batch_size - self._states[0].size(1) 165 | resized_states = [] 166 | # state has shape (num_layers, batch_size, hidden_size) 167 | for state in self._states: 168 | # This _must_ be inside the loop because some 169 | # RNNs have states with different last dimension sizes. 170 | zeros = state.data.new(state.size(0), 171 | num_states_to_concat, 172 | state.size(2)).fill_(0) 173 | zeros = Variable(zeros) 174 | resized_states.append(torch.cat([state, zeros], 1)) 175 | self._states = tuple(resized_states) 176 | correctly_shaped_states = self._states 177 | 178 | elif batch_size < self._states[0].size(1): 179 | # This batch is smaller than the previous one. 180 | correctly_shaped_states = tuple(state[:, :batch_size, :] for state in self._states) 181 | else: 182 | correctly_shaped_states = self._states 183 | 184 | # At this point, our states are of shape (num_layers, batch_size, hidden_size). 185 | # However, the encoder uses sorted sequences and additionally removes elements 186 | # of the batch which are fully padded. We need the states to match up to these 187 | # sorted and filtered sequences, so we do that in the next two blocks before 188 | # returning the state/s. 189 | if len(self._states) == 1: 190 | # GRUs only have a single state. This `unpacks` it from the 191 | # tuple and returns the tensor directly. 192 | correctly_shaped_state = correctly_shaped_states[0] 193 | sorted_state = correctly_shaped_state.index_select(1, sorting_indices) 194 | return sorted_state[:, :num_valid, :] 195 | else: 196 | # LSTMs have a state tuple of (state, memory). 197 | sorted_states = [state.index_select(1, sorting_indices) 198 | for state in correctly_shaped_states] 199 | return tuple(state[:, :num_valid, :] for state in sorted_states) 200 | 201 | def _update_states(self, 202 | final_states: RnnStateStorage, 203 | restoration_indices: torch.LongTensor) -> None: 204 | """ 205 | After the RNN has run forward, the states need to be updated. 206 | This method just sets the state to the updated new state, performing 207 | several pieces of book-keeping along the way - namely, unsorting the 208 | states and ensuring that the states of completely padded sequences are 209 | not updated. Finally, it also detatches the state variable from the 210 | computational graph, such that the graph can be garbage collected after 211 | each batch iteration. 212 | Parameters 213 | ---------- 214 | final_states : ``RnnStateStorage``, required. 215 | The hidden states returned as output from the RNN. 216 | restoration_indices : ``torch.LongTensor``, required. 
217 | The indices that invert the sorting used in ``sort_and_run_forward`` 218 | to order the states with respect to the lengths of the sequences in 219 | the batch. 220 | """ 221 | # TODO(Mark): seems weird to sort here, but append zeros in the subclasses. 222 | # which way around is best? 223 | new_unsorted_states = [state.index_select(1, restoration_indices) 224 | for state in final_states] 225 | 226 | if self._states is None: 227 | # We don't already have states, so just set the 228 | # ones we receive to be the current state. 229 | self._states = tuple([torch.autograd.Variable(state.data) 230 | for state in new_unsorted_states]) 231 | else: 232 | # Now we've sorted the states back so that they correspond to the original 233 | # indices, we need to figure out what states we need to update, because if we 234 | # didn't use a state for a particular row, we want to preserve its state. 235 | # Thankfully, the rows which are all zero in the state correspond exactly 236 | # to those which aren't used, so we create masks of shape (new_batch_size,), 237 | # denoting which states were used in the RNN computation. 238 | current_state_batch_size = self._states[0].size(1) 239 | new_state_batch_size = final_states[0].size(1) 240 | # Masks for the unused states of shape (1, new_batch_size, 1) 241 | used_new_rows_mask = [(state[0, :, :].sum(-1) 242 | != 0.0).float().view(1, new_state_batch_size, 1) 243 | for state in new_unsorted_states] 244 | new_states = [] 245 | if current_state_batch_size > new_state_batch_size: 246 | # The new state is smaller than the old one, 247 | # so just update the indices which we used. 248 | for old_state, new_state, used_mask in zip(self._states, 249 | new_unsorted_states, 250 | used_new_rows_mask): 251 | # zero out all rows in the previous state 252 | # which _were_ used in the current state. 253 | masked_old_state = old_state[:, :new_state_batch_size, :] * (1 - used_mask) 254 | # The old state is larger, so update the relevant parts of it. 255 | old_state[:, :new_state_batch_size, :] = new_state + masked_old_state 256 | # Detatch the Variable. 257 | new_states.append(torch.autograd.Variable(old_state.data)) 258 | else: 259 | # The states are the same size, so we just have to 260 | # deal with the possibility that some rows weren't used. 261 | new_states = [] 262 | for old_state, new_state, used_mask in zip(self._states, 263 | new_unsorted_states, 264 | used_new_rows_mask): 265 | # zero out all rows which _were_ used in the current state. 266 | masked_old_state = old_state * (1 - used_mask) 267 | # The old state is larger, so update the relevant parts of it. 268 | new_state += masked_old_state 269 | # Detatch the Variable. 270 | new_states.append(torch.autograd.Variable(new_state.data)) 271 | 272 | # It looks like there should be another case handled here - when 273 | # the current_state_batch_size < new_state_batch_size. However, 274 | # this never happens, because the states themeselves are mutated 275 | # by appending zeros when calling _get_inital_states, meaning that 276 | # the new states are either of equal size, or smaller, in the case 277 | # that there are some unused elements (zero-length) for the RNN computation. 
278 | self._states = tuple(new_states) 279 | 280 | def reset_states(self): 281 | self._states = None 282 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/highway.py: -------------------------------------------------------------------------------- 1 | """ 2 | A `Highway layer `_ that does a gated combination of a linear 3 | transformation and a non-linear transformation of its input. 4 | """ 5 | 6 | from typing import Callable 7 | 8 | import torch 9 | from overrides import overrides 10 | 11 | 12 | class Highway(torch.nn.Module): 13 | """ 14 | A `Highway layer `_ does a gated combination of a linear 15 | transformation and a non-linear transformation of its input. :math:`y = g * x + (1 - g) * 16 | f(A(x))`, where :math:`A` is a linear transformation, :math:`f` is an element-wise 17 | non-linearity, and :math:`g` is an element-wise gate, computed as :math:`sigmoid(B(x))`. 18 | This module will apply a fixed number of highway layers to its input, returning the final 19 | result. 20 | Parameters 21 | ---------- 22 | input_dim : ``int`` 23 | The dimensionality of :math:`x`. We assume the input has shape ``(batch_size, 24 | input_dim)``. 25 | num_layers : ``int``, optional (default=``1``) 26 | The number of highway layers to apply to the input. 27 | activation : ``Callable[[torch.Tensor], torch.Tensor]``, optional (default=``torch.nn.functional.relu``) 28 | The non-linearity to use in the highway layers. 29 | """ 30 | def __init__(self, 31 | input_dim: int, 32 | num_layers: int = 1, 33 | activation: Callable[[torch.Tensor], torch.Tensor] = torch.nn.functional.relu) -> None: 34 | super(Highway, self).__init__() 35 | self._input_dim = input_dim 36 | self._layers = torch.nn.ModuleList([torch.nn.Linear(input_dim, input_dim * 2) 37 | for _ in range(num_layers)]) 38 | self._activation = activation 39 | for layer in self._layers: 40 | # We should bias the highway layer to just carry its input forward. We do that by 41 | # setting the bias on `B(x)` to be positive, because that means `g` will be biased to 42 | # be high, to we will carry the input forward. The bias on `B(x)` is the second half 43 | # of the bias vector in each Linear layer. 44 | layer.bias[input_dim:].data.fill_(1) 45 | 46 | @overrides 47 | def forward(self, inputs: torch.Tensor) -> torch.Tensor: # pylint: disable=arguments-differ 48 | current_input = inputs 49 | for layer in self._layers: 50 | projected_input = layer(current_input) 51 | linear_part = current_input 52 | # NOTE: if you modify this, think about whether you should modify the initialization 53 | # above, too. 
54 | nonlinear_part = projected_input[:, (0 * self._input_dim):(1 * self._input_dim)] 55 | gate = projected_input[:, (1 * self._input_dim):(2 * self._input_dim)] 56 | nonlinear_part = self._activation(nonlinear_part) 57 | gate = torch.sigmoid(gate) 58 | current_input = gate * linear_part + (1 - gate) * nonlinear_part 59 | return current_input 60 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/lstm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import unicode_literals 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import copy 8 | 9 | 10 | class LstmbiLm(nn.Module): 11 | def __init__(self, config, use_cuda=False): 12 | super(LstmbiLm, self).__init__() 13 | self.config = config 14 | self.use_cuda = use_cuda 15 | 16 | self.encoder = nn.LSTM(self.config['encoder']['projection_dim'], 17 | self.config['encoder']['dim'], 18 | num_layers=self.config['encoder']['n_layers'], 19 | bidirectional=True, 20 | batch_first=True, 21 | dropout=self.config['dropout']) 22 | self.projection = nn.Linear(self.config['encoder']['dim'], self.config['encoder']['projection_dim'], bias=True) 23 | 24 | def forward(self, inputs): 25 | forward, backward = self.encoder(inputs)[0].split(self.config['encoder']['dim'], 2) 26 | return torch.cat([self.projection(forward), self.projection(backward)], dim=2) 27 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/lstm_cell_with_projection.py: -------------------------------------------------------------------------------- 1 | """ 2 | An LSTM with Recurrent Dropout, a hidden_state which is projected and 3 | clipping on both the hidden state and the memory state of the LSTM. 4 | """ 5 | 6 | from typing import Optional, Tuple, List 7 | 8 | import torch 9 | from torch.autograd import Variable 10 | 11 | from .util import block_orthogonal, get_dropout_mask 12 | 13 | class LstmCellWithProjection(torch.nn.Module): 14 | """ 15 | An LSTM with Recurrent Dropout and a projected and clipped hidden state and 16 | memory. Note: this implementation is slower than the native Pytorch LSTM because 17 | it cannot make use of CUDNN optimizations for stacked RNNs due to and 18 | variational dropout and the custom nature of the cell state. 19 | Parameters 20 | ---------- 21 | input_size : ``int``, required. 22 | The dimension of the inputs to the LSTM. 23 | hidden_size : ``int``, required. 24 | The dimension of the outputs of the LSTM. 25 | cell_size : ``int``, required. 26 | The dimension of the memory cell used for the LSTM. 27 | go_forward: ``bool``, optional (default = True) 28 | The direction in which the LSTM is applied to the sequence. 29 | Forwards by default, or backwards if False. 30 | recurrent_dropout_probability: ``float``, optional (default = 0.0) 31 | The dropout probability to be used in a dropout scheme as stated in 32 | `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks 33 | `_ . Implementation wise, this simply 34 | applies a fixed dropout mask per sequence to the recurrent connection of the 35 | LSTM. 36 | state_projection_clip_value: ``float``, optional, (default = None) 37 | The magnitude with which to clip the hidden_state after projecting it. 38 | memory_cell_clip_value: ``float``, optional, (default = None) 39 | The magnitude with which to clip the memory cell. 
40 | Returns 41 | ------- 42 | output_accumulator : ``torch.FloatTensor`` 43 | The outputs of the LSTM for each timestep. A tensor of shape 44 | (batch_size, max_timesteps, hidden_size) where for a given batch 45 | element, all outputs past the sequence length for that batch are 46 | zero tensors. 47 | final_state: ``Tuple[torch.FloatTensor, torch.FloatTensor]`` 48 | The final (state, memory) states of the LSTM, with shape 49 | (1, batch_size, hidden_size) and (1, batch_size, cell_size) 50 | respectively. The first dimension is 1 in order to match the Pytorch 51 | API for returning stacked LSTM states. 52 | """ 53 | def __init__(self, 54 | input_size: int, 55 | hidden_size: int, 56 | cell_size: int, 57 | go_forward: bool = True, 58 | recurrent_dropout_probability: float = 0.0, 59 | memory_cell_clip_value: Optional[float] = None, 60 | state_projection_clip_value: Optional[float] = None) -> None: 61 | super(LstmCellWithProjection, self).__init__() 62 | # Required to be wrapped with a :class:`PytorchSeq2SeqWrapper`. 63 | self.input_size = input_size 64 | self.hidden_size = hidden_size 65 | self.cell_size = cell_size 66 | 67 | self.go_forward = go_forward 68 | self.state_projection_clip_value = state_projection_clip_value 69 | self.memory_cell_clip_value = memory_cell_clip_value 70 | self.recurrent_dropout_probability = recurrent_dropout_probability 71 | 72 | # We do the projections for all the gates all at once. 73 | self.input_linearity = torch.nn.Linear(input_size, 4 * cell_size, bias=False) 74 | self.state_linearity = torch.nn.Linear(hidden_size, 4 * cell_size, bias=True) 75 | 76 | # Additional projection matrix for making the hidden state smaller. 77 | self.state_projection = torch.nn.Linear(cell_size, hidden_size, bias=False) 78 | self.reset_parameters() 79 | 80 | def reset_parameters(self): 81 | # Use sensible default initializations for parameters. 82 | block_orthogonal(self.input_linearity.weight.data, [self.cell_size, self.input_size]) 83 | block_orthogonal(self.state_linearity.weight.data, [self.cell_size, self.hidden_size]) 84 | 85 | self.state_linearity.bias.data.fill_(0.0) 86 | # Initialize forget gate biases to 1.0 as per An Empirical 87 | # Exploration of Recurrent Network Architectures, (Jozefowicz, 2015). 88 | self.state_linearity.bias.data[self.cell_size:2 * self.cell_size].fill_(1.0) 89 | 90 | def forward(self, # pylint: disable=arguments-differ 91 | inputs: torch.FloatTensor, 92 | batch_lengths: List[int], 93 | initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None): 94 | """ 95 | Parameters 96 | ---------- 97 | inputs : ``torch.FloatTensor``, required. 98 | A tensor of shape (batch_size, num_timesteps, input_size) 99 | to apply the LSTM over. 100 | batch_lengths : ``List[int]``, required. 101 | A list of length batch_size containing the lengths of the sequences in batch. 102 | initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None) 103 | A tuple (state, memory) representing the initial hidden state and memory 104 | of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and the 105 | ``memory`` has shape (1, batch_size, cell_size). 106 | Returns 107 | ------- 108 | output_accumulator : ``torch.FloatTensor`` 109 | The outputs of the LSTM for each timestep. A tensor of shape 110 | (batch_size, max_timesteps, hidden_size) where for a given batch 111 | element, all outputs past the sequence length for that batch are 112 | zero tensors. 
113 |         final_state : ``Tuple[torch.FloatTensor, torch.FloatTensor]``
114 |             A tuple (state, memory) representing the final hidden state and memory
115 |             of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and the
116 |             ``memory`` has shape (1, batch_size, cell_size).
117 |         """
118 |         batch_size = inputs.size()[0]
119 |         total_timesteps = inputs.size()[1]
120 | 
121 |         # We have to use this '.data.new().fill_' pattern to create tensors with the correct
122 |         # type - forward has no knowledge of whether these are torch.Tensors or torch.cuda.Tensors.
123 |         output_accumulator = Variable(inputs.data.new(batch_size,
124 |                                                       total_timesteps,
125 |                                                       self.hidden_size).fill_(0))
126 |         if initial_state is None:
127 |             full_batch_previous_memory = Variable(inputs.data.new(batch_size,
128 |                                                                   self.cell_size).fill_(0))
129 |             full_batch_previous_state = Variable(inputs.data.new(batch_size,
130 |                                                                  self.hidden_size).fill_(0))
131 |         else:
132 |             full_batch_previous_state = initial_state[0].squeeze(0)
133 |             full_batch_previous_memory = initial_state[1].squeeze(0)
134 | 
135 |         current_length_index = batch_size - 1 if self.go_forward else 0
136 |         if self.recurrent_dropout_probability > 0.0 and self.training:
137 |             dropout_mask = get_dropout_mask(self.recurrent_dropout_probability,
138 |                                             full_batch_previous_state)
139 |         else:
140 |             dropout_mask = None
141 | 
142 |         for timestep in range(total_timesteps):
143 |             # The index depends on which end we start.
144 |             index = timestep if self.go_forward else total_timesteps - timestep - 1
145 | 
146 |             # What we are doing here is finding the index into the batch dimension
147 |             # which we need to use for this timestep, because the sequences have
148 |             # variable length, so once the index is greater than the length of this
149 |             # particular batch sequence, we no longer need to do the computation for
150 |             # this sequence. The key thing to recognise here is that the batch inputs
151 |             # must be _ordered_ by length from longest (first in batch) to shortest
152 |             # (last) so initially, we are going forwards with every sequence and as we
153 |             # pass the index at which the shortest elements of the batch finish,
154 |             # we stop picking them up for the computation.
155 |             if self.go_forward:
156 |                 while batch_lengths[current_length_index] <= index:
157 |                     current_length_index -= 1
158 |             # If we're going backwards, we are _picking up_ more indices.
159 |             else:
160 |                 # First conditional: Are we already at the maximum number of elements in the batch?
161 |                 # Second conditional: Does the next shortest sequence beyond the current batch
162 |                 # index require computation at this timestep?
163 |                 while current_length_index < (len(batch_lengths) - 1) and \
164 |                         batch_lengths[current_length_index + 1] > index:
165 |                     current_length_index += 1
166 | 
167 |             # Actually get the slices of the batch which we
168 |             # need for the computation at this timestep.
169 |             # shape (batch_size, cell_size)
170 |             previous_memory = full_batch_previous_memory[0: current_length_index + 1].clone()
171 |             # Shape (batch_size, hidden_size)
172 |             previous_state = full_batch_previous_state[0: current_length_index + 1].clone()
173 |             # Shape (batch_size, input_size)
174 |             timestep_input = inputs[0: current_length_index + 1, index]
175 | 
176 |             # Do the projections for all the gates all at once.
177 | # Both have shape (batch_size, 4 * cell_size) 178 | projected_input = self.input_linearity(timestep_input) 179 | projected_state = self.state_linearity(previous_state) 180 | 181 | # Main LSTM equations using relevant chunks of the big linear 182 | # projections of the hidden state and inputs. 183 | input_gate = torch.sigmoid(projected_input[:, (0 * self.cell_size):(1 * self.cell_size)] + 184 | projected_state[:, (0 * self.cell_size):(1 * self.cell_size)]) 185 | forget_gate = torch.sigmoid(projected_input[:, (1 * self.cell_size):(2 * self.cell_size)] + 186 | projected_state[:, (1 * self.cell_size):(2 * self.cell_size)]) 187 | memory_init = torch.tanh(projected_input[:, (2 * self.cell_size):(3 * self.cell_size)] + 188 | projected_state[:, (2 * self.cell_size):(3 * self.cell_size)]) 189 | output_gate = torch.sigmoid(projected_input[:, (3 * self.cell_size):(4 * self.cell_size)] + 190 | projected_state[:, (3 * self.cell_size):(4 * self.cell_size)]) 191 | memory = input_gate * memory_init + forget_gate * previous_memory 192 | 193 | # Here is the non-standard part of this LSTM cell; first, we clip the 194 | # memory cell, then we project the output of the timestep to a smaller size 195 | # and again clip it. 196 | 197 | if self.memory_cell_clip_value: 198 | # pylint: disable=invalid-unary-operand-type 199 | memory = torch.clamp(memory, -self.memory_cell_clip_value, self.memory_cell_clip_value) 200 | 201 | # shape (current_length_index, cell_size) 202 | pre_projection_timestep_output = output_gate * torch.tanh(memory) 203 | 204 | # shape (current_length_index, hidden_size) 205 | timestep_output = self.state_projection(pre_projection_timestep_output) 206 | if self.state_projection_clip_value: 207 | # pylint: disable=invalid-unary-operand-type 208 | timestep_output = torch.clamp(timestep_output, 209 | -self.state_projection_clip_value, 210 | self.state_projection_clip_value) 211 | 212 | # Only do dropout if the dropout prob is > 0.0 and we are in training mode. 213 | if dropout_mask is not None: 214 | timestep_output = timestep_output * dropout_mask[0: current_length_index + 1] 215 | 216 | # We've been doing computation with less than the full batch, so here we create a new 217 | # variable for the the whole batch at this timestep and insert the result for the 218 | # relevant elements of the batch into it. 219 | full_batch_previous_memory = Variable(full_batch_previous_memory.data.clone()) 220 | full_batch_previous_state = Variable(full_batch_previous_state.data.clone()) 221 | full_batch_previous_memory[0:current_length_index + 1] = memory 222 | full_batch_previous_state[0:current_length_index + 1] = timestep_output 223 | output_accumulator[0:current_length_index + 1, index] = timestep_output 224 | 225 | # Mimic the pytorch API by returning state in the following shape: 226 | # (num_layers * num_directions, batch_size, ...). As this 227 | # LSTM cell cannot be stacked, the first dimension here is just 1. 
228 | final_state = (full_batch_previous_state.unsqueeze(0), 229 | full_batch_previous_memory.unsqueeze(0)) 230 | 231 | return output_accumulator, final_state 232 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/token_embedder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import unicode_literals 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import copy 8 | from .highway import Highway 9 | 10 | 11 | class LstmTokenEmbedder(nn.Module): 12 | def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda=False): 13 | super(LstmTokenEmbedder, self).__init__() 14 | self.config = config 15 | self.use_cuda = use_cuda 16 | self.word_emb_layer = word_emb_layer 17 | self.char_emb_layer = char_emb_layer 18 | self.output_dim = config['encoder']['projection_dim'] 19 | emb_dim = 0 20 | if word_emb_layer is not None: 21 | emb_dim += word_emb_layer.n_d 22 | 23 | if char_emb_layer is not None: 24 | emb_dim += char_emb_layer.n_d * 2 25 | self.char_lstm = nn.LSTM(char_emb_layer.n_d, char_emb_layer.n_d, num_layers=1, bidirectional=True, 26 | batch_first=True, dropout=config['dropout']) 27 | 28 | self.projection = nn.Linear(emb_dim, self.output_dim, bias=True) 29 | 30 | def forward(self, word_inp, chars_inp, shape): 31 | embs = [] 32 | batch_size, seq_len = shape 33 | if self.word_emb_layer is not None: 34 | word_emb = self.word_emb_layer(Variable(word_inp).cuda() if self.use_cuda else Variable(word_inp)) 35 | embs.append(word_emb) 36 | 37 | if self.char_emb_layer is not None: 38 | chars_inp = chars_inp.view(batch_size * seq_len, -1) 39 | chars_emb = self.char_emb_layer(Variable(chars_inp).cuda() if self.use_cuda else Variable(chars_inp)) 40 | _, (chars_outputs, __) = self.char_lstm(chars_emb) 41 | chars_outputs = chars_outputs.contiguous().view(-1, self.config['token_embedder']['char_dim'] * 2) 42 | embs.append(chars_outputs) 43 | 44 | token_embedding = torch.cat(embs, dim=2) 45 | 46 | return self.projection(token_embedding) 47 | 48 | 49 | class ConvTokenEmbedder(nn.Module): 50 | def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda): 51 | super(ConvTokenEmbedder, self).__init__() 52 | self.config = config 53 | self.use_cuda = use_cuda 54 | 55 | self.word_emb_layer = word_emb_layer 56 | self.char_emb_layer = char_emb_layer 57 | 58 | self.output_dim = config['encoder']['projection_dim'] 59 | self.emb_dim = 0 60 | if word_emb_layer is not None: 61 | self.emb_dim += word_emb_layer.n_d 62 | 63 | if char_emb_layer is not None: 64 | self.convolutions = [] 65 | cnn_config = config['token_embedder'] 66 | filters = cnn_config['filters'] 67 | char_embed_dim = cnn_config['char_dim'] 68 | 69 | for i, (width, num) in enumerate(filters): 70 | conv = torch.nn.Conv1d( 71 | in_channels=char_embed_dim, 72 | out_channels=num, 73 | kernel_size=width, 74 | bias=True 75 | ) 76 | self.convolutions.append(conv) 77 | 78 | self.convolutions = nn.ModuleList(self.convolutions) 79 | 80 | self.n_filters = sum(f[1] for f in filters) 81 | self.n_highway = cnn_config['n_highway'] 82 | 83 | self.highways = Highway(self.n_filters, self.n_highway, activation=torch.nn.functional.relu) 84 | self.emb_dim += self.n_filters 85 | 86 | self.projection = nn.Linear(self.emb_dim, self.output_dim, bias=True) 87 | 88 | def forward(self, word_inp, chars_inp, shape): 89 | embs = [] 90 | batch_size, seq_len 
= shape 91 | if self.word_emb_layer is not None: 92 | batch_size, seq_len = word_inp.size(0), word_inp.size(1) 93 | word_emb = self.word_emb_layer(Variable(word_inp).cuda() if self.use_cuda else Variable(word_inp)) 94 | embs.append(word_emb) 95 | 96 | if self.char_emb_layer is not None: 97 | chars_inp = chars_inp.view(batch_size * seq_len, -1) 98 | 99 | character_embedding = self.char_emb_layer(Variable(chars_inp).cuda() if self.use_cuda else Variable(chars_inp)) 100 | 101 | character_embedding = torch.transpose(character_embedding, 1, 2) 102 | 103 | cnn_config = self.config['token_embedder'] 104 | if cnn_config['activation'] == 'tanh': 105 | activation = torch.nn.functional.tanh 106 | elif cnn_config['activation'] == 'relu': 107 | activation = torch.nn.functional.relu 108 | else: 109 | raise Exception("Unknown activation") 110 | 111 | convs = [] 112 | for i in range(len(self.convolutions)): 113 | convolved = self.convolutions[i](character_embedding) 114 | # (batch_size * sequence_length, n_filters for this width) 115 | convolved, _ = torch.max(convolved, dim=-1) 116 | convolved = activation(convolved) 117 | convs.append(convolved) 118 | char_emb = torch.cat(convs, dim=-1) 119 | char_emb = self.highways(char_emb) 120 | 121 | embs.append(char_emb.view(batch_size, -1, self.n_filters)) 122 | 123 | token_embedding = torch.cat(embs, dim=2) 124 | 125 | return self.projection(token_embedding) 126 | -------------------------------------------------------------------------------- /elmoformanylangs/modules/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Assorted utilities for working with neural networks in AllenNLP. 3 | """ 4 | from collections import defaultdict 5 | from typing import Dict, List, Optional, Any, Tuple, Callable 6 | import itertools 7 | import math 8 | import torch 9 | from torch.autograd import Variable 10 | 11 | def get_lengths_from_binary_sequence_mask(mask: torch.Tensor): 12 | """ 13 | Compute sequence lengths for each batch element in a tensor using a 14 | binary mask. 15 | Parameters 16 | ---------- 17 | mask : torch.Tensor, required. 18 | A 2D binary mask of shape (batch_size, sequence_length) to 19 | calculate the per-batch sequence lengths from. 20 | Returns 21 | ------- 22 | A torch.LongTensor of shape (batch_size,) representing the lengths 23 | of the sequences in the batch. 24 | """ 25 | return mask.long().sum(-1) 26 | 27 | 28 | def sort_batch_by_length(tensor: torch.autograd.Variable, 29 | sequence_lengths: torch.autograd.Variable): 30 | """ 31 | Sort a batch first tensor by some specified lengths. 32 | Parameters 33 | ---------- 34 | tensor : Variable(torch.FloatTensor), required. 35 | A batch first Pytorch tensor. 36 | sequence_lengths : Variable(torch.LongTensor), required. 37 | A tensor representing the lengths of some dimension of the tensor which 38 | we want to sort by. 39 | Returns 40 | ------- 41 | sorted_tensor : Variable(torch.FloatTensor) 42 | The original tensor sorted along the batch dimension with respect to sequence_lengths. 43 | sorted_sequence_lengths : Variable(torch.LongTensor) 44 | The original sequence_lengths sorted by decreasing size. 45 | restoration_indices : Variable(torch.LongTensor) 46 | Indices into the sorted_tensor such that 47 | ``sorted_tensor.index_select(0, restoration_indices) == original_tensor`` 48 | permuation_index : Variable(torch.LongTensor) 49 | The indices used to sort the tensor. This is useful if you want to sort many 50 | tensors using the same ordering. 
51 | """ 52 | 53 | if not isinstance(tensor, Variable) or not isinstance(sequence_lengths, Variable): 54 | raise Exception("Both the tensor and sequence lengths must be torch.autograd.Variables.") 55 | 56 | sorted_sequence_lengths, permutation_index = sequence_lengths.sort(0, descending=True) 57 | sorted_tensor = tensor.index_select(0, permutation_index) 58 | 59 | # This is ugly, but required - we are creating a new variable at runtime, so we 60 | # must ensure it has the correct CUDA vs non-CUDA type. We do this by cloning and 61 | # refilling one of the inputs to the function. 62 | index_range = sequence_lengths.data.clone().copy_(torch.arange(0, len(sequence_lengths))) 63 | # This is the equivalent of zipping with index, sorting by the original 64 | # sequence lengths and returning the now sorted indices. 65 | index_range = Variable(index_range.long()) 66 | _, reverse_mapping = permutation_index.sort(0, descending=False) 67 | restoration_indices = index_range.index_select(0, reverse_mapping) 68 | return sorted_tensor, sorted_sequence_lengths, restoration_indices, permutation_index 69 | 70 | 71 | def get_final_encoder_states(encoder_outputs: torch.Tensor, 72 | mask: torch.Tensor, 73 | bidirectional: bool = False) -> torch.Tensor: 74 | """ 75 | Given the output from a ``Seq2SeqEncoder``, with shape ``(batch_size, sequence_length, 76 | encoding_dim)``, this method returns the final hidden state for each element of the batch, 77 | giving a tensor of shape ``(batch_size, encoding_dim)``. This is not as simple as 78 | ``encoder_outputs[:, -1]``, because the sequences could have different lengths. We use the 79 | mask (which has shape ``(batch_size, sequence_length)``) to find the final state for each batch 80 | instance. 81 | Additionally, if ``bidirectional`` is ``True``, we will split the final dimension of the 82 | ``encoder_outputs`` into two and assume that the first half is for the forward direction of the 83 | encoder and the second half is for the backward direction. We will concatenate the last state 84 | for each encoder dimension, giving ``encoder_outputs[:, -1, :encoding_dim/2]`` concated with 85 | ``encoder_outputs[:, 0, encoding_dim/2:]``. 86 | """ 87 | # These are the indices of the last words in the sequences (i.e. length sans padding - 1). We 88 | # are assuming sequences are right padded. 89 | # Shape: (batch_size,) 90 | last_word_indices = mask.sum(1).long() - 1 91 | batch_size, _, encoder_output_dim = encoder_outputs.size() 92 | expanded_indices = last_word_indices.view(-1, 1, 1).expand(batch_size, 1, encoder_output_dim) 93 | # Shape: (batch_size, 1, encoder_output_dim) 94 | final_encoder_output = encoder_outputs.gather(1, expanded_indices) 95 | final_encoder_output = final_encoder_output.squeeze(1) # (batch_size, encoder_output_dim) 96 | if bidirectional: 97 | final_forward_output = final_encoder_output[:, :(encoder_output_dim // 2)] 98 | final_backward_output = encoder_outputs[:, 0, (encoder_output_dim // 2):] 99 | final_encoder_output = torch.cat([final_forward_output, final_backward_output], dim=-1) 100 | return final_encoder_output 101 | 102 | 103 | def get_dropout_mask(dropout_probability: float, tensor_for_masking: torch.autograd.Variable): 104 | """ 105 | Computes and returns an element-wise dropout mask for a given tensor, where 106 | each element in the mask is dropped out with probability dropout_probability. 107 | Note that the mask is NOT applied to the tensor - the tensor is passed to retain 108 | the correct CUDA tensor type for the mask. 
109 |     Parameters
110 |     ----------
111 |     dropout_probability : float, required.
112 |         Probability of dropping a dimension of the input.
113 |     tensor_for_masking : torch.autograd.Variable, required.
114 |     Returns
115 |     -------
116 |     A torch.FloatTensor consisting of the binary mask scaled by 1 / (1 - dropout_probability).
117 |     This scaling ensures expected values and variances of the output of applying this mask
118 |     and the original tensor are the same.
119 |     """
120 |     binary_mask = tensor_for_masking.clone()
121 |     binary_mask.data.copy_(torch.rand(tensor_for_masking.size()) > dropout_probability)
122 |     # Scale mask by 1/keep_prob to preserve output statistics.
123 |     dropout_mask = binary_mask.float().div(1.0 - dropout_probability)
124 |     return dropout_mask
125 | 
126 | def block_orthogonal(tensor: torch.Tensor,
127 |                      split_sizes: List[int],
128 |                      gain: float = 1.0) -> None:
129 |     """
130 |     An initializer which allows initializing model parameters in "blocks". This is helpful
131 |     in the case of recurrent models which use multiple gates applied to linear projections,
132 |     which can be computed efficiently if they are concatenated together. However, they are
133 |     separate parameters which should be initialized independently.
134 |     Parameters
135 |     ----------
136 |     tensor : ``torch.Tensor``, required.
137 |         A tensor to initialize.
138 |     split_sizes : List[int], required.
139 |         A list of length ``tensor.ndim()`` specifying the size of the
140 |         blocks along that particular dimension. E.g. ``[10, 20]`` would
141 |         result in the tensor being split into chunks of size 10 along the
142 |         first dimension and 20 along the second.
143 |     gain : float, optional (default = 1.0)
144 |         The gain (scaling) applied to the orthogonal initialization.
145 |     """
146 | 
147 |     if isinstance(tensor, Variable):
148 |         # In PyTorch >= 0.4, Variable and Tensor are the same class, so this check
149 |         # accepts both the ``.data`` tensors passed from ``reset_parameters`` and
150 |         # plain tensors; a separate Variable code path is no longer needed.
151 |         sizes = list(tensor.size())
152 |         if any([a % b != 0 for a, b in zip(sizes, split_sizes)]):
153 |             raise ValueError("tensor dimensions must be divisible by their respective "
154 |                              "split_sizes. Found size: {} and split_sizes: {}".format(sizes, split_sizes))
155 |         indexes = [list(range(0, max_size, split))
156 |                    for max_size, split in zip(sizes, split_sizes)]
157 |         # Iterate over all possible blocks within the tensor.
158 |         for block_start_indices in itertools.product(*indexes):
159 |             # A list of tuples containing the index to start at for this block
160 |             # and the appropriate step size (i.e split_size[i] for dimension i).
161 |             index_and_step_tuples = zip(block_start_indices, split_sizes)
162 |             # This is a tuple of slices corresponding to:
163 |             # tensor[index: index + step_size, ...]. This is
164 |             # required because we could have an arbitrary number
165 |             # of dimensions. The actual slices we need are the
166 |             # start_index: start_index + step for each dimension in the tensor.
167 |             block_slice = tuple([slice(start_index, start_index + step)
168 |                                  for start_index, step in index_and_step_tuples])
169 |             tensor[block_slice] = torch.nn.init.orthogonal_(tensor[block_slice].contiguous(), gain=gain)
170 | 
--------------------------------------------------------------------------------
/elmoformanylangs/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import unicode_literals
3 | import collections
4 | import itertools
5 | 
6 | 
7 | def flatten(lst):
8 |     return list(itertools.chain.from_iterable(lst))
9 | 
10 | 
11 | def deep_iter(x):
12 |     if isinstance(x, list) or isinstance(x, tuple):
13 |         for u in x:
14 |             for v in deep_iter(u):
15 |                 yield v
16 |     else:
17 |         yield x
18 | 
19 | 
20 | def dict2namedtuple(dic):
21 |     return collections.namedtuple('Namespace', dic.keys())(**dic)
22 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import setuptools
3 | 
4 | # read the contents of the README file
5 | from os import path
6 | this_directory = path.abspath(path.dirname(__file__))
7 | with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
8 |     long_description = f.read()
9 | 
10 | 
11 | setuptools.setup(
12 |     name="elmoformanylangs",
13 |     version="0.0.4.post2",
14 |     packages=setuptools.find_packages(),
15 |     install_requires=[
16 |         "torch",
17 |         "h5py",
18 |         "numpy",
19 |         "overrides",
20 |     ],
21 |     package_data={'configs': ['elmoformanylangs/configs/*.json']},
22 |     include_package_data=True,
23 |     author="Research Center for Social Computing and Information Retrieval",
24 |     description="ELMo, updated to be usable with models for many languages",
25 |     long_description=long_description,
26 |     long_description_content_type='text/markdown',
27 |     url="https://github.com/HIT-SCIR/ELMoForManyLangs",
28 |     classifiers=[
29 |         "Development Status :: 3 - Alpha",
30 |         "Programming Language :: Python",
31 |         "Programming Language :: Python :: 3.6",
32 |     ],
33 | )
34 | 
--------------------------------------------------------------------------------
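
Usage sketch (not part of the repository): a minimal example of how LstmCellWithProjection and sort_batch_by_length from the listings above fit together. It assumes the package is importable (e.g. after running `pip install -e .` from the repository root) and a PyTorch version >= 0.4, where Variable and Tensor are the same class; the batch size, dimensions, and sequence lengths below are invented purely for illustration.

# Toy, illustrative numbers only; not taken from the repository's configs.
import torch
from elmoformanylangs.modules.lstm_cell_with_projection import LstmCellWithProjection
from elmoformanylangs.modules.util import sort_batch_by_length

batch_size, max_len, input_size = 4, 7, 16
hidden_size, cell_size = 8, 32

inputs = torch.randn(batch_size, max_len, input_size)
lengths = torch.tensor([5, 7, 2, 6])  # unsorted per-sequence lengths

# The cell expects the batch ordered from longest to shortest sequence.
sorted_inputs, sorted_lengths, restoration_indices, _ = sort_batch_by_length(inputs, lengths)

cell = LstmCellWithProjection(input_size=input_size,
                              hidden_size=hidden_size,
                              cell_size=cell_size,
                              go_forward=True,
                              recurrent_dropout_probability=0.1,
                              memory_cell_clip_value=3.0,
                              state_projection_clip_value=3.0)
cell.eval()  # disable recurrent dropout so the sketch is deterministic

outputs, (final_state, final_memory) = cell(sorted_inputs, sorted_lengths.tolist())

# Back to the original batch order; positions past each sequence's length stay zero.
outputs = outputs.index_select(0, restoration_indices)

print(outputs.shape)       # torch.Size([4, 7, 8])
print(final_state.shape)   # torch.Size([1, 4, 8])
print(final_memory.shape)  # torch.Size([1, 4, 32])

The sort-then-restore dance mirrors what the cell's own forward pass relies on: its comments state that batch inputs must be ordered by decreasing length so that shorter sequences can simply be dropped from the computation as timesteps advance, and restoration_indices from sort_batch_by_length undoes that reordering afterwards.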