├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── elmoformanylangs
    ├── __init__.py
    ├── __main__.py
    ├── biLM.py
    ├── configs
    │   ├── cnn_0_100_512_4096_sample.json
    │   └── cnn_50_100_512_4096_sample.json
    ├── dataloader.py
    ├── elmo.py
    ├── frontend.py
    ├── modules
    │   ├── __init__.py
    │   ├── classify_layer.py
    │   ├── elmo.py
    │   ├── embedding_layer.py
    │   ├── encoder_base.py
    │   ├── highway.py
    │   ├── lstm.py
    │   ├── lstm_cell_with_projection.py
    │   ├── token_embedder.py
    │   └── util.py
    └── utils.py
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | .DS_Store
106 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 哈工大社会计算与信息检索研究中心
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include elmoformanylangs/configs/*.json
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Pre-trained ELMo Representations for Many Languages
  2 | ===================================================
  3 | 
  4 | We release our ELMo representations trained on many languages
  5 | which helps us win the [CoNLL 2018 shared task on Universal Dependencies Parsing](http://universaldependencies.org/conll18/results.html)
  6 | according to LAS.
  7 | 
  8 | ## Technique Details
  9 | 
 10 | We use the same hyperparameter settings as [Peters et al. (2018)](https://arxiv.org/abs/1802.05365) for the biLM
 11 | and the character CNN.
 12 | We train their parameters
 13 | on a set of 20-million-words data randomly
 14 | sampled from the raw text released by the shared task (wikidump + common crawl) for each language.
 15 | We largely based ourselves on the code of [AllenNLP](https://allennlp.org/), but made the following changes:
 16 | 
 17 | * We support unicode characters;
 18 | * We use the *sample softmax* technique
 19 | to make training on large vocabulary feasible ([Jean et al., 2015](https://arxiv.org/abs/1412.2007)).
 20 | However, we use a window of words surrounding the target word
 21 | as negative samples and it shows better performance in our preliminary experiments.
 22 | 
 23 | The training of ELMo on one language takes roughly 3 days on an NVIDIA P100 GPU.
 24 | 
 25 | 
 26 | ## Downloads
 27 | 
 28 | |   |   |   |   |
 29 | |---|---|---|---|
 30 | | [Arabic](http://vectors.nlpl.eu/repository/11/136.zip) | [Bulgarian](http://vectors.nlpl.eu/repository/11/137.zip) | [Catalan](http://vectors.nlpl.eu/repository/11/138.zip) | [Czech](http://vectors.nlpl.eu/repository/11/139.zip) |
 31 | | [Old Church Slavonic](http://vectors.nlpl.eu/repository/11/140.zip) | [Danish](http://vectors.nlpl.eu/repository/11/141.zip) | [German](http://vectors.nlpl.eu/repository/11/142.zip) | [Greek](http://vectors.nlpl.eu/repository/11/143.zip) |
 32 | | [English](http://vectors.nlpl.eu/repository/11/144.zip) | [Spanish](http://vectors.nlpl.eu/repository/11/145.zip) | [Estonian](http://vectors.nlpl.eu/repository/11/146.zip) | [Basque](http://vectors.nlpl.eu/repository/11/147.zip) |
 33 | | [Persian](http://vectors.nlpl.eu/repository/11/148.zip) | [Finnish](http://vectors.nlpl.eu/repository/11/149.zip) | [French](http://vectors.nlpl.eu/repository/11/150.zip) | [Irish](http://vectors.nlpl.eu/repository/11/151.zip) |
 34 | | [Galician](http://vectors.nlpl.eu/repository/11/152.zip) | [Ancient Greek](http://vectors.nlpl.eu/repository/11/153.zip) | [Hebrew](http://vectors.nlpl.eu/repository/11/154.zip) | [Hindi](http://vectors.nlpl.eu/repository/11/155.zip) |
 35 | | [Croatian](http://vectors.nlpl.eu/repository/11/156.zip) | [Hungarian](http://vectors.nlpl.eu/repository/11/157.zip) | [Indonesian](http://vectors.nlpl.eu/repository/11/158.zip) | [Italian](http://vectors.nlpl.eu/repository/11/159.zip) |
 36 | | [Japanese](http://vectors.nlpl.eu/repository/11/160.zip) | [Korean](http://vectors.nlpl.eu/repository/11/161.zip) | [Latin](http://vectors.nlpl.eu/repository/11/162.zip) | [Latvian](http://vectors.nlpl.eu/repository/11/163.zip) |
 37 | | [Norwegian Bokmål](http://vectors.nlpl.eu/repository/11/165.zip) | [Dutch](http://vectors.nlpl.eu/repository/11/164.zip) | [Norwegian Nynorsk](http://vectors.nlpl.eu/repository/11/166.zip) | [Polish](http://vectors.nlpl.eu/repository/11/167.zip) |
 38 | | [Portuguese](http://vectors.nlpl.eu/repository/11/168.zip) | [Romanian](http://vectors.nlpl.eu/repository/11/169.zip) | [Russian](http://vectors.nlpl.eu/repository/11/170.zip) | [Slovak](http://vectors.nlpl.eu/repository/11/171.zip) |
 39 | | [Slovene](http://vectors.nlpl.eu/repository/11/172.zip) | [Swedish](http://vectors.nlpl.eu/repository/11/173.zip) | [Turkish](http://vectors.nlpl.eu/repository/11/174.zip) | [Uyghur](http://vectors.nlpl.eu/repository/11/175.zip) |
 40 | | [Ukrainian](http://vectors.nlpl.eu/repository/11/176.zip) | [Urdu](http://vectors.nlpl.eu/repository/11/177.zip) | [Vietnamese](http://vectors.nlpl.eu/repository/11/178.zip) | [Chinese](http://vectors.nlpl.eu/repository/11/179.zip) |
 41 | 
 42 | The models are hosted on the [NLPL Vectors Repository](http://wiki.nlpl.eu/index.php/Vectors/home).
 43 | 
 44 | **ELMo for Simplified Chinese**
 45 | 
 46 | We also provided [simplified-Chinese ELMo](http://39.96.43.154/zhs.model.tar.bz2).
 47 | It was trained on xinhua proportion of [Chinese gigawords-v5](https://catalog.ldc.upenn.edu/ldc2011t13),
 48 | which is different from the Wikipedia for traditional Chinese ELMo.
 49 | 
 50 | ## Pre-requirements
 51 | 
 52 | * **must** python >= 3.6 (if you use python3.5, you will encounter this issue https://github.com/HIT-SCIR/ELMoForManyLangs/issues/8)
 53 | * pytorch 0.4
 54 | * other requirements from allennlp
 55 | 
 56 | ## Usage
 57 | 
 58 | 
 59 | ### Install the package
 60 | 
 61 | You need to install the package to use the embeddings with the following commends
 62 | ```
 63 | python setup.py install
 64 | ```
 65 | 
 66 | ### Set up the `config_path`
 67 | After unzip the model, you will find a JSON file `${lang}.model/config.json`.
 68 | Please change the `"config_path"` field to the relative path to 
 69 | the model configuration `cnn_50_100_512_4096_sample.json`.
 70 | For example, if your ELMo model is `zht.model/config.json` and your model configuration
 71 | is `zht.model/cnn_50_100_512_4096_sample.json`, you need to change `"config_path"`
 72 | in `zht.model/config.json` to `cnn_50_100_512_4096_sample.json`.
 73 | 
 74 | If there is no configuration `cnn_50_100_512_4096_sample.json` under `${lang}.model`,
 75 | you can copy the `elmoformanylangs/configs/cnn_50_100_512_4096_sample.json` into `${lang}.model`,
 76 | or change the `"config_path"` into  `elmoformanylangs/configs/cnn_50_100_512_4096_sample.json`.
 77 | 
 78 | See [issue 27](https://github.com/HIT-SCIR/ELMoForManyLangs/issues/27) for more details. 
 79 | 
 80 | 
 81 | ### Use ELMoForManyLangs in command line
 82 | 
 83 | Prepare your input file in the [conllu format](http://universaldependencies.org/format.html), like
 84 | ```
 85 | 1   Sue    Sue    _   _   _   _   _   _   _
 86 | 2   likes  like   _   _   _   _   _   _   _
 87 | 3   coffee coffee _   _   _   _   _   _   _
 88 | 4   and    and    _   _   _   _   _   _   _
 89 | 5   Bill   Bill   _   _   _   _   _   _   _
 90 | 6   tea    tea    _   _   _   _   _   _   _
 91 | ```
 92 | Fileds should be separated by `'\t'`. We only use the second column and space (`' '`) is supported in
 93 | this field (for Vietnamese, a word can contains spaces).
 94 | Do remember tokenization!
 95 | 
 96 | When it's all set, run
 97 | 
 98 | ```
 99 | $ python -m elmoformanylangs test \
100 |     --input_format conll \
101 |     --input /path/to/your/input \
102 |     --model /path/to/your/model \
103 |     --output_prefix /path/to/your/output \
104 |     --output_format hdf5 \
105 |     --output_layer -1
106 | ```
107 | 
108 | It will dump an hdf5 encoded `dict` onto the disk, where the key is `'\t'` separated
109 | words in the sentence and the value is it's 3-layer averaged ELMo representation.
110 | You can also dump the cnn encoded word with `--output_layer 0`,
111 | the first layer of the LsTM with `--output_layer 1` and the second layer
112 | of the LSTM with `--output_layer 2`.  
113 | We are actively changing the interface to make it more adapted to the 
114 | AllenNLP ELMo and more programmatically friendly.
115 | 
116 | ### Use ELMoForManyLangs programmatically
117 | 
118 | Thanks @voidism for contributing the API.
119 | By using `Embedder` python object, you can use ELMo into your own code like this:
120 | 
121 | ```python
122 | from elmoformanylangs import Embedder
123 | 
124 | e = Embedder('/path/to/your/model/')
125 | 
126 | sents = [['今', '天', '天氣', '真', '好', '阿'],
127 | ['潮水', '退', '了', '就', '知道', '誰', '沒', '穿', '褲子']]
128 | # the list of lists which store the sentences 
129 | # after segment if necessary.
130 | 
131 | e.sents2elmo(sents)
132 | # will return a list of numpy arrays 
133 | # each with the shape=(seq_len, embedding_size)
134 | ```
135 | 
136 | #### the parameters to init Embedder:
137 | ```python
138 | class Embedder(model_dir='/path/to/your/model/', batch_size=64):
139 | ```
140 | - **model_dir**: the absolute path from the repo top dir to you model dir.
141 | - **batch_size**: the batch_size you want when the model inference, you can specify it properly according to your gpu/cpu ram size. (default: 64)
142 | 
143 | #### the parameters of the function sents2elmo:
144 | ```python
145 | def sents2elmo(sents, output_layer=-1):
146 | ```
147 | - **sents**: the list of lists which store the sentences after segment if necessary.
148 | - **output_layer**: the target layer to output. 
149 |     -  0 for the word encoder
150 |     -  1 for the first LSTM hidden layer
151 |     -  2 for the second LSTM hidden layer
152 |     -  -1 for an average of 3 layers. (default)
153 |     -  -2 for all 3 layers
154 | 
155 | ## Training Your Own ELMo
156 | 
157 | Please run 
158 | ```
159 | $ python -m elmoformanylangs.biLM train -h
160 | ```
161 | to get more details about the ELMo training. 
162 | 
163 | Here is an example for training English ELMo.
164 | ```
165 | $ less data/en.raw
166 | ... (snip) ...
167 | Notable alumni
168 | Aris Kalafatis ( Acting )
169 | Labour Party
170 | They build an open nest in a tree hole , or man - made nest - boxes .
171 | Legacy
172 | ... (snip) ...
173 | 
174 | $ python -m elmoformanylangs.biLM train \
175 |     --train_path data/en.raw \
176 |     --config_path elmoformanylangs/configs/cnn_50_100_512_4096_sample.json \
177 |     --model output/en \
178 |     --optimizer adam \
179 |     --lr 0.001 \
180 |     --lr_decay 0.8 \
181 |     --max_epoch 10 \
182 |     --max_sent_len 20 \
183 |     --max_vocab_size 150000 \
184 |     --min_count 3
185 | ```
186 | However, we
187 | need to add that the training process is not very stable.
188 | In some cases, we end up with a loss of `nan`. We are actively working on that and hopefully
189 | improve it in the future.
190 | 
191 | ## Citation
192 | 
193 | If our ELMo gave you nice improvements, please cite us.
194 | 
195 | ```
196 | @InProceedings{che-EtAl:2018:K18-2,
197 |   author    = {Che, Wanxiang  and  Liu, Yijia  and  Wang, Yuxuan  and  Zheng, Bo  and  Liu, Ting},
198 |   title     = {Towards Better {UD} Parsing: Deep Contextualized Word Embeddings, Ensemble, and Treebank Concatenation},
199 |   booktitle = {Proceedings of the {CoNLL} 2018 Shared Task: Multilingual Parsing from Raw Text to Universal Dependencies},
200 |   month     = {October},
201 |   year      = {2018},
202 |   address   = {Brussels, Belgium},
203 |   publisher = {Association for Computational Linguistics},
204 |   pages     = {55--64},
205 |   url       = {http://www.aclweb.org/anthology/K18-2005}
206 | }
207 | ```
208 | 
209 | Please also cite the 
210 | [NLPL Vectors Repository](http://wiki.nlpl.eu/index.php/Vectors/home)
211 | for hosting the models.
212 | ```
213 | @InProceedings{fares-EtAl:2017:NoDaLiDa,
214 |   author    = {Fares, Murhaf  and  Kutuzov, Andrey  and  Oepen, Stephan  and  Velldal, Erik},
215 |   title     = {Word vectors, reuse, and replicability: Towards a community repository of large-text resources},
216 |   booktitle = {Proceedings of the 21st Nordic Conference on Computational Linguistics},
217 |   month     = {May},
218 |   year      = {2017},
219 |   address   = {Gothenburg, Sweden},
220 |   publisher = {Association for Computational Linguistics},
221 |   pages     = {271--276},
222 |   url       = {http://www.aclweb.org/anthology/W17-0237}
223 | }
224 | ```
225 | 


--------------------------------------------------------------------------------
/elmoformanylangs/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from .elmo import Embedder
 3 | 
 4 | 
 5 | import logging
 6 | logger = logging.getLogger('elmoformanylangs')
 7 | 
 8 | # if the client application hasn't set the log level, we set it
 9 | # ourselves to INFO
10 | if logger.level == 0:
11 |     logger.setLevel(logging.INFO)
12 | 
13 | log_handler = logging.StreamHandler()
14 | log_formatter = logging.Formatter(fmt="%(asctime)-15s %(levelname)s: %(message)s")
15 | log_handler.setFormatter(log_formatter)
16 | 
17 | # also, if the client hasn't added any handlers for this logger
18 | # (or a default handler), we add a handler of our own
19 | #
20 | # client can later do
21 | #   logger.removeHandler(stanza.log_handler)
22 | if not logger.hasHandlers():
23 |     logger.addHandler(log_handler)
24 | 


--------------------------------------------------------------------------------
/elmoformanylangs/__main__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | from __future__ import print_function
  3 | from __future__ import unicode_literals
  4 | import os
  5 | import sys
  6 | import codecs
  7 | import argparse
  8 | import logging
  9 | import json
 10 | import torch
 11 | from .modules.embedding_layer import EmbeddingLayer
 12 | from .utils import dict2namedtuple
 13 | from .frontend import Model
 14 | from .frontend import create_batches
 15 | import numpy as np
 16 | import h5py
 17 | 
 18 | logger = logging.getLogger('elmoformanylangs')
 19 | 
 20 | 
 21 | def read_corpus(path, max_chars=None):
 22 |   """
 23 |   read raw text file. The format of the input is like, one sentence per line
 24 |   words are separated by '\t'
 25 | 
 26 |   :param path:
 27 |   :param max_chars: int, the number of maximum characters in a word, this
 28 |     parameter is used when the model is configured with CNN word encoder.
 29 |   :return:
 30 |   """
 31 |   dataset = []
 32 |   textset = []
 33 |   with codecs.open(path, 'r', encoding='utf-8') as fin:
 34 |     for line in fin.read().strip().split('\n'):
 35 |       data = ['<bos>']
 36 |       text = []
 37 |       for token in line.split('\t'):
 38 |         text.append(token)
 39 |         if max_chars is not None and len(token) + 2 > max_chars:
 40 |           token = token[:max_chars - 2]
 41 |         data.append(token)
 42 |       data.append('<eos>')
 43 |       dataset.append(data)
 44 |       textset.append(text)
 45 |   return dataset, textset
 46 | 
 47 | 
 48 | def read_conll_corpus(path, max_chars=None):
 49 |   """
 50 |   read text in CoNLL-U format.
 51 | 
 52 |   :param path:
 53 |   :param max_chars:
 54 |   :return:
 55 |   """
 56 |   dataset = []
 57 |   textset = []
 58 |   with codecs.open(path, 'r', encoding='utf-8') as fin:
 59 |     for payload in fin.read().strip().split('\n\n'):
 60 |       data = ['<bos>']
 61 |       text = []
 62 |       lines = payload.splitlines()
 63 |       body = [line for line in lines if not line.startswith('#')]
 64 |       for line in body:
 65 |         fields = line.split('\t')
 66 |         num, token = fields[0], fields[1]
 67 |         if '-' in num or '.' in num:
 68 |           continue
 69 |         text.append(token)
 70 |         if max_chars is not None and len(token) + 2 > max_chars:
 71 |           token = token[:max_chars - 2]
 72 |         data.append(token)
 73 |       data.append('<eos>')
 74 |       dataset.append(data)
 75 |       textset.append(text)
 76 |   return dataset, textset
 77 | 
 78 | 
 79 | def read_conll_char_corpus(path, max_chars=None):
 80 |   """
 81 | 
 82 |   :param path:
 83 |   :param max_chars:
 84 |   :return:
 85 |   """
 86 |   dataset = []
 87 |   textset = []
 88 |   with codecs.open(path, 'r', encoding='utf-8') as fin:
 89 |     for payload in fin.read().strip().split('\n\n'):
 90 |       data = ['<bos>']
 91 |       text = []
 92 |       lines = payload.splitlines()
 93 |       body = [line for line in lines if not line.startswith('#')]
 94 |       for line in body:
 95 |         fields = line.split('\t')
 96 |         num, token = fields[0], fields[1]
 97 |         if '-' in num or '.' in num:
 98 |           continue
 99 |         for ch in token:
100 |           text.append(ch)
101 |           if max_chars is not None and len(ch) + 2 > max_chars:
102 |             ch = ch[:max_chars - 2]
103 |           data.append(ch)
104 |       data.append('<eos>')
105 |       dataset.append(data)
106 |       textset.append(text)
107 |   return dataset, textset
108 | 
109 | 
110 | def read_conll_char_vi_corpus(path, max_chars=None):
111 |   """
112 | 
113 |   :param path:
114 |   :param max_chars:
115 |   :return:
116 |   """
117 |   dataset = []
118 |   textset = []
119 |   with codecs.open(path, 'r', encoding='utf-8') as fin:
120 |     for payload in fin.read().strip().split('\n\n'):
121 |       data = ['<bos>']
122 |       text = []
123 |       lines = payload.splitlines()
124 |       body = [line for line in lines if not line.startswith('#')]
125 |       for line in body:
126 |         fields = line.split('\t')
127 |         num, token = fields[0], fields[1]
128 |         if '-' in num or '.' in num:
129 |           continue
130 |         for ch in token.split():
131 |           text.append(ch)
132 |           if max_chars is not None and len(ch) + 2 > max_chars:
133 |             ch = ch[:max_chars - 2]
134 |           data.append(ch)
135 |       data.append('<eos>')
136 |       dataset.append(data)
137 |       textset.append(text)
138 |   return dataset, textset
139 | 
140 | 
141 | def test_main():
142 |   # Configurations
143 |   cmd = argparse.ArgumentParser('The testing components of')
144 |   cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.')
145 |   cmd.add_argument('--input_format', default='plain', choices=('plain', 'conll', 'conll_char', 'conll_char_vi'),
146 |                    help='the input format.')
147 |   cmd.add_argument("--input", help="the path to the raw text file.")
148 |   cmd.add_argument("--output_format", default='hdf5', help='the output format. Supported format includes (hdf5, txt).'
149 |                                                            ' Use comma to separate the format identifiers,'
150 |                                                            ' like \'--output_format=hdf5,plain\'')
151 |   cmd.add_argument("--output_prefix", help='the prefix of the output file. The output file is in the format of '
152 |                                            '<output_prefix>.<output_layer>.<output_format>')
153 |   cmd.add_argument("--output_layer", help='the target layer to output. 0 for the word encoder, 1 for the first LSTM '
154 |                                           'hidden layer, 2 for the second LSTM hidden layer, -1 for an average'
155 |                                           'of 3 layers.')
156 |   cmd.add_argument("--model", required=True, help="the path to the model.")
157 |   cmd.add_argument("--batch_size", "--batch", type=int, default=1, help='the batch size.')
158 |   args = cmd.parse_args(sys.argv[2:])
159 | 
160 |   if args.gpu >= 0:
161 |     torch.cuda.set_device(args.gpu)
162 |   use_cuda = args.gpu >= 0 and torch.cuda.is_available()
163 |   # load the model configurations
164 |   args2 = dict2namedtuple(json.load(codecs.open(os.path.join(args.model, 'config.json'), 'r', encoding='utf-8')))
165 | 
166 |   with open(os.path.join(args.model, args2.config_path), 'r') as fin:
167 |     config = json.load(fin)
168 | 
169 |   # For the model trained with character-based word encoder.
170 |   if config['token_embedder']['char_dim'] > 0:
171 |     char_lexicon = {}
172 |     with codecs.open(os.path.join(args.model, 'char.dic'), 'r', encoding='utf-8') as fpi:
173 |       for line in fpi:
174 |         tokens = line.strip().split('\t')
175 |         if len(tokens) == 1:
176 |           tokens.insert(0, '\u3000')
177 |         token, i = tokens
178 |         char_lexicon[token] = int(i)
179 |     char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False, embs=None)
180 |     logger.info('char embedding size: ' + str(len(char_emb_layer.word2id)))
181 |   else:
182 |     char_lexicon = None
183 |     char_emb_layer = None
184 | 
185 |   # For the model trained with word form word encoder.
186 |   if config['token_embedder']['word_dim'] > 0:
187 |     word_lexicon = {}
188 |     with codecs.open(os.path.join(args.model, 'word.dic'), 'r', encoding='utf-8') as fpi:
189 |       for line in fpi:
190 |         tokens = line.strip().split('\t')
191 |         if len(tokens) == 1:
192 |           tokens.insert(0, '\u3000')
193 |         token, i = tokens
194 |         word_lexicon[token] = int(i)
195 |     word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=None)
196 |     logger.info('word embedding size: ' + str(len(word_emb_layer.word2id)))
197 |   else:
198 |     word_lexicon = None
199 |     word_emb_layer = None
200 | 
201 |   # instantiate the model
202 |   model = Model(config, word_emb_layer, char_emb_layer, use_cuda)
203 | 
204 |   if use_cuda:
205 |     model.cuda()
206 | 
207 |   logger.info(str(model))
208 |   model.load_model(args.model)
209 | 
210 |   # read test data according to input format
211 |   read_function = read_corpus if args.input_format == 'plain' else (
212 |     read_conll_corpus if args.input_format == 'conll' else (
213 |       read_conll_char_corpus if args.input_format == 'conll_char' else read_conll_char_vi_corpus))
214 | 
215 |   if config['token_embedder']['name'].lower() == 'cnn':
216 |     test, text = read_function(args.input, config['token_embedder']['max_characters_per_token'])
217 |   else:
218 |     test, text = read_function(args.input)
219 | 
220 |   # create test batches from the input data.
221 |   test_w, test_c, test_lens, test_masks, test_text = create_batches(
222 |     test, args.batch_size, word_lexicon, char_lexicon, config, text=text)
223 | 
224 |   # configure the model to evaluation mode.
225 |   model.eval()
226 | 
227 |   sent_set = set()
228 |   cnt = 0
229 | 
230 |   output_formats = args.output_format.split(',')
231 |   output_layers = map(int, args.output_layer.split(','))
232 | 
233 |   handlers = {}
234 |   for output_format in output_formats:
235 |     if output_format not in ('hdf5', 'txt'):
236 |       print('Unknown output_format: {0}'.format(output_format))
237 |       continue
238 |     for output_layer in output_layers:
239 |       filename = '{0}.ly{1}.{2}'.format(args.output_prefix, output_layer, output_format)
240 |       handlers[output_format, output_layer] = \
241 |         h5py.File(filename, 'w') if output_format == 'hdf5' else open(filename, 'w')
242 | 
243 |   for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks, test_text):
244 |     output = model.forward(w, c, masks)
245 |     for i, text in enumerate(texts):
246 |       sent = '\t'.join(text)
247 |       sent = sent.replace('.', '$period$')
248 |       sent = sent.replace('/', '$backslash$')
249 |       if sent in sent_set:
250 |         continue
251 |       sent_set.add(sent)
252 |       if config['encoder']['name'].lower() == 'lstm':
253 |         data = output[i, 1:lens[i]-1, :].data
254 |         if use_cuda:
255 |           data = data.cpu()
256 |         data = data.numpy()
257 |       elif config['encoder']['name'].lower() == 'elmo':
258 |         data = output[:, i, 1:lens[i]-1, :].data
259 |         if use_cuda:
260 |           data = data.cpu()
261 |         data = data.numpy()
262 | 
263 |       for (output_format, output_layer) in handlers:
264 |         fout = handlers[output_format, output_layer]
265 |         if output_layer == -1:
266 |           payload = np.average(data, axis=0)
267 |         else:
268 |           payload = data[output_layer]
269 |         if output_format == 'hdf5':
270 |           fout.create_dataset(sent, payload.shape, dtype='float32', data=payload)
271 |         else:
272 |           for word, row in zip(text, payload):
273 |             print('{0}\t{1}'.format(word, '\t'.join(['{0:.8f}'.format(elem) for elem in row])), file=fout)
274 |           print('', file=fout)
275 | 
276 |       cnt += 1
277 |       if cnt % 1000 == 0:
278 |         logger.info('Finished {0} sentences.'.format(cnt))
279 |   for _, handler in handlers.items():
280 |     handler.close()
281 | 
282 | 
283 | if __name__ == "__main__":
284 |   if len(sys.argv) > 1 and sys.argv[1] == 'test':
285 |     test_main()
286 |   else:
287 |     print('Usage: {0} [test] [options]'.format(sys.argv[0]), file=sys.stderr)
288 | 


--------------------------------------------------------------------------------
/elmoformanylangs/biLM.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | from __future__ import print_function
  3 | from __future__ import unicode_literals
  4 | import os
  5 | import errno
  6 | import sys
  7 | import codecs
  8 | import argparse
  9 | import time
 10 | import random
 11 | import logging
 12 | import json
 13 | import torch
 14 | import torch.nn as nn
 15 | import torch.nn.functional as F
 16 | import torch.optim as optim
 17 | from torch.autograd import Variable
 18 | from .modules.elmo import ElmobiLm
 19 | from .modules.lstm import LstmbiLm
 20 | from .modules.token_embedder import ConvTokenEmbedder, LstmTokenEmbedder
 21 | from .modules.embedding_layer import EmbeddingLayer
 22 | from .modules.classify_layer import SoftmaxLayer, CNNSoftmaxLayer, SampledSoftmaxLayer
 23 | from .dataloader import load_embedding
 24 | from .utils import dict2namedtuple
 25 | from collections import Counter
 26 | import numpy as np
 27 | 
 28 | logger = logging.getLogger('elmoformanylangs')
 29 | 
 30 | 
 31 | def divide(data, valid_size):
 32 |   valid_size = min(valid_size, len(data) // 10)
 33 |   random.shuffle(data)
 34 |   return data[valid_size:], data[:valid_size]
 35 | 
 36 | 
 37 | def break_sentence(sentence, max_sent_len):
 38 |   """
 39 |   For example, for a sentence with 70 words, supposing the the `max_sent_len'
 40 |   is 30, break it into 3 sentences.
 41 | 
 42 |   :param sentence: list[str] the sentence
 43 |   :param max_sent_len:
 44 |   :return:
 45 |   """
 46 |   ret = []
 47 |   cur = 0
 48 |   length = len(sentence)
 49 |   while cur < length:
 50 |     if cur + max_sent_len + 5 >= length:
 51 |       ret.append(sentence[cur: length])
 52 |       break
 53 |     ret.append(sentence[cur: min(length, cur + max_sent_len)])
 54 |     cur += max_sent_len
 55 |   return ret
 56 | 
 57 | 
 58 | def read_corpus(path, max_chars=None, max_sent_len=20):
 59 |   """
 60 |   read raw text file
 61 |   :param path: str
 62 |   :param max_chars: int
 63 |   :param max_sent_len: int
 64 |   :return:
 65 |   """
 66 |   data = []
 67 |   with codecs.open(path, 'r', encoding='utf-8') as fin:
 68 |     for line in fin:
 69 |       data.append('<bos>')
 70 |       for token in line.strip().split():
 71 |         if max_chars is not None and len(token) + 2 > max_chars:
 72 |           token = token[:max_chars - 2]
 73 |         data.append(token)
 74 |       data.append('<eos>')
 75 |   dataset = break_sentence(data, max_sent_len)
 76 |   return dataset
 77 | 
 78 | 
 79 | def create_one_batch(x, word2id, char2id, config, oov='<oov>', pad='<pad>', sort=True):
 80 |   """
 81 | 
 82 |   :param x:
 83 |   :param word2id: dict
 84 |   :param char2id: dict
 85 |   :param config:
 86 |   :param oov:
 87 |   :param pad:
 88 |   :param sort:
 89 |   :return:
 90 |   """
 91 |   batch_size = len(x)
 92 |   lst = list(range(batch_size))
 93 |   if sort:
 94 |     lst.sort(key=lambda l: -len(x[l]))
 95 | 
 96 |   x = [x[i] for i in lst]
 97 |   lens = [len(x[i]) for i in lst]
 98 |   max_len = max(lens)
 99 | 
100 |   if word2id is not None:
101 |     oov_id, pad_id = word2id.get(oov, None), word2id.get(pad, None)
102 |     assert oov_id is not None and pad_id is not None
103 |     batch_w = torch.LongTensor(batch_size, max_len).fill_(pad_id)
104 |     for i, x_i in enumerate(x):
105 |       for j, x_ij in enumerate(x_i):
106 |         batch_w[i][j] = word2id.get(x_ij, oov_id)
107 |   else:
108 |     batch_w = None
109 | 
110 |   if char2id is not None:
111 |     bow_id, eow_id, oov_id, pad_id = char2id.get('<eow>', None), char2id.get('<bow>', None), char2id.get(oov, None), char2id.get(pad, None)
112 | 
113 |     assert bow_id is not None and eow_id is not None and oov_id is not None and pad_id is not None
114 | 
115 |     if config['token_embedder']['name'].lower() == 'cnn':
116 |       max_chars = config['token_embedder']['max_characters_per_token']
117 |       assert max([len(w) for i in lst for w in x[i]]) + 2 <= max_chars
118 |     elif config['token_embedder']['name'].lower() == 'lstm':
119 |       max_chars = max([len(w) for i in lst for w in x[i]]) + 2  # counting the <bow> and <eow>
120 | 
121 |     batch_c = torch.LongTensor(batch_size, max_len, max_chars).fill_(pad_id)
122 | 
123 |     for i, x_i in enumerate(x):
124 |       for j, x_ij in enumerate(x_i):
125 |         batch_c[i][j][0] = bow_id
126 |         if x_ij == '<bos>' or x_ij == '<eos>':
127 |           batch_c[i][j][1] = char2id.get(x_ij)
128 |           batch_c[i][j][2] = eow_id
129 |         else:
130 |           for k, c in enumerate(x_ij):
131 |             batch_c[i][j][k + 1] = char2id.get(c, oov_id)
132 |           batch_c[i][j][len(x_ij) + 1] = eow_id
133 |   else:
134 |     batch_c = None
135 | 
136 |   masks = [torch.LongTensor(batch_size, max_len).fill_(0), [], []]
137 | 
138 |   for i, x_i in enumerate(x):
139 |     for j in range(len(x_i)):
140 |       masks[0][i][j] = 1
141 |       if j + 1 < len(x_i):
142 |         masks[1].append(i * max_len + j)
143 |       if j > 0: 
144 |         masks[2].append(i * max_len + j)
145 | 
146 |   assert len(masks[1]) <= batch_size * max_len
147 |   assert len(masks[2]) <= batch_size * max_len
148 | 
149 |   masks[1] = torch.LongTensor(masks[1])
150 |   masks[2] = torch.LongTensor(masks[2])
151 | 
152 |   return batch_w, batch_c, lens, masks
153 | 
154 | 
155 | # shuffle training examples and create mini-batches
156 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=True, sort=True, use_cuda=False):
157 |   """
158 | 
159 |   :param x:
160 |   :param batch_size:
161 |   :param word2id:
162 |   :param char2id:
163 |   :param config:
164 |   :param perm:
165 |   :param shuffle:
166 |   :param sort:
167 |   :param use_cuda:
168 |   :return:
169 |   """
170 |   lst = perm or list(range(len(x)))
171 |   if shuffle:
172 |     random.shuffle(lst)
173 | 
174 |   if sort:
175 |     lst.sort(key=lambda l: -len(x[l]))
176 | 
177 |   x = [x[i] for i in lst]
178 | 
179 |   sum_len = 0.0
180 |   batches_w, batches_c, batches_lens, batches_masks = [], [], [], []
181 |   size = batch_size
182 |   nbatch = (len(x) - 1) // size + 1
183 |   for i in range(nbatch):
184 |     start_id, end_id = i * size, (i + 1) * size
185 |     bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort)
186 |     sum_len += sum(blens)
187 |     batches_w.append(bw)
188 |     batches_c.append(bc)
189 |     batches_lens.append(blens)
190 |     batches_masks.append(bmasks)
191 | 
192 |   if sort:
193 |     perm = list(range(nbatch))
194 |     random.shuffle(perm)
195 |     batches_w = [batches_w[i] for i in perm]
196 |     batches_c = [batches_c[i] for i in perm]
197 |     batches_lens = [batches_lens[i] for i in perm]
198 |     batches_masks = [batches_masks[i] for i in perm]
199 | 
200 |   logger.info("{} batches, avg len: {:.1f}".format(nbatch, sum_len / len(x)))
201 |   return batches_w, batches_c, batches_lens, batches_masks
202 | 
203 | 
204 | class Model(nn.Module):
205 |   def __init__(self, config, word_emb_layer, char_emb_layer, n_class, use_cuda=False):
206 |     super(Model, self).__init__() 
207 |     self.use_cuda = use_cuda
208 |     self.config = config
209 | 
210 |     if config['token_embedder']['name'].lower() == 'cnn':
211 |       self.token_embedder = ConvTokenEmbedder(config, word_emb_layer, char_emb_layer, use_cuda)
212 |     elif config['token_embedder']['name'].lower() == 'lstm':
213 |       self.token_embedder = LstmTokenEmbedder(config, word_emb_layer, char_emb_layer, use_cuda)
214 | 
215 |     if config['encoder']['name'].lower() == 'elmo':
216 |       self.encoder = ElmobiLm(config, use_cuda)
217 |     elif config['encoder']['name'].lower() == 'lstm':
218 |       self.encoder = LstmbiLm(config, use_cuda)
219 | 
220 |     self.output_dim = config['encoder']['projection_dim']
221 |     if config['classifier']['name'].lower() == 'softmax':
222 |       self.classify_layer = SoftmaxLayer(self.output_dim, n_class)
223 |     elif config['classifier']['name'].lower() == 'cnn_softmax':
224 |       self.classify_layer = CNNSoftmaxLayer(self.token_embedder, self.output_dim, n_class,
225 |                                             config['classifier']['n_samples'], config['classifier']['corr_dim'],
226 |                                             use_cuda)
227 |     elif config['classifier']['name'].lower() == 'sampled_softmax':
228 |       self.classify_layer = SampledSoftmaxLayer(self.output_dim, n_class, config['classifier']['n_samples'], use_cuda)
229 | 
230 |   def forward(self, word_inp, chars_inp, mask_package):
231 |     """
232 | 
233 |     :param word_inp:
234 |     :param chars_inp:
235 |     :param mask_package: Tuple[]
236 |     :return:
237 |     """
238 |     classifier_name = self.config['classifier']['name'].lower()
239 | 
240 |     if self.training and classifier_name == 'cnn_softmax' or classifier_name == 'sampled_softmax':
241 |       self.classify_layer.update_negative_samples(word_inp, chars_inp, mask_package[0])
242 |       self.classify_layer.update_embedding_matrix()
243 | 
244 |     token_embedding = self.token_embedder(word_inp, chars_inp, (mask_package[0].size(0), mask_package[0].size(1)))
245 |     token_embedding = F.dropout(token_embedding, self.config['dropout'], self.training)
246 | 
247 |     encoder_name = self.config['encoder']['name'].lower()
248 |     if encoder_name == 'elmo':
249 |       mask = Variable(mask_package[0].cuda()).cuda() if self.use_cuda else Variable(mask_package[0])
250 |       encoder_output = self.encoder(token_embedding, mask)
251 |       encoder_output = encoder_output[1]
252 |       # [batch_size, len, hidden_size]
253 |     elif encoder_name == 'lstm':
254 |       encoder_output = self.encoder(token_embedding)
255 |     else:
256 |       raise ValueError('')
257 | 
258 |     encoder_output = F.dropout(encoder_output, self.config['dropout'], self.training)
259 |     forward, backward = encoder_output.split(self.output_dim, 2)
260 | 
261 |     word_inp = Variable(word_inp)
262 |     if self.use_cuda:
263 |       word_inp = word_inp.cuda()
264 | 
265 |     mask1 = Variable(mask_package[1].cuda()).cuda() if self.use_cuda else Variable(mask_package[1])
266 |     mask2 = Variable(mask_package[2].cuda()).cuda() if self.use_cuda else Variable(mask_package[2])
267 | 
268 |     forward_x = forward.contiguous().view(-1, self.output_dim).index_select(0, mask1)
269 |     forward_y = word_inp.contiguous().view(-1).index_select(0, mask2)
270 | 
271 |     backward_x = backward.contiguous().view(-1, self.output_dim).index_select(0, mask2)
272 |     backward_y = word_inp.contiguous().view(-1).index_select(0, mask1)
273 | 
274 |     return self.classify_layer(forward_x, forward_y), self.classify_layer(backward_x, backward_y)
275 | 
276 |   def save_model(self, path, save_classify_layer):
277 |     torch.save(self.token_embedder.state_dict(), os.path.join(path, 'token_embedder.pkl'))    
278 |     torch.save(self.encoder.state_dict(), os.path.join(path, 'encoder.pkl'))
279 |     if save_classify_layer:
280 |       torch.save(self.classify_layer.state_dict(), os.path.join(path, 'classifier.pkl'))
281 | 
282 |   def load_model(self, path):
283 |     self.token_embedder.load_state_dict(torch.load(os.path.join(path, 'token_embedder.pkl')))
284 |     self.encoder.load_state_dict(torch.load(os.path.join(path, 'encoder.pkl')))
285 |     self.classify_layer.load_state_dict(torch.load(os.path.join(path, 'classifier.pkl')))
286 | 
287 | 
288 | def eval_model(model, valid):
289 |   model.eval()
290 |   if model.config['classifier']['name'].lower() == 'cnn_softmax' or \
291 |       model.config['classifier']['name'].lower() == 'sampled_softmax':
292 |     model.classify_layer.update_embedding_matrix()
293 |   total_loss, total_tag = 0.0, 0
294 |   valid_w, valid_c, valid_lens, valid_masks = valid
295 |   for w, c, lens, masks in zip(valid_w, valid_c, valid_lens, valid_masks):
296 |     loss_forward, loss_backward = model.forward(w, c, masks)
297 |     total_loss += loss_forward.data[0]
298 |     n_tags = sum(lens)
299 |     total_tag += n_tags
300 |   model.train()
301 |   return np.exp(total_loss / total_tag)
302 | 
303 | 
304 | def train_model(epoch, opt, model, optimizer,
305 |                 train, valid, test, best_train, best_valid, test_result):
306 |   """
307 |   Training model for one epoch
308 | 
309 |   :param epoch:
310 |   :param opt:
311 |   :param model:
312 |   :param optimizer:
313 |   :param train:
314 |   :param best_train:
315 |   :param valid:
316 |   :param best_valid:
317 |   :param test:
318 |   :param test_result:
319 |   :return:
320 |   """
321 |   model.train()
322 | 
323 |   total_loss, total_tag = 0.0, 0
324 |   cnt = 0
325 |   start_time = time.time()
326 | 
327 |   train_w, train_c, train_lens, train_masks = train
328 | 
329 |   lst = list(range(len(train_w)))
330 |   random.shuffle(lst)
331 |   
332 |   train_w = [train_w[l] for l in lst]
333 |   train_c = [train_c[l] for l in lst]
334 |   train_lens = [train_lens[l] for l in lst]
335 |   train_masks = [train_masks[l] for l in lst]
336 | 
337 |   for w, c, lens, masks in zip(train_w, train_c, train_lens, train_masks):
338 |     cnt += 1
339 |     model.zero_grad()
340 |     loss_forward, loss_backward = model.forward(w, c, masks)
341 | 
342 |     loss = (loss_forward + loss_backward) / 2.0
343 |     total_loss += loss_forward.data[0]
344 |     n_tags = sum(lens)
345 |     total_tag += n_tags
346 |     loss.backward()
347 | 
348 |     torch.nn.utils.clip_grad_norm(model.parameters(), opt.clip_grad)
349 |     optimizer.step()
350 |     if cnt * opt.batch_size % 1024 == 0:
351 |       logger.info("Epoch={} iter={} lr={:.6f} train_ppl={:.6f} time={:.2f}s".format(
352 |         epoch, cnt, optimizer.param_groups[0]['lr'],
353 |         np.exp(total_loss / total_tag), time.time() - start_time
354 |       ))
355 |       start_time = time.time()
356 | 
357 |     if cnt % opt.eval_steps == 0 or cnt % len(train_w) == 0:
358 |       if valid is None:
359 |         train_ppl = np.exp(total_loss / total_tag)
360 |         logger.info("Epoch={} iter={} lr={:.6f} train_ppl={:.6f}".format(
361 |           epoch, cnt, optimizer.param_groups[0]['lr'], train_ppl))
362 |         if train_ppl < best_train:
363 |           best_train = train_ppl
364 |           logger.info("New record achieved on training dataset!")
365 |           model.save_model(opt.model, opt.save_classify_layer)      
366 |       else:
367 |         valid_ppl = eval_model(model, valid)
368 |         logger.info("Epoch={} iter={} lr={:.6f} valid_ppl={:.6f}".format(
369 |           epoch, cnt, optimizer.param_groups[0]['lr'], valid_ppl))
370 | 
371 |         if valid_ppl < best_valid:
372 |           model.save_model(opt.model, opt.save_classify_layer)
373 |           best_valid = valid_ppl
374 |           logger.info("New record achieved!")
375 | 
376 |           if test is not None:
377 |             test_result = eval_model(model, test)
378 |             logger.info("Epoch={} iter={} lr={:.6f} test_ppl={:.6f}".format(
379 |               epoch, cnt, optimizer.param_groups[0]['lr'], test_result))
380 |   return best_train, best_valid, test_result
381 | 
382 | 
383 | def get_truncated_vocab(dataset, min_count):
384 |   """
385 | 
386 |   :param dataset:
387 |   :param min_count: int
388 |   :return:
389 |   """
390 |   word_count = Counter()
391 |   for sentence in dataset:
392 |     word_count.update(sentence)
393 | 
394 |   word_count = list(word_count.items())
395 |   word_count.sort(key=lambda x: x[1], reverse=True)
396 | 
397 |   i = 0
398 |   for word, count in word_count:
399 |     if count < min_count:
400 |       break
401 |     i += 1
402 | 
403 |   logger.info('Truncated word count: {0}.'.format(sum([count for word, count in word_count[i:]])))
404 |   logger.info('Original vocabulary size: {0}.'.format(len(word_count)))
405 |   return word_count[:i]
406 | 
407 | 
408 | def train():
409 |   cmd = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve')
410 |   cmd.add_argument('--seed', default=1, type=int, help='The random seed.')
411 |   cmd.add_argument('--gpu', default=-1, type=int, help='Use id of gpu, -1 if cpu.')
412 | 
413 |   cmd.add_argument('--train_path', required=True, help='The path to the training file.')
414 |   cmd.add_argument('--valid_path', help='The path to the development file.')
415 |   cmd.add_argument('--test_path', help='The path to the testing file.')
416 | 
417 |   cmd.add_argument('--config_path', required=True, help='the path to the config file.')
418 |   cmd.add_argument("--word_embedding", help="The path to word vectors.")
419 | 
420 |   cmd.add_argument('--optimizer', default='sgd', choices=['sgd', 'adam', 'adagrad'],
421 |                    help='the type of optimizer: valid options=[sgd, adam, adagrad]')
422 |   cmd.add_argument("--lr", type=float, default=0.01, help='the learning rate.')
423 |   cmd.add_argument("--lr_decay", type=float, default=0, help='the learning rate decay.')
424 | 
425 |   cmd.add_argument("--model", required=True, help="path to save model")
426 |   
427 |   cmd.add_argument("--batch_size", "--batch", type=int, default=32, help='the batch size.')
428 |   cmd.add_argument("--max_epoch", type=int, default=100, help='the maximum number of iteration.')
429 |   
430 |   cmd.add_argument("--clip_grad", type=float, default=5, help='the tense of clipped grad.')
431 | 
432 |   cmd.add_argument('--max_sent_len', type=int, default=20, help='maximum sentence length.')
433 | 
434 |   cmd.add_argument('--min_count', type=int, default=5, help='minimum word count.')
435 | 
436 |   cmd.add_argument('--max_vocab_size', type=int, default=150000, help='maximum vocabulary size.')
437 | 
438 |   cmd.add_argument('--save_classify_layer', default=False, action='store_true',
439 |                    help="whether to save the classify layer")
440 | 
441 |   cmd.add_argument('--valid_size', type=int, default=0, help="size of validation dataset when there's no valid.")
442 |   cmd.add_argument('--eval_steps', required=False, type=int, help='report every xx batches.')
443 | 
444 |   opt = cmd.parse_args(sys.argv[2:])
445 | 
446 |   with open(opt.config_path, 'r') as fin:
447 |     config = json.load(fin)
448 | 
449 |   # Dump configurations
450 |   print(opt)
451 |   print(config)
452 | 
453 |   # set seed.
454 |   torch.manual_seed(opt.seed)
455 |   random.seed(opt.seed)
456 |   if opt.gpu >= 0:
457 |     torch.cuda.set_device(opt.gpu)
458 |     if opt.seed > 0:
459 |       torch.cuda.manual_seed(opt.seed)
460 | 
461 |   use_cuda = opt.gpu >= 0 and torch.cuda.is_available()
462 | 
463 |   token_embedder_name = config['token_embedder']['name'].lower()
464 |   token_embedder_max_chars = config['token_embedder'].get('max_characters_per_token', None)
465 |   if token_embedder_name == 'cnn':
466 |     train_data = read_corpus(opt.train_path, token_embedder_max_chars, opt.max_sent_len)
467 |   elif token_embedder_name == 'lstm':
468 |     train_data = read_corpus(opt.train_path, opt.max_sent_len)
469 |   else:
470 |     raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
471 | 
472 |   logger.info('training instance: {}, training tokens: {}.'.format(len(train_data),
473 |                                                                     sum([len(s) - 1 for s in train_data])))
474 | 
475 |   if opt.valid_path is not None:
476 |     if token_embedder_name == 'cnn':
477 |       valid_data = read_corpus(opt.valid_path, token_embedder_max_chars, opt.max_sent_len)
478 |     elif token_embedder_name == 'lstm':
479 |       valid_data = read_corpus(opt.valid_path, opt.max_sent_len)
480 |     else:
481 |       raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
482 |     logger.info('valid instance: {}, valid tokens: {}.'.format(len(valid_data),
483 |                                                                 sum([len(s) - 1 for s in valid_data])))
484 |   elif opt.valid_size > 0:
485 |     train_data, valid_data = divide(train_data, opt.valid_size)
486 |     logger.info('training instance: {}, training tokens after division: {}.'.format(
487 |       len(train_data), sum([len(s) - 1 for s in train_data])))
488 |     logger.info('valid instance: {}, valid tokens: {}.'.format(
489 |       len(valid_data), sum([len(s) - 1 for s in valid_data])))
490 |   else:
491 |     valid_data = None
492 | 
493 |   if opt.test_path is not None:
494 |     if token_embedder_name == 'cnn':
495 |       test_data = read_corpus(opt.test_path, token_embedder_max_chars, opt.max_sent_len)
496 |     elif token_embedder_name == 'lstm':
497 |       test_data = read_corpus(opt.test_path, opt.max_sent_len)
498 |     else:
499 |       raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
500 |     logger.info('testing instance: {}, testing tokens: {}.'.format(
501 |       len(test_data), sum([len(s) - 1 for s in test_data])))
502 |   else:
503 |     test_data = None
504 | 
505 |   if opt.word_embedding is not None:
506 |     embs = load_embedding(opt.word_embedding)
507 |     word_lexicon = {word: i for i, word in enumerate(embs[0])}  
508 |   else:
509 |     embs = None
510 |     word_lexicon = {}
511 | 
512 |   # Maintain the vocabulary. vocabulary is used in either WordEmbeddingInput or softmax classification
513 |   vocab = get_truncated_vocab(train_data, opt.min_count)
514 | 
515 |   # Ensure index of '<oov>' is 0
516 |   for special_word in ['<oov>', '<bos>', '<eos>',  '<pad>']:
517 |     if special_word not in word_lexicon:
518 |       word_lexicon[special_word] = len(word_lexicon)
519 | 
520 |   for word, _ in vocab:
521 |     if word not in word_lexicon:
522 |       word_lexicon[word] = len(word_lexicon)
523 | 
524 |   # Word Embedding
525 |   if config['token_embedder']['word_dim'] > 0:
526 |     word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=embs)
527 |     logger.info('Word embedding size: {0}'.format(len(word_emb_layer.word2id)))
528 |   else:
529 |     word_emb_layer = None
530 |     logger.info('Vocabulary size: {0}'.format(len(word_lexicon)))
531 | 
532 |   # Character Lexicon
533 |   if config['token_embedder']['char_dim'] > 0:
534 |     char_lexicon = {}
535 |     for sentence in train_data:
536 |       for word in sentence:
537 |         for ch in word:
538 |           if ch not in char_lexicon:
539 |             char_lexicon[ch] = len(char_lexicon)
540 | 
541 |     for special_char in ['<bos>', '<eos>', '<oov>', '<pad>', '<bow>', '<eow>']:
542 |       if special_char not in char_lexicon:
543 |         char_lexicon[special_char] = len(char_lexicon)
544 | 
545 |     char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False)
546 |     logger.info('Char embedding size: {0}'.format(len(char_emb_layer.word2id)))
547 |   else:
548 |     char_lexicon = None
549 |     char_emb_layer = None
550 | 
551 |   train = create_batches(
552 |     train_data, opt.batch_size, word_lexicon, char_lexicon, config, use_cuda=use_cuda)
553 | 
554 |   if opt.eval_steps is None:
555 |     opt.eval_steps = len(train[0])
556 |   logger.info('Evaluate every {0} batches.'.format(opt.eval_steps))
557 | 
558 |   if valid_data is not None:
559 |     valid = create_batches(
560 |       valid_data, opt.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda)
561 |   else:
562 |     valid = None
563 | 
564 |   if test_data is not None:
565 |     test = create_batches(
566 |       test_data, opt.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda)
567 |   else:
568 |     test = None
569 | 
570 |   label_to_ix = word_lexicon
571 |   logger.info('vocab size: {0}'.format(len(label_to_ix)))
572 |   
573 |   nclasses = len(label_to_ix)
574 | 
575 |   model = Model(config, word_emb_layer, char_emb_layer, nclasses, use_cuda)
576 |   logger.info(str(model))
577 |   if use_cuda:
578 |     model = model.cuda()
579 | 
580 |   need_grad = lambda x: x.requires_grad
581 |   if opt.optimizer.lower() == 'adam':
582 |     optimizer = optim.Adam(filter(need_grad, model.parameters()), lr=opt.lr)
583 |   elif opt.optimizer.lower() == 'sgd':
584 |     optimizer = optim.SGD(filter(need_grad, model.parameters()), lr=opt.lr)
585 |   elif opt.optimizer.lower() == 'adagrad':
586 |     optimizer = optim.Adagrad(filter(need_grad, model.parameters()), lr=opt.lr)
587 |   else:
588 |     raise ValueError('Unknown optimizer {}'.format(opt.optimizer.lower()))
589 | 
590 |   try:
591 |     os.makedirs(opt.model)
592 |   except OSError as exception:
593 |     if exception.errno != errno.EEXIST:
594 |       raise
595 | 
596 |   if config['token_embedder']['char_dim'] > 0:
597 |     with codecs.open(os.path.join(opt.model, 'char.dic'), 'w', encoding='utf-8') as fpo:
598 |       for ch, i in char_emb_layer.word2id.items():
599 |         print('{0}\t{1}'.format(ch, i), file=fpo)
600 | 
601 |   with codecs.open(os.path.join(opt.model, 'word.dic'), 'w', encoding='utf-8') as fpo:
602 |     for w, i in word_lexicon.items():
603 |       print('{0}\t{1}'.format(w, i), file=fpo)
604 | 
605 |   json.dump(vars(opt), codecs.open(os.path.join(opt.model, 'config.json'), 'w', encoding='utf-8'))
606 | 
607 |   best_train = 1e+8
608 |   best_valid = 1e+8
609 |   test_result = 1e+8
610 | 
611 |   for epoch in range(opt.max_epoch):
612 |     best_train, best_valid, test_result = train_model(epoch, opt, model, optimizer,
613 |                                                       train, valid, test, best_train, best_valid, test_result)
614 |     if opt.lr_decay > 0:
615 |       optimizer.param_groups[0]['lr'] *= opt.lr_decay
616 | 
617 |   if valid_data is None:
618 |     logger.info("best train ppl: {:.6f}.".format(best_train))
619 |   elif test_data is None:
620 |     logger.info("best train ppl: {:.6f}, best valid ppl: {:.6f}.".format(best_train, best_valid))
621 |   else:
622 |     logger.info("best train ppl: {:.6f}, best valid ppl: {:.6f}, test ppl: {:.6f}.".format(best_train, best_valid, test_result))
623 | 
624 | 
625 | def test():
626 |   cmd = argparse.ArgumentParser('The testing components of')
627 |   cmd.add_argument('--gpu', default=-1, type=int, help='use id of gpu, -1 if cpu.')
628 |   cmd.add_argument("--input", help="the path to the raw text file.")
629 |   cmd.add_argument("--model", required=True, help="path to save model")
630 |   cmd.add_argument("--batch_size", "--batch", type=int, default=1, help='the batch size.')
631 |   args = cmd.parse_args(sys.argv[2:])
632 | 
633 |   if args.gpu >= 0:
634 |     torch.cuda.set_device(args.gpu)
635 |   use_cuda = args.gpu >= 0 and torch.cuda.is_available()
636 |   
637 |   args2 = dict2namedtuple(json.load(codecs.open(os.path.join(args.model, 'config.json'), 'r', encoding='utf-8')))
638 | 
639 |   with open(args2.config_path, 'r') as fin:
640 |     config = json.load(fin)
641 | 
642 |   if config['token_embedder']['char_dim'] > 0:
643 |     char_lexicon = {}
644 |     with codecs.open(os.path.join(args.model, 'char.dic'), 'r', encoding='utf-8') as fpi:
645 |       for line in fpi:
646 |         tokens = line.strip().split('\t')
647 |         if len(tokens) == 1:
648 |           tokens.insert(0, '\u3000')
649 |         token, i = tokens
650 |         char_lexicon[token] = int(i)
651 |     char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False)
652 |     logger.info('char embedding size: ' + str(len(char_emb_layer.word2id)))
653 |   else:
654 |     char_lexicon = None
655 |     char_emb_layer = None
656 | 
657 |   word_lexicon = {}
658 |   with codecs.open(os.path.join(args.model, 'word.dic'), 'r', encoding='utf-8') as fpi:
659 |     for line in fpi:
660 |       tokens = line.strip().split('\t')
661 |       if len(tokens) == 1:
662 |         tokens.insert(0, '\u3000')
663 |       token, i = tokens
664 |       word_lexicon[token] = int(i)
665 | 
666 |   if config['token_embedder']['word_dim'] > 0:
667 |     word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=None)
668 |     logger.info('word embedding size: ' + str(len(word_emb_layer.word2id)))
669 |   else:
670 |     word_emb_layer = None
671 |   
672 |   model = Model(config, word_emb_layer, char_emb_layer, len(word_lexicon), use_cuda)
673 | 
674 |   if use_cuda:
675 |     model.cuda()
676 | 
677 |   logger.info(str(model))
678 |   model.load_model(args.model)
679 |   if config['token_embedder']['name'].lower() == 'cnn':
680 |     test = read_corpus(args.input, config['token_embedder']['max_characters_per_token'], max_sent_len=10000)
681 |   elif config['token_embedder']['name'].lower() == 'lstm':
682 |     test = read_corpus(args.input, max_sent_len=10000)
683 |   else:
684 |     raise ValueError('')
685 | 
686 |   test_w, test_c, test_lens, test_masks = create_batches(
687 |     test, args.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda)
688 | 
689 |   test_result = eval_model(model, (test_w, test_c, test_lens, test_masks))
690 | 
691 |   logger.info("test_ppl={:.6f}".format(test_result))
692 | 
693 | 
694 | if __name__ == "__main__":
695 |   if len(sys.argv) > 1 and sys.argv[1] == 'train':
696 |     train()
697 |   elif len(sys.argv) > 1 and sys.argv[1] == 'test':
698 |     test()
699 |   else:
700 |     print('Usage: {0} [train|test] [options]'.format(sys.argv[0]), file=sys.stderr)
701 | 


--------------------------------------------------------------------------------
/elmoformanylangs/configs/cnn_0_100_512_4096_sample.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"encoder": {
 3 | 		"name": "elmo",
 4 | 		"projection_dim": 512, 
 5 | 		"cell_clip": 3, 
 6 | 		"proj_clip": 3,
 7 | 		"dim": 4096,
 8 | 		"n_layers": 2
 9 |   	},
10 | 
11 |  	"token_embedder": {
12 |  		"name": "cnn",
13 |  		"activation": "relu",
14 |  		"filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
15 |  		"n_highway": 2, 
16 |  		"word_dim": 100,
17 |  		"char_dim": 50,
18 | 		"max_characters_per_token": 50 		
19 |  	},
20 | 	
21 | 	"classifier": {
22 | 		"name": "sampled_softmax",
23 | 		"n_samples": 8192
24 | 	},
25 | 	"dropout": 0.1
26 | }
27 | 


--------------------------------------------------------------------------------
/elmoformanylangs/configs/cnn_50_100_512_4096_sample.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"encoder": {
 3 | 		"name": "elmo",
 4 | 		"projection_dim": 512, 
 5 | 		"cell_clip": 3, 
 6 | 		"proj_clip": 3,
 7 | 		"dim": 4096,
 8 | 		"n_layers": 2
 9 |   	},
10 | 
11 |  	"token_embedder": {
12 |  		"name": "cnn",
13 |  		"activation": "relu",
14 |  		"filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
15 |  		"n_highway": 2, 
16 |  		"word_dim": 100,
17 |  		"char_dim": 50,
18 | 		"max_characters_per_token": 50 		
19 |  	},
20 | 	
21 | 	"classifier": {
22 | 		"name": "sampled_softmax",
23 | 		"n_samples": 8192
24 | 	},
25 | 	"dropout": 0.1
26 | }
27 | 


--------------------------------------------------------------------------------
/elmoformanylangs/dataloader.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from __future__ import unicode_literals
 3 | import codecs
 4 | import numpy as np
 5 | 
 6 | 
 7 | def pad(sequences, pad_token='<pad>', pad_left=False):
 8 |   """
 9 |   input sequences is a list of text sequence [[str]]
10 |   pad each text sequence to the length of the longest
11 | 
12 |   :param sequences:
13 |   :param pad_token:
14 |   :param pad_left:
15 |   :return:
16 |   """
17 |   # max_len = max(5,max(len(seq) for seq in sequences))
18 |   max_len = max(len(seq) for seq in sequences)
19 |   if pad_left:
20 |     return [[pad_token]*(max_len-len(seq)) + seq for seq in sequences]
21 |   return [seq + [pad_token]*(max_len-len(seq)) for seq in sequences]
22 | 
23 | 
24 | def load_embedding_npz(path):
25 |   data = np.load(path)
26 |   return [str(w) for w in data['words']], data['vals']
27 | 
28 | 
29 | def load_embedding_txt(path):
30 |   words = []
31 |   vals = []
32 |   with codecs.open(path, 'r', encoding='utf-8') as fin:
33 |     fin.readline()
34 |     for line in fin:
35 |       line = line.strip()
36 |       if line:
37 |         parts = line.split()
38 |         words.append(parts[0])
39 |         vals += [float(x) for x in parts[1:]]  # equal to append
40 |   return words, np.asarray(vals).reshape(len(words), -1)  # reshape
41 | 
42 | 
43 | def load_embedding(path):
44 |   if path.endswith(".npz"):
45 |     return load_embedding_npz(path)
46 |   else:
47 |     return load_embedding_txt(path)
48 | 


--------------------------------------------------------------------------------
/elmoformanylangs/elmo.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | from __future__ import print_function
  3 | from __future__ import unicode_literals
  4 | import os
  5 | import codecs
  6 | import random
  7 | import logging
  8 | import json
  9 | import torch
 10 | from .modules.embedding_layer import EmbeddingLayer
 11 | from .utils import dict2namedtuple
 12 | from .frontend import create_one_batch
 13 | from .frontend import Model
 14 | import numpy as np
 15 | 
 16 | logger = logging.getLogger('elmoformanylangs')
 17 | 
 18 | 
 19 | def read_list(sents, max_chars=None):
 20 |     """
 21 |     read raw text file. The format of the input is like, one sentence per line
 22 |     words are separated by '\t'
 23 | 
 24 |     :param path:
 25 |     :param max_chars: int, the number of maximum characters in a word, this
 26 |       parameter is used when the model is configured with CNN word encoder.
 27 |     :return:
 28 |     """
 29 |     dataset = []
 30 |     textset = []
 31 |     for sent in sents:
 32 |         data = ['<bos>']
 33 |         text = []
 34 |         for token in sent:
 35 |             text.append(token)
 36 |             if max_chars is not None and len(token) + 2 > max_chars:
 37 |                 token = token[:max_chars - 2]
 38 |             data.append(token)
 39 |         data.append('<eos>')
 40 |         dataset.append(data)
 41 |         textset.append(text)
 42 |     return dataset, textset
 43 | 
 44 | 
 45 | def recover(li, ind):
 46 |     # li[piv], ind = torch.sort(li[piv], dim=0, descending=(not unsort))
 47 |     dummy = list(range(len(ind)))
 48 |     dummy.sort(key=lambda l: ind[l])
 49 |     li = [li[i] for i in dummy]
 50 |     return li
 51 | 
 52 | 
 53 | # shuffle training examples and create mini-batches
 54 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=False, sort=True, text=None):
 55 |     ind = list(range(len(x)))
 56 |     lst = perm or list(range(len(x)))
 57 |     if shuffle:
 58 |         random.shuffle(lst)
 59 | 
 60 |     if sort:
 61 |         lst.sort(key=lambda l: -len(x[l]))
 62 | 
 63 |     x = [x[i] for i in lst]
 64 |     ind = [ind[i] for i in lst]
 65 |     if text is not None:
 66 |         text = [text[i] for i in lst]
 67 | 
 68 |     sum_len = 0.0
 69 |     batches_w, batches_c, batches_lens, batches_masks, batches_text, batches_ind = [], [], [], [], [], []
 70 |     size = batch_size
 71 |     nbatch = (len(x) - 1) // size + 1
 72 |     for i in range(nbatch):
 73 |         start_id, end_id = i * size, (i + 1) * size
 74 |         bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort)
 75 |         sum_len += sum(blens)
 76 |         batches_w.append(bw)
 77 |         batches_c.append(bc)
 78 |         batches_lens.append(blens)
 79 |         batches_masks.append(bmasks)
 80 |         batches_ind.append(ind[start_id: end_id])
 81 |         if text is not None:
 82 |             batches_text.append(text[start_id: end_id])
 83 | 
 84 |     if sort:
 85 |         perm = list(range(nbatch))
 86 |         random.shuffle(perm)
 87 |         batches_w = [batches_w[i] for i in perm]
 88 |         batches_c = [batches_c[i] for i in perm]
 89 |         batches_lens = [batches_lens[i] for i in perm]
 90 |         batches_masks = [batches_masks[i] for i in perm]
 91 |         batches_ind = [batches_ind[i] for i in perm]
 92 |         if text is not None:
 93 |             batches_text = [batches_text[i] for i in perm]
 94 | 
 95 |     logger.info("{} batches, avg len: {:.1f}".format(
 96 |         nbatch, sum_len / len(x)))
 97 |     recover_ind = [item for sublist in batches_ind for item in sublist]
 98 |     if text is not None:
 99 |         return batches_w, batches_c, batches_lens, batches_masks, batches_text, recover_ind
100 |     return batches_w, batches_c, batches_lens, batches_masks, recover_ind
101 | 
102 | 
103 | class Embedder(object):
104 |     def __init__(self, model_dir, batch_size=64):
105 |         self.model_dir = model_dir
106 |         self.model, self.config = self.get_model()
107 |         self.batch_size = batch_size
108 | 
109 |     def get_model(self):
110 |         # torch.cuda.set_device(1)
111 |         self.use_cuda = torch.cuda.is_available()
112 |         # load the model configurations
113 |         args2 = dict2namedtuple(json.load(codecs.open(
114 |             os.path.join(self.model_dir, 'config.json'), 'r', encoding='utf-8')))
115 | 
116 |         config_path = os.path.join(self.model_dir, args2.config_path)
117 |         # Some of the available models may have the config in the
118 |         # model dir, but the path given in the config directory was an
119 |         # absolute path.
120 |         if not os.path.exists(config_path):
121 |             config_path = os.path.join(self.model_dir,
122 |                                        os.path.split(config_path)[1])
123 |             logger.warning("Could not find config.  Trying " + config_path)
124 |         # In many cases, such as the publicly available English model,
125 |         # the config is one of the default provided configs in
126 |         # elmoformanylangs/configs
127 |         if not os.path.exists(config_path):
128 |             config_path = os.path.join(os.path.split(__file__)[0], "configs",
129 |                                        os.path.split(config_path)[1])
130 |             logger.warning("Could not find config.  Trying " + config_path)
131 | 
132 |         if not os.path.exists(config_path):
133 |             raise FileNotFoundError("Could not find the model config in either the model directory "
134 |                                     "or the default configs.  Path in config file: %s" % args2.config_path)
135 | 
136 |         with open(config_path, 'r') as fin:
137 |             config = json.load(fin)
138 | 
139 |         # For the model trained with character-based word encoder.
140 |         if config['token_embedder']['char_dim'] > 0:
141 |             self.char_lexicon = {}
142 |             with codecs.open(os.path.join(self.model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi:
143 |                 for line in fpi:
144 |                     tokens = line.strip().split('\t')
145 |                     if len(tokens) == 1:
146 |                         tokens.insert(0, '\u3000')
147 |                     token, i = tokens
148 |                     self.char_lexicon[token] = int(i)
149 |             char_emb_layer = EmbeddingLayer(
150 |                 config['token_embedder']['char_dim'], self.char_lexicon, fix_emb=False, embs=None)
151 |             logger.info('char embedding size: ' +
152 |                         str(len(char_emb_layer.word2id)))
153 |         else:
154 |             self.char_lexicon = None
155 |             char_emb_layer = None
156 | 
157 |         # For the model trained with word form word encoder.
158 |         if config['token_embedder']['word_dim'] > 0:
159 |             self.word_lexicon = {}
160 |             with codecs.open(os.path.join(self.model_dir, 'word.dic'), 'r', encoding='utf-8') as fpi:
161 |                 for line in fpi:
162 |                     tokens = line.strip().split('\t')
163 |                     if len(tokens) == 1:
164 |                         tokens.insert(0, '\u3000')
165 |                     token, i = tokens
166 |                     self.word_lexicon[token] = int(i)
167 |             word_emb_layer = EmbeddingLayer(
168 |                 config['token_embedder']['word_dim'], self.word_lexicon, fix_emb=False, embs=None)
169 |             logger.info('word embedding size: ' +
170 |                         str(len(word_emb_layer.word2id)))
171 |         else:
172 |             self.word_lexicon = None
173 |             word_emb_layer = None
174 | 
175 |         # instantiate the model
176 |         model = Model(config, word_emb_layer, char_emb_layer, self.use_cuda)
177 | 
178 |         if self.use_cuda:
179 |             model.cuda()
180 | 
181 |         logger.info(str(model))
182 |         model.load_model(self.model_dir)
183 | 
184 |         # read test data according to input format
185 | 
186 |         # configure the model to evaluation mode.
187 |         model.eval()
188 |         return model, config
189 | 
190 |     def sents2elmo(self, sents, output_layer=-1):
191 |         read_function = read_list
192 | 
193 |         if self.config['token_embedder']['name'].lower() == 'cnn':
194 |             test, text = read_function(sents, self.config['token_embedder']['max_characters_per_token'])
195 |         else:
196 |             test, text = read_function(sents)
197 | 
198 |         # create test batches from the input data.
199 |         test_w, test_c, test_lens, test_masks, test_text, recover_ind = create_batches(
200 |             test, self.batch_size, self.word_lexicon, self.char_lexicon, self.config, text=text)
201 | 
202 |         cnt = 0
203 | 
204 |         after_elmo = []
205 |         for w, c, lens, masks, texts in zip(test_w, test_c, test_lens, test_masks, test_text):
206 |             output = self.model.forward(w, c, masks)
207 |             for i, text in enumerate(texts):
208 | 
209 |                 if self.config['encoder']['name'].lower() == 'lstm':
210 |                     data = output[i, 1:lens[i]-1, :].data
211 |                     if self.use_cuda:
212 |                         data = data.cpu()
213 |                     data = data.numpy()
214 |                 elif self.config['encoder']['name'].lower() == 'elmo':
215 |                     data = output[:, i, 1:lens[i]-1, :].data
216 |                     if self.use_cuda:
217 |                         data = data.cpu()
218 |                     data = data.numpy()
219 | 
220 |                 if output_layer == -1:
221 |                     payload = np.average(data, axis=0)
222 |                 elif output_layer == -2:
223 |                     payload = data
224 |                 else:
225 |                     payload = data[output_layer]
226 |                 after_elmo.append(payload)
227 | 
228 |                 cnt += 1
229 |                 if cnt % 1000 == 0:
230 |                     logger.info('Finished {0} sentences.'.format(cnt))
231 | 
232 |         after_elmo = recover(after_elmo, recover_ind)
233 |         return after_elmo
234 | 


--------------------------------------------------------------------------------
/elmoformanylangs/frontend.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import os
  3 | import random
  4 | import torch
  5 | import torch.nn as nn
  6 | import logging
  7 | from torch.autograd import Variable
  8 | from .modules.elmo import ElmobiLm
  9 | from .modules.lstm import LstmbiLm
 10 | from .modules.token_embedder import ConvTokenEmbedder, LstmTokenEmbedder
 11 | 
 12 | logger = logging.getLogger('elmoformanylangs')
 13 | 
 14 | def create_one_batch(x, word2id, char2id, config, oov='<oov>', pad='<pad>', sort=True):
 15 |   """
 16 |   Create one batch of input.
 17 | 
 18 |   :param x: List[List[str]]
 19 |   :param word2id: Dict | None
 20 |   :param char2id: Dict | None
 21 |   :param config: Dict
 22 |   :param oov: str, the form of OOV token.
 23 |   :param pad: str, the form of padding token.
 24 |   :param sort: bool, specify whether sorting the sentences by their lengths.
 25 |   :return:
 26 |   """
 27 |   batch_size = len(x)
 28 |   # lst represents the order of sentences
 29 |   lst = list(range(batch_size))
 30 |   if sort:
 31 |     lst.sort(key=lambda l: -len(x[l]))
 32 | 
 33 |   # shuffle the sentences by
 34 |   x = [x[i] for i in lst]
 35 |   lens = [len(x[i]) for i in lst]
 36 |   max_len = max(lens)
 37 | 
 38 |   # get a batch of word id whose size is (batch x max_len)
 39 |   if word2id is not None:
 40 |     oov_id, pad_id = word2id.get(oov, None), word2id.get(pad, None)
 41 |     assert oov_id is not None and pad_id is not None
 42 |     batch_w = torch.LongTensor(batch_size, max_len).fill_(pad_id)
 43 |     for i, x_i in enumerate(x):
 44 |       for j, x_ij in enumerate(x_i):
 45 |         batch_w[i][j] = word2id.get(x_ij, oov_id)
 46 |   else:
 47 |     batch_w = None
 48 | 
 49 |   # get a batch of character id whose size is (batch x max_len x max_chars)
 50 |   if char2id is not None:
 51 |     bow_id, eow_id, oov_id, pad_id = [char2id.get(key, None) for key in ('<eow>', '<bow>', oov, pad)]
 52 | 
 53 |     assert bow_id is not None and eow_id is not None and oov_id is not None and pad_id is not None
 54 | 
 55 |     if config['token_embedder']['name'].lower() == 'cnn':
 56 |       max_chars = config['token_embedder']['max_characters_per_token']
 57 |       assert max([len(w) for i in lst for w in x[i]]) + 2 <= max_chars
 58 |     elif config['token_embedder']['name'].lower() == 'lstm':
 59 |       # counting the <bow> and <eow>
 60 |       max_chars = max([len(w) for i in lst for w in x[i]]) + 2
 61 |     else:
 62 |       raise ValueError('Unknown token_embedder: {0}'.format(config['token_embedder']['name']))
 63 | 
 64 |     batch_c = torch.LongTensor(batch_size, max_len, max_chars).fill_(pad_id)
 65 | 
 66 |     for i, x_i in enumerate(x):
 67 |       for j, x_ij in enumerate(x_i):
 68 |         batch_c[i][j][0] = bow_id
 69 |         if x_ij == '<bos>' or x_ij == '<eos>':
 70 |           batch_c[i][j][1] = char2id.get(x_ij)
 71 |           batch_c[i][j][2] = eow_id
 72 |         else:
 73 |           for k, c in enumerate(x_ij):
 74 |             batch_c[i][j][k + 1] = char2id.get(c, oov_id)
 75 |           batch_c[i][j][len(x_ij) + 1] = eow_id
 76 |   else:
 77 |     batch_c = None
 78 | 
 79 |   # mask[0] is the matrix (batch x max_len) indicating whether
 80 |   # there is an id is valid (not a padding) in this batch.
 81 |   # mask[1] stores the flattened ids indicating whether there is a valid
 82 |   # previous token
 83 |   # mask[2] stores the flattened ids indicating whether there is a valid
 84 |   # next token
 85 |   masks = [torch.LongTensor(batch_size, max_len).fill_(0), [], []]
 86 | 
 87 |   for i, x_i in enumerate(x):
 88 |     for j in range(len(x_i)):
 89 |       masks[0][i][j] = 1
 90 |       if j + 1 < len(x_i):
 91 |         masks[1].append(i * max_len + j)
 92 |       if j > 0:
 93 |         masks[2].append(i * max_len + j)
 94 | 
 95 |   assert len(masks[1]) <= batch_size * max_len
 96 |   assert len(masks[2]) <= batch_size * max_len
 97 | 
 98 |   masks[1] = torch.LongTensor(masks[1])
 99 |   masks[2] = torch.LongTensor(masks[2])
100 | 
101 |   return batch_w, batch_c, lens, masks
102 | 
103 | 
104 | # shuffle training examples and create mini-batches
105 | def create_batches(x, batch_size, word2id, char2id, config, perm=None, shuffle=True, sort=True, text=None):
106 |   """
107 | 
108 |   :param x: List[List[str]]
109 |   :param batch_size:
110 |   :param word2id:
111 |   :param char2id:
112 |   :param config:
113 |   :param perm:
114 |   :param shuffle:
115 |   :param sort:
116 |   :param text:
117 |   :return:
118 |   """
119 |   lst = perm or list(range(len(x)))
120 |   if shuffle:
121 |     random.shuffle(lst)
122 | 
123 |   if sort:
124 |     lst.sort(key=lambda l: -len(x[l]))
125 | 
126 |   x = [x[i] for i in lst]
127 |   if text is not None:
128 |     text = [text[i] for i in lst]
129 | 
130 |   sum_len = 0.0
131 |   batches_w, batches_c, batches_lens, batches_masks, batches_text = [], [], [], [], []
132 |   size = batch_size
133 |   nbatch = (len(x) - 1) // size + 1
134 |   for i in range(nbatch):
135 |     start_id, end_id = i * size, (i + 1) * size
136 |     bw, bc, blens, bmasks = create_one_batch(x[start_id: end_id], word2id, char2id, config, sort=sort)
137 |     sum_len += sum(blens)
138 |     batches_w.append(bw)
139 |     batches_c.append(bc)
140 |     batches_lens.append(blens)
141 |     batches_masks.append(bmasks)
142 |     if text is not None:
143 |       batches_text.append(text[start_id: end_id])
144 | 
145 |   if sort:
146 |     perm = list(range(nbatch))
147 |     random.shuffle(perm)
148 |     batches_w = [batches_w[i] for i in perm]
149 |     batches_c = [batches_c[i] for i in perm]
150 |     batches_lens = [batches_lens[i] for i in perm]
151 |     batches_masks = [batches_masks[i] for i in perm]
152 |     if text is not None:
153 |       batches_text = [batches_text[i] for i in perm]
154 | 
155 |   logger.info("{} batches, avg len: {:.1f}".format(nbatch, sum_len / len(x)))
156 |   if text is not None:
157 |     return batches_w, batches_c, batches_lens, batches_masks, batches_text
158 |   return batches_w, batches_c, batches_lens, batches_masks
159 | 
160 | 
161 | class Model(nn.Module):
162 |   def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda=False):
163 |     super(Model, self).__init__()
164 |     self.use_cuda = use_cuda
165 |     self.config = config
166 | 
167 |     if config['token_embedder']['name'].lower() == 'cnn':
168 |       self.token_embedder = ConvTokenEmbedder(
169 |         config, word_emb_layer, char_emb_layer, use_cuda)
170 |     elif config['token_embedder']['name'].lower() == 'lstm':
171 |       self.token_embedder = LstmTokenEmbedder(
172 |         config, word_emb_layer, char_emb_layer, use_cuda)
173 | 
174 |     if config['encoder']['name'].lower() == 'elmo':
175 |       self.encoder = ElmobiLm(config, use_cuda)
176 |     elif config['encoder']['name'].lower() == 'lstm':
177 |       self.encoder = LstmbiLm(config, use_cuda)
178 | 
179 |     self.output_dim = config['encoder']['projection_dim']
180 | 
181 |   def forward(self, word_inp, chars_package, mask_package):
182 |     """
183 | 
184 |     :param word_inp:
185 |     :param chars_package:
186 |     :param mask_package:
187 |     :return:
188 |     """
189 |     token_embedding = self.token_embedder(word_inp, chars_package, (mask_package[0].size(0), mask_package[0].size(1)))
190 |     if self.config['encoder']['name'] == 'elmo':
191 |       mask = Variable(mask_package[0]).cuda() if self.use_cuda else Variable(mask_package[0])
192 |       encoder_output = self.encoder(token_embedding, mask)
193 |       sz = encoder_output.size()
194 |       token_embedding = torch.cat(
195 |         [token_embedding, token_embedding], dim=2).view(1, sz[1], sz[2], sz[3])
196 |       encoder_output = torch.cat(
197 |         [token_embedding, encoder_output], dim=0)
198 |     elif self.config['encoder']['name'] == 'lstm':
199 |       encoder_output = self.encoder(token_embedding)
200 |     else:
201 |       raise ValueError('Unknown encoder: {0}'.format(self.config['encoder']['name']))
202 | 
203 |     return encoder_output
204 | 
205 |   def load_model(self, path):
206 |     self.token_embedder.load_state_dict(torch.load(os.path.join(path, 'token_embedder.pkl'),
207 |                                                    map_location=lambda storage, loc: storage))
208 |     self.encoder.load_state_dict(torch.load(os.path.join(path, 'encoder.pkl'),
209 |                                             map_location=lambda storage, loc: storage))
210 | 


--------------------------------------------------------------------------------
/elmoformanylangs/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/ELMoForManyLangs/b3de5f1dc1ac13638a930b49c41e1f1e0e185ca1/elmoformanylangs/modules/__init__.py


--------------------------------------------------------------------------------
/elmoformanylangs/modules/classify_layer.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import torch
  3 | import torch.nn as nn
  4 | import torch.nn.functional as F
  5 | from torch.autograd import Variable
  6 | 
  7 | 
  8 | class SoftmaxLayer(nn.Module):
  9 |   """ Naive softmax-layer """
 10 |   def __init__(self, output_dim, n_class):
 11 |     """
 12 | 
 13 |     :param output_dim: int
 14 |     :param n_class: int
 15 |     """
 16 |     super(SoftmaxLayer, self).__init__()
 17 |     self.hidden2tag = nn.Linear(output_dim, n_class)
 18 |     self.criterion = nn.CrossEntropyLoss(size_average=False)
 19 | 
 20 |   def forward(self, x, y):
 21 |     """
 22 | 
 23 |     :param x: torch.Tensor
 24 |     :param y: torch.Tensor
 25 |     :return:
 26 |     """
 27 |     tag_scores = self.hidden2tag(x)
 28 |     return self.criterion(tag_scores, y)
 29 | 
 30 | 
 31 | class SampledSoftmaxLayer(nn.Module):
 32 |   """
 33 | 
 34 |   """
 35 |   def __init__(self, output_dim, n_class, n_samples, use_cuda):
 36 |     """
 37 | 
 38 |     :param output_dim:
 39 |     :param n_class:
 40 |     :param n_samples:
 41 |     :param use_cuda:
 42 |     """
 43 |     super(SampledSoftmaxLayer, self).__init__()
 44 |     self.n_samples = n_samples
 45 |     self.n_class = n_class
 46 |     self.use_cuda = use_cuda
 47 |     self.criterion = nn.CrossEntropyLoss(size_average=False)
 48 |     self.negative_samples = []
 49 |     self.word_to_column = {0: 0}
 50 | 
 51 |     self.all_word = []
 52 |     self.all_word_to_column = {0: 0}
 53 | 
 54 |     self.column_emb = nn.Embedding(n_class, output_dim)
 55 |     self.column_emb.weight.data.uniform_(-0.25, 0.25)
 56 | 
 57 |     self.column_bias = nn.Embedding(n_class, 1)
 58 |     self.column_bias.weight.data.uniform_(-0.25, 0.25)
 59 | 
 60 |     self.oov_column = nn.Parameter(torch.Tensor(output_dim, 1))
 61 |     self.oov_column.data.uniform_(-0.25, 0.25)
 62 | 
 63 |   def forward(self, x, y):
 64 |     if self.training:
 65 |       for i in range(y.size(0)):
 66 |         y[i] = self.word_to_column.get(y[i].tolist())
 67 |       samples = torch.LongTensor(len(self.word_to_column)).fill_(0)
 68 |       for word in self.negative_samples:
 69 |         samples[self.word_to_column[word]] = word
 70 |     else:
 71 |       for i in range(y.size(0)):
 72 |         y[i] = self.all_word_to_column.get(y[i].tolist(), 0)
 73 |       samples = torch.LongTensor(len(self.all_word_to_column)).fill_(0)
 74 |       for word in self.all_word:
 75 |         samples[self.all_word_to_column[word]] = word
 76 | 
 77 |     if self.use_cuda:
 78 |       samples = samples.cuda()
 79 | 
 80 |     tag_scores = (x.matmul(self.embedding_matrix)).view(y.size(0), -1) + \
 81 |                  (self.column_bias.forward(samples)).view(1, -1) 
 82 |     return self.criterion(tag_scores, y)
 83 | 
 84 |   def update_embedding_matrix(self):
 85 |     word_inp, chars_inp = [], []
 86 |     if self.training:  
 87 |       columns = torch.LongTensor(len(self.negative_samples) + 1)
 88 |       samples = self.negative_samples
 89 |       for i, word in enumerate(samples):
 90 |         columns[self.word_to_column[word]] = word
 91 |       columns[0] = 0
 92 |     else:
 93 |       columns = torch.LongTensor(len(self.all_word) + 1)
 94 |       samples = self.all_word
 95 |       for i, word in enumerate(samples):
 96 |         columns[self.all_word_to_column[word]] = word
 97 |       columns[0] = 0
 98 | 
 99 |     if self.use_cuda:
100 |       columns = columns.cuda()
101 |     self.embedding_matrix = self.column_emb.forward(columns).transpose(0, 1)
102 | 
103 |   def update_negative_samples(self, word_inp, chars_inp, mask):
104 |     batch_size, seq_len = word_inp.size(0), word_inp.size(1)
105 |     in_batch = set()
106 |     for i in range(batch_size):
107 |       for j in range(seq_len):
108 |         if mask[i][j] == 0:
109 |           continue
110 |         word = word_inp[i][j].tolist()
111 |         in_batch.add(word)
112 |     for i in range(batch_size):
113 |       for j in range(seq_len):
114 |         if mask[i][j] == 0:
115 |           continue
116 |         word = word_inp[i][j].tolist()
117 |         if word not in self.all_word_to_column:
118 |           self.all_word.append(word)
119 |           self.all_word_to_column[word] = len(self.all_word_to_column)
120 | 
121 |         if word not in self.word_to_column:
122 |           if len(self.negative_samples) < self.n_samples:
123 |             self.negative_samples.append(word)
124 |             self.word_to_column[word] = len(self.word_to_column)
125 |           else:
126 |             while self.negative_samples[0] in in_batch:
127 |               self.negative_samples = self.negative_samples[1:] + [self.negative_samples[0]]
128 |             self.word_to_column[word] = self.word_to_column.pop(self.negative_samples[0])
129 |             self.negative_samples = self.negative_samples[1:] + [word]
130 | 
131 | 
132 | class CNNSoftmaxLayer(nn.Module):
133 |   def __init__(self, token_embedder, output_dim, n_class, n_samples, corr_dim, use_cuda):
134 |     super(CNNSoftmaxLayer, self).__init__()
135 |     self.token_embedder = token_embedder
136 |     self.n_samples = n_samples
137 |     self.use_cuda = use_cuda
138 |     self.criterion = nn.CrossEntropyLoss(size_average=False)
139 |     self.negative_samples = []
140 |     self.word_to_column = {0: 0}
141 | 
142 |     self.all_word = []
143 |     self.all_word_to_column = {0: 0}
144 | 
145 |     self.M = nn.Parameter(torch.Tensor(output_dim, corr_dim))
146 |     stdv = 1. / math.sqrt(self.M.size(1))
147 |     self.M.data.uniform_(-stdv, stdv)
148 | 
149 |     self.corr = nn.Embedding(n_class, corr_dim)
150 |     self.corr.weight.data.uniform_(-0.25, 0.25)
151 | 
152 |     self.oov_column = nn.Parameter(torch.Tensor(output_dim, 1))
153 |     self.oov_column.data.uniform_(-0.25, 0.25)
154 | 
155 |   def forward(self, x, y):
156 |     if self.training:
157 |       for i in range(y.size(0)):
158 |         y[i] = self.word_to_column.get(y[i].tolist())
159 |       samples = torch.LongTensor(len(self.word_to_column)).fill_(0)
160 |       for package in self.negative_samples:
161 |         samples[self.word_to_column[package[0]]] = package[0]
162 |     else:
163 |       for i in range(y.size(0)):
164 |         y[i] = self.all_word_to_column.get(y[i].tolist(), 0)
165 |       samples = torch.LongTensor(len(self.all_word_to_column)).fill_(0)
166 |       for package in self.all_word:
167 |         samples[self.all_word_to_column[package[0]]] = package[0]
168 | 
169 |     if self.use_cuda:
170 |       samples = samples.cuda()
171 | 
172 |     tag_scores = (x.matmul(self.embedding_matrix)).view(y.size(0), -1) + \
173 |                  (x.matmul(self.M).matmul(self.corr.forward(samples).transpose(0, 1))).view(y.size(0), -1)
174 |     return self.criterion(tag_scores, y)
175 | 
176 |   def update_embedding_matrix(self):
177 |     batch_size = 2048
178 |     word_inp, chars_inp = [], []
179 |     if self.training:  
180 |       sub_matrices = [self.oov_column]
181 |       samples = self.negative_samples
182 |       id2pack = {}
183 |       for i, package in enumerate(samples):
184 |         id2pack[self.word_to_column[package[0]]] = i
185 |     else:
186 |       sub_matrices = [self.oov_column]
187 |       samples = self.all_word
188 |       id2pack = {}
189 |       for i, package in enumerate(samples):
190 |         id2pack[self.all_word_to_column[package[0]]] = i
191 | 
192 |     for i in range(len(samples)):
193 |       # [n_samples, 1], [n_samples, 1, x], [n_samples, 1]
194 |       word_inp.append(samples[id2pack[i + 1]][0])
195 |       chars_inp.append(samples[id2pack[i + 1]][1])
196 |       if len(word_inp) == batch_size or i == len(samples) - 1:
197 |         sub_matrices.append(self.token_embedder.forward(torch.LongTensor(word_inp).view(len(word_inp), 1),
198 |                                                         None if chars_inp[0] is None else torch.LongTensor(chars_inp).view(len(word_inp), 1, len(package[1])), 
199 |                                                         (len(word_inp), 1)).squeeze(1).transpose(0, 1))
200 |         if not self.training:
201 |           sub_matrices[-1] = sub_matrices[-1].detach()
202 |         word_inp, chars_inp = [], [] 
203 | 
204 |     sum = 0
205 |     for mat in sub_matrices:
206 |       sum += mat.size(1)
207 |     #print(sum, len(self.word_to_column))    
208 |     self.embedding_matrix = torch.cat(sub_matrices, dim=1)
209 | 
210 |   def update_negative_samples(self, word_inp, chars_inp, mask):
211 |     batch_size, seq_len = word_inp.size(0), word_inp.size(1)
212 |     in_batch = set()
213 |     for i in range(batch_size):
214 |       for j in range(seq_len):
215 |         if mask[i][j] == 0:
216 |           continue
217 |         word = word_inp[i][j].tolist()
218 |         in_batch.add(word)
219 |     for i in range(batch_size):
220 |       for j in range(seq_len):
221 |         if mask[i][j] == 0:
222 |           continue
223 |         package = (word_inp[i][j].tolist(), None if chars_inp is None else chars_inp[i][j].tolist())
224 |         if package[0] not in self.all_word_to_column:
225 |           self.all_word.append(package)
226 |           self.all_word_to_column[package[0]] = len(self.all_word_to_column)
227 | 
228 |         if package[0] not in self.word_to_column:
229 |           if len(self.negative_samples) < self.n_samples:
230 |             self.negative_samples.append(package)
231 |             self.word_to_column[package[0]] = len(self.word_to_column)
232 |           else:
233 |             while self.negative_samples[0][0] in in_batch:
234 |               self.negative_samples = self.negative_samples[1:] + [self.negative_samples[0]]
235 |             self.word_to_column[package[0]] = self.word_to_column.pop(self.negative_samples[0][0])
236 |             self.negative_samples = self.negative_samples[1:] + [package]
237 | 


--------------------------------------------------------------------------------
/elmoformanylangs/modules/elmo.py:
--------------------------------------------------------------------------------
  1 | from typing import Optional, Tuple, List, Callable, Union
  2 | 
  3 | import h5py
  4 | import numpy
  5 | import torch
  6 | import torch.nn as nn
  7 | import torch.nn.functional as F
  8 | from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence, pack_padded_sequence
  9 | from torch.autograd import Variable
 10 | 
 11 | from .encoder_base import _EncoderBase
 12 | from .lstm_cell_with_projection import LstmCellWithProjection
 13 | 
 14 | RnnState = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]  # pylint: disable=invalid-name
 15 | RnnStateStorage = Tuple[torch.Tensor, ...]  # pylint: disable=invalid-name
 16 | 
 17 | 
 18 | class ElmobiLm(_EncoderBase):
 19 |   def __init__(self, config, use_cuda=False):
 20 |     super(ElmobiLm, self).__init__(stateful=True)
 21 |     self.config = config
 22 |     self.use_cuda = use_cuda
 23 |     input_size = config['encoder']['projection_dim']
 24 |     hidden_size = config['encoder']['projection_dim']
 25 |     cell_size = config['encoder']['dim']
 26 |     num_layers = config['encoder']['n_layers']
 27 |     memory_cell_clip_value = config['encoder']['cell_clip']
 28 |     state_projection_clip_value = config['encoder']['proj_clip']
 29 |     recurrent_dropout_probability = config['dropout']
 30 | 
 31 |     self.input_size = input_size
 32 |     self.hidden_size = hidden_size
 33 |     self.num_layers = num_layers
 34 |     self.cell_size = cell_size
 35 |     
 36 |     forward_layers = []
 37 |     backward_layers = []
 38 | 
 39 |     lstm_input_size = input_size
 40 |     go_forward = True
 41 |     for layer_index in range(num_layers):
 42 |       forward_layer = LstmCellWithProjection(lstm_input_size,
 43 |                                              hidden_size,
 44 |                                              cell_size,
 45 |                                              go_forward,
 46 |                                              recurrent_dropout_probability,
 47 |                                              memory_cell_clip_value,
 48 |                                              state_projection_clip_value)
 49 |       backward_layer = LstmCellWithProjection(lstm_input_size,
 50 |                                               hidden_size,
 51 |                                               cell_size,
 52 |                                               not go_forward,
 53 |                                               recurrent_dropout_probability,
 54 |                                               memory_cell_clip_value,
 55 |                                               state_projection_clip_value)
 56 |       lstm_input_size = hidden_size
 57 | 
 58 |       self.add_module('forward_layer_{}'.format(layer_index), forward_layer)
 59 |       self.add_module('backward_layer_{}'.format(layer_index), backward_layer)
 60 |       forward_layers.append(forward_layer)
 61 |       backward_layers.append(backward_layer)
 62 |     self.forward_layers = forward_layers
 63 |     self.backward_layers = backward_layers
 64 | 
 65 |   def forward(self, inputs, mask):
 66 |     batch_size, total_sequence_length = mask.size()
 67 |     stacked_sequence_output, final_states, restoration_indices = \
 68 |       self.sort_and_run_forward(self._lstm_forward, inputs, mask)
 69 | 
 70 |     num_layers, num_valid, returned_timesteps, encoder_dim = stacked_sequence_output.size()
 71 |     # Add back invalid rows which were removed in the call to sort_and_run_forward.
 72 |     if num_valid < batch_size:
 73 |       zeros = stacked_sequence_output.data.new(num_layers,
 74 |                                                batch_size - num_valid,
 75 |                                                returned_timesteps,
 76 |                                                encoder_dim).fill_(0)
 77 |       zeros = Variable(zeros)
 78 |       stacked_sequence_output = torch.cat([stacked_sequence_output, zeros], 1)
 79 | 
 80 |       # The states also need to have invalid rows added back.
 81 |       new_states = []
 82 |       for state in final_states:
 83 |         state_dim = state.size(-1)
 84 |         zeros = state.data.new(num_layers, batch_size - num_valid, state_dim).fill_(0)
 85 |         zeros = Variable(zeros)
 86 |         new_states.append(torch.cat([state, zeros], 1))
 87 |       final_states = new_states
 88 | 
 89 |     # It's possible to need to pass sequences which are padded to longer than the
 90 |     # max length of the sequence to a Seq2StackEncoder. However, packing and unpacking
 91 |     # the sequences mean that the returned tensor won't include these dimensions, because
 92 |     # the RNN did not need to process them. We add them back on in the form of zeros here.
 93 |     sequence_length_difference = total_sequence_length - returned_timesteps
 94 |     if sequence_length_difference > 0:
 95 |       zeros = stacked_sequence_output.data.new(num_layers,
 96 |                                                batch_size,
 97 |                                                sequence_length_difference,
 98 |                                                stacked_sequence_output[0].size(-1)).fill_(0)
 99 |       zeros = Variable(zeros)
100 |       stacked_sequence_output = torch.cat([stacked_sequence_output, zeros], 2)
101 | 
102 |     self._update_states(final_states, restoration_indices)
103 | 
104 |     # Restore the original indices and return the sequence.
105 |     # Has shape (num_layers, batch_size, sequence_length, hidden_size)
106 |     return stacked_sequence_output.index_select(1, restoration_indices)
107 | 
108 | 
109 |   def _lstm_forward(self, 
110 |                     inputs: PackedSequence,
111 |                     initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> \
112 |       Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
113 |     """
114 |     Parameters
115 |     ----------
116 |     inputs : ``PackedSequence``, required.
117 |       A batch first ``PackedSequence`` to run the stacked LSTM over.
118 |     initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None)
119 |       A tuple (state, memory) representing the initial hidden state and memory
120 |       of the LSTM, with shape (num_layers, batch_size, 2 * hidden_size) and
121 |       (num_layers, batch_size, 2 * cell_size) respectively.
122 |     Returns
123 |     -------
124 |     output_sequence : ``torch.FloatTensor``
125 |       The encoded sequence of shape (num_layers, batch_size, sequence_length, hidden_size)
126 |     final_states: ``Tuple[torch.FloatTensor, torch.FloatTensor]``
127 |       The per-layer final (state, memory) states of the LSTM, with shape
128 |       (num_layers, batch_size, 2 * hidden_size) and  (num_layers, batch_size, 2 * cell_size)
129 |       respectively. The last dimension is duplicated because it contains the state/memory
130 |       for both the forward and backward layers.
131 |     """
132 |        
133 |     if initial_state is None:
134 |       hidden_states: List[Optional[Tuple[torch.Tensor,
135 |                                    torch.Tensor]]] = [None] * len(self.forward_layers)
136 |     elif initial_state[0].size()[0] != len(self.forward_layers):
137 |       raise Exception("Initial states were passed to forward() but the number of "
138 |                                "initial states does not match the number of layers.")
139 |     else:
140 |       hidden_states = list(zip(initial_state[0].split(1, 0), initial_state[1].split(1, 0)))
141 | 
142 |     inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True)
143 |     forward_output_sequence = inputs
144 |     backward_output_sequence = inputs
145 | 
146 |     final_states = []
147 |     sequence_outputs = []
148 |     for layer_index, state in enumerate(hidden_states):
149 |       forward_layer = getattr(self, 'forward_layer_{}'.format(layer_index))
150 |       backward_layer = getattr(self, 'backward_layer_{}'.format(layer_index))
151 | 
152 |       forward_cache = forward_output_sequence
153 |       backward_cache = backward_output_sequence
154 | 
155 |       if state is not None:
156 |         forward_hidden_state, backward_hidden_state = state[0].split(self.hidden_size, 2)
157 |         forward_memory_state, backward_memory_state = state[1].split(self.cell_size, 2)
158 |         forward_state = (forward_hidden_state, forward_memory_state)
159 |         backward_state = (backward_hidden_state, backward_memory_state)
160 |       else:
161 |         forward_state = None
162 |         backward_state = None
163 | 
164 |       forward_output_sequence, forward_state = forward_layer(forward_output_sequence,
165 |                                                              batch_lengths,
166 |                                                              forward_state)
167 |       backward_output_sequence, backward_state = backward_layer(backward_output_sequence,
168 |                                                                 batch_lengths,
169 |                                                                 backward_state)
170 |       # Skip connections, just adding the input to the output.
171 |       if layer_index != 0:
172 |         forward_output_sequence += forward_cache
173 |         backward_output_sequence += backward_cache
174 | 
175 |       sequence_outputs.append(torch.cat([forward_output_sequence,
176 |                                          backward_output_sequence], -1))
177 |       # Append the state tuples in a list, so that we can return
178 |       # the final states for all the layers.
179 |       final_states.append((torch.cat([forward_state[0], backward_state[0]], -1),
180 |                            torch.cat([forward_state[1], backward_state[1]], -1)))
181 | 
182 |     stacked_sequence_outputs: torch.FloatTensor = torch.stack(sequence_outputs)
183 |     # Stack the hidden state and memory for each layer into 2 tensors of shape
184 |     # (num_layers, batch_size, hidden_size) and (num_layers, batch_size, cell_size)
185 |     # respectively.
186 |     final_hidden_states, final_memory_states = zip(*final_states)
187 |     final_state_tuple: Tuple[torch.FloatTensor,
188 |                              torch.FloatTensor] = (torch.cat(final_hidden_states, 0),
189 |                                                    torch.cat(final_memory_states, 0))
190 |     return stacked_sequence_outputs, final_state_tuple
191 | 


--------------------------------------------------------------------------------
/elmoformanylangs/modules/embedding_layer.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | from torch.autograd import Variable
 5 | import logging
 6 | 
 7 | logger = logging.getLogger('elmoformanylangs')
 8 | 
 9 | 
10 | class EmbeddingLayer(nn.Module):
11 |   def __init__(self, n_d, word2id, embs=None, fix_emb=True, oov='<oov>', pad='<pad>', normalize=True):
12 |     super(EmbeddingLayer, self).__init__()
13 |     if embs is not None:
14 |       embwords, embvecs = embs
15 |       # for word in embwords:
16 |       #  assert word not in word2id, "Duplicate words in pre-trained embeddings"
17 |       #  word2id[word] = len(word2id)
18 | 
19 |       logger.info("{} pre-trained word embeddings loaded.".format(len(word2id)))
20 |       if n_d != len(embvecs[0]):
21 |         logger.warning("[WARNING] n_d ({}) != word vector size ({}). Use {} for embeddings.".format(
22 |           n_d, len(embvecs[0]), len(embvecs[0])))
23 |         n_d = len(embvecs[0])
24 | 
25 |     self.word2id = word2id
26 |     self.id2word = {i: word for word, i in word2id.items()}
27 |     self.n_V, self.n_d = len(word2id), n_d
28 |     self.oovid = word2id[oov]
29 |     self.padid = word2id[pad]
30 |     self.embedding = nn.Embedding(self.n_V, n_d, padding_idx=self.padid)
31 |     self.embedding.weight.data.uniform_(-0.25, 0.25)
32 | 
33 |     if embs is not None:
34 |       weight = self.embedding.weight
35 |       weight.data[:len(embwords)].copy_(torch.from_numpy(embvecs))
36 |       logger.info("embedding shape: {}".format(weight.size()))
37 | 
38 |     if normalize:
39 |       weight = self.embedding.weight
40 |       norms = weight.data.norm(2, 1)
41 |       if norms.dim() == 1:
42 |         norms = norms.unsqueeze(1)
43 |       weight.data.div_(norms.expand_as(weight.data))
44 | 
45 |     if fix_emb:
46 |       self.embedding.weight.requires_grad = False
47 | 
48 |   def forward(self, input_):
49 |     return self.embedding(input_)
50 | 


--------------------------------------------------------------------------------
/elmoformanylangs/modules/encoder_base.py:
--------------------------------------------------------------------------------
  1 | from typing import Tuple, Union, Optional, Callable
  2 | import torch
  3 | from torch.autograd import Variable
  4 | from torch.nn.utils.rnn import pack_padded_sequence, PackedSequence
  5 | 
  6 | from .util import get_lengths_from_binary_sequence_mask, sort_batch_by_length
  7 | 
  8 | # We have two types here for the state, because storing the state in something
  9 | # which is Iterable (like a tuple, below), is helpful for internal manipulation
 10 | # - however, the states are consumed as either Tensors or a Tuple of Tensors, so
 11 | # returning them in this format is unhelpful.
 12 | RnnState = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]  # pylint: disable=invalid-name
 13 | RnnStateStorage = Tuple[torch.Tensor, ...]  # pylint: disable=invalid-name
 14 | 
 15 | 
 16 | class _EncoderBase(torch.nn.Module):
 17 |     # pylint: disable=abstract-method
 18 |     """
 19 |     This abstract class serves as a base for the 3 ``Encoder`` abstractions in AllenNLP.
 20 |     - :class:`~allennlp.modules.seq2seq_encoders.Seq2SeqEncoders`
 21 |     - :class:`~allennlp.modules.seq2vec_encoders.Seq2VecEncoders`
 22 |     Additionally, this class provides functionality for sorting sequences by length
 23 |     so they can be consumed by Pytorch RNN classes, which require their inputs to be
 24 |     sorted by length. Finally, it also provides optional statefulness to all of it's
 25 |     subclasses by allowing the caching and retrieving of the hidden states of RNNs.
 26 |     """
 27 |     def __init__(self, stateful: bool = False) -> None:
 28 |         super(_EncoderBase, self).__init__()
 29 |         self.stateful = stateful
 30 |         self._states: Optional[RnnStateStorage] = None
 31 | 
 32 |     def sort_and_run_forward(self,
 33 |                              module: Callable[[PackedSequence, Optional[RnnState]],
 34 |                                               Tuple[Union[PackedSequence, torch.Tensor], RnnState]],
 35 |                              inputs: torch.Tensor,
 36 |                              mask: torch.Tensor,
 37 |                              hidden_state: Optional[RnnState] = None):
 38 |         """
 39 |         This function exists because Pytorch RNNs require that their inputs be sorted
 40 |         before being passed as input. As all of our Seq2xxxEncoders use this functionality,
 41 |         it is provided in a base class. This method can be called on any module which
 42 |         takes as input a ``PackedSequence`` and some ``hidden_state``, which can either be a
 43 |         tuple of tensors or a tensor.
 44 |         As all of our Seq2xxxEncoders have different return types, we return `sorted`
 45 |         outputs from the module, which is called directly. Additionally, we return the
 46 |         indices into the batch dimension required to restore the tensor to it's correct,
 47 |         unsorted order and the number of valid batch elements (i.e the number of elements
 48 |         in the batch which are not completely masked). This un-sorting and re-padding
 49 |         of the module outputs is left to the subclasses because their outputs have different
 50 |         types and handling them smoothly here is difficult.
 51 |         Parameters
 52 |         ----------
 53 |         module : ``Callable[[PackedSequence, Optional[RnnState]],
 54 |                             Tuple[Union[PackedSequence, torch.Tensor], RnnState]]``, required.
 55 |             A function to run on the inputs. In most cases, this is a ``torch.nn.Module``.
 56 |         inputs : ``torch.Tensor``, required.
 57 |             A tensor of shape ``(batch_size, sequence_length, embedding_size)`` representing
 58 |             the inputs to the Encoder.
 59 |         mask : ``torch.Tensor``, required.
 60 |             A tensor of shape ``(batch_size, sequence_length)``, representing masked and
 61 |             non-masked elements of the sequence for each element in the batch.
 62 |         hidden_state : ``Optional[RnnState]``, (default = None).
 63 |             A single tensor of shape (num_layers, batch_size, hidden_size) representing the
 64 |             state of an RNN with or a tuple of
 65 |             tensors of shapes (num_layers, batch_size, hidden_size) and
 66 |             (num_layers, batch_size, memory_size), representing the hidden state and memory
 67 |             state of an LSTM-like RNN.
 68 |         Returns
 69 |         -------
 70 |         module_output : ``Union[torch.Tensor, PackedSequence]``.
 71 |             A Tensor or PackedSequence representing the output of the Pytorch Module.
 72 |             The batch size dimension will be equal to ``num_valid``, as sequences of zero
 73 |             length are clipped off before the module is called, as Pytorch cannot handle
 74 |             zero length sequences.
 75 |         final_states : ``Optional[RnnState]``
 76 |             A Tensor representing the hidden state of the Pytorch Module. This can either
 77 |             be a single tensor of shape (num_layers, num_valid, hidden_size), for instance in
 78 |             the case of a GRU, or a tuple of tensors, such as those required for an LSTM.
 79 |         restoration_indices : ``torch.LongTensor``
 80 |             A tensor of shape ``(batch_size,)``, describing the re-indexing required to transform
 81 |             the outputs back to their original batch order.
 82 |         """
 83 |         # In some circumstances you may have sequences of zero length. ``pack_padded_sequence``
 84 |         # requires all sequence lengths to be > 0, so remove sequences of zero length before
 85 |         # calling self._module, then fill with zeros.
 86 | 
 87 |         # First count how many sequences are empty.
 88 |         batch_size = mask.size(0)
 89 |         num_valid = torch.sum(mask[:, 0]).int().item()
 90 | 
 91 |         sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
 92 |         sorted_inputs, sorted_sequence_lengths, restoration_indices, sorting_indices =\
 93 |             sort_batch_by_length(inputs, sequence_lengths)
 94 | 
 95 |         # Now create a PackedSequence with only the non-empty, sorted sequences.
 96 |         packed_sequence_input = pack_padded_sequence(sorted_inputs[:num_valid, :, :],
 97 |                                                      sorted_sequence_lengths[:num_valid].data.tolist(),
 98 |                                                      batch_first=True)
 99 |         # Prepare the initial states.
100 |         if not self.stateful:
101 |             if hidden_state is None:
102 |                 initial_states = hidden_state
103 |             elif isinstance(hidden_state, tuple):
104 |                 initial_states = [state.index_select(1, sorting_indices)[:, :num_valid, :]
105 |                                   for state in hidden_state]
106 |             else:
107 |                 initial_states = hidden_state.index_select(1, sorting_indices)[:, :num_valid, :]
108 | 
109 |         else:
110 |             initial_states = self._get_initial_states(batch_size, num_valid, sorting_indices)
111 | 
112 |         # Actually call the module on the sorted PackedSequence.
113 |         module_output, final_states = module(packed_sequence_input, initial_states)
114 | 
115 |         return module_output, final_states, restoration_indices
116 | 
117 |     def _get_initial_states(self,
118 |                             batch_size: int,
119 |                             num_valid: int,
120 |                             sorting_indices: torch.LongTensor) -> Optional[RnnState]:
121 |         """
122 |         Returns an initial state for use in an RNN. Additionally, this method handles
123 |         the batch size changing across calls by mutating the state to append initial states
124 |         for new elements in the batch. Finally, it also handles sorting the states
125 |         with respect to the sequence lengths of elements in the batch and removing rows
126 |         which are completely padded. Importantly, this `mutates` the state if the
127 |         current batch size is larger than when it was previously called.
128 |         Parameters
129 |         ----------
130 |         batch_size : ``int``, required.
131 |             The batch size can change size across calls to stateful RNNs, so we need
132 |             to know if we need to expand or shrink the states before returning them.
133 |             Expanded states will be set to zero.
134 |         num_valid : ``int``, required.
135 |             The batch may contain completely padded sequences which get removed before
136 |             the sequence is passed through the encoder. We also need to clip these off
137 |             of the state too.
138 |         sorting_indices ``torch.LongTensor``, required.
139 |             Pytorch RNNs take sequences sorted by length. When we return the states to be
140 |             used for a given call to ``module.forward``, we need the states to match up to
141 |             the sorted sequences, so before returning them, we sort the states using the
142 |             same indices used to sort the sequences.
143 |         Returns
144 |         -------
145 |         This method has a complex return type because it has to deal with the first time it
146 |         is called, when it has no state, and the fact that types of RNN have heterogeneous
147 |         states.
148 |         If it is the first time the module has been called, it returns ``None``, regardless
149 |         of the type of the ``Module``.
150 |         Otherwise, for LSTMs, it returns a tuple of ``torch.Tensors`` with shape
151 |         ``(num_layers, num_valid, state_size)`` and ``(num_layers, num_valid, memory_size)``
152 |         respectively, or for GRUs, it returns a single ``torch.Tensor`` of shape
153 |         ``(num_layers, num_valid, state_size)``.
154 |         """
155 |         # We don't know the state sizes the first time calling forward,
156 |         # so we let the module define what it's initial hidden state looks like.
157 |         if self._states is None:
158 |             return None
159 | 
160 |         # Otherwise, we have some previous states.
161 |         if batch_size > self._states[0].size(1):
162 |             # This batch is larger than the all previous states.
163 |             # If so, resize the states.
164 |             num_states_to_concat = batch_size - self._states[0].size(1)
165 |             resized_states = []
166 |             # state has shape (num_layers, batch_size, hidden_size)
167 |             for state in self._states:
168 |                 # This _must_ be inside the loop because some
169 |                 # RNNs have states with different last dimension sizes.
170 |                 zeros = state.data.new(state.size(0),
171 |                                        num_states_to_concat,
172 |                                        state.size(2)).fill_(0)
173 |                 zeros = Variable(zeros)
174 |                 resized_states.append(torch.cat([state, zeros], 1))
175 |             self._states = tuple(resized_states)
176 |             correctly_shaped_states = self._states
177 | 
178 |         elif batch_size < self._states[0].size(1):
179 |             # This batch is smaller than the previous one.
180 |             correctly_shaped_states = tuple(state[:, :batch_size, :] for state in self._states)
181 |         else:
182 |             correctly_shaped_states = self._states
183 | 
184 |         # At this point, our states are of shape (num_layers, batch_size, hidden_size).
185 |         # However, the encoder uses sorted sequences and additionally removes elements
186 |         # of the batch which are fully padded. We need the states to match up to these
187 |         # sorted and filtered sequences, so we do that in the next two blocks before
188 |         # returning the state/s.
189 |         if len(self._states) == 1:
190 |             # GRUs only have a single state. This `unpacks` it from the
191 |             # tuple and returns the tensor directly.
192 |             correctly_shaped_state = correctly_shaped_states[0]
193 |             sorted_state = correctly_shaped_state.index_select(1, sorting_indices)
194 |             return sorted_state[:, :num_valid, :]
195 |         else:
196 |             # LSTMs have a state tuple of (state, memory).
197 |             sorted_states = [state.index_select(1, sorting_indices)
198 |                              for state in correctly_shaped_states]
199 |             return tuple(state[:, :num_valid, :] for state in sorted_states)
200 | 
201 |     def _update_states(self,
202 |                        final_states: RnnStateStorage,
203 |                        restoration_indices: torch.LongTensor) -> None:
204 |         """
205 |         After the RNN has run forward, the states need to be updated.
206 |         This method just sets the state to the updated new state, performing
207 |         several pieces of book-keeping along the way - namely, unsorting the
208 |         states and ensuring that the states of completely padded sequences are
209 |         not updated. Finally, it also detatches the state variable from the
210 |         computational graph, such that the graph can be garbage collected after
211 |         each batch iteration.
212 |         Parameters
213 |         ----------
214 |         final_states : ``RnnStateStorage``, required.
215 |             The hidden states returned as output from the RNN.
216 |         restoration_indices : ``torch.LongTensor``, required.
217 |             The indices that invert the sorting used in ``sort_and_run_forward``
218 |             to order the states with respect to the lengths of the sequences in
219 |             the batch.
220 |         """
221 |         # TODO(Mark): seems weird to sort here, but append zeros in the subclasses.
222 |         # which way around is best?
223 |         new_unsorted_states = [state.index_select(1, restoration_indices)
224 |                                for state in final_states]
225 | 
226 |         if self._states is None:
227 |             # We don't already have states, so just set the
228 |             # ones we receive to be the current state.
229 |             self._states = tuple([torch.autograd.Variable(state.data)
230 |                                   for state in new_unsorted_states])
231 |         else:
232 |             # Now we've sorted the states back so that they correspond to the original
233 |             # indices, we need to figure out what states we need to update, because if we
234 |             # didn't use a state for a particular row, we want to preserve its state.
235 |             # Thankfully, the rows which are all zero in the state correspond exactly
236 |             # to those which aren't used, so we create masks of shape (new_batch_size,),
237 |             # denoting which states were used in the RNN computation.
238 |             current_state_batch_size = self._states[0].size(1)
239 |             new_state_batch_size = final_states[0].size(1)
240 |             # Masks for the unused states of shape (1, new_batch_size, 1)
241 |             used_new_rows_mask = [(state[0, :, :].sum(-1)
242 |                                    != 0.0).float().view(1, new_state_batch_size, 1)
243 |                                   for state in new_unsorted_states]
244 |             new_states = []
245 |             if current_state_batch_size > new_state_batch_size:
246 |                 # The new state is smaller than the old one,
247 |                 # so just update the indices which we used.
248 |                 for old_state, new_state, used_mask in zip(self._states,
249 |                                                            new_unsorted_states,
250 |                                                            used_new_rows_mask):
251 |                     # zero out all rows in the previous state
252 |                     # which _were_ used in the current state.
253 |                     masked_old_state = old_state[:, :new_state_batch_size, :] * (1 - used_mask)
254 |                     # The old state is larger, so update the relevant parts of it.
255 |                     old_state[:, :new_state_batch_size, :] = new_state + masked_old_state
256 |                     # Detatch the Variable.
257 |                     new_states.append(torch.autograd.Variable(old_state.data))
258 |             else:
259 |                 # The states are the same size, so we just have to
260 |                 # deal with the possibility that some rows weren't used.
261 |                 new_states = []
262 |                 for old_state, new_state, used_mask in zip(self._states,
263 |                                                            new_unsorted_states,
264 |                                                            used_new_rows_mask):
265 |                     # zero out all rows which _were_ used in the current state.
266 |                     masked_old_state = old_state * (1 - used_mask)
267 |                     # The old state is larger, so update the relevant parts of it.
268 |                     new_state += masked_old_state
269 |                     # Detatch the Variable.
270 |                     new_states.append(torch.autograd.Variable(new_state.data))
271 | 
272 |             # It looks like there should be another case handled here - when
273 |             # the current_state_batch_size < new_state_batch_size. However,
274 |             # this never happens, because the states themeselves are mutated
275 |             # by appending zeros when calling _get_inital_states, meaning that
276 |             # the new states are either of equal size, or smaller, in the case
277 |             # that there are some unused elements (zero-length) for the RNN computation.
278 |             self._states = tuple(new_states)
279 | 
280 |     def reset_states(self):
281 |         self._states = None
282 | 


--------------------------------------------------------------------------------
/elmoformanylangs/modules/highway.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A `Highway layer <https://arxiv.org/abs/1505.00387>`_ that does a gated combination of a linear
 3 | transformation and a non-linear transformation of its input.
 4 | """
 5 | 
 6 | from typing import Callable
 7 | 
 8 | import torch
 9 | from overrides import overrides
10 | 
11 | 
12 | class Highway(torch.nn.Module):
13 |     """
14 |     A `Highway layer <https://arxiv.org/abs/1505.00387>`_ does a gated combination of a linear
15 |     transformation and a non-linear transformation of its input.  :math:`y = g * x + (1 - g) *
16 |     f(A(x))`, where :math:`A` is a linear transformation, :math:`f` is an element-wise
17 |     non-linearity, and :math:`g` is an element-wise gate, computed as :math:`sigmoid(B(x))`.
18 |     This module will apply a fixed number of highway layers to its input, returning the final
19 |     result.
20 |     Parameters
21 |     ----------
22 |     input_dim : ``int``
23 |         The dimensionality of :math:`x`.  We assume the input has shape ``(batch_size,
24 |         input_dim)``.
25 |     num_layers : ``int``, optional (default=``1``)
26 |         The number of highway layers to apply to the input.
27 |     activation : ``Callable[[torch.Tensor], torch.Tensor]``, optional (default=``torch.nn.functional.relu``)
28 |         The non-linearity to use in the highway layers.
29 |     """
30 |     def __init__(self,
31 |                  input_dim: int,
32 |                  num_layers: int = 1,
33 |                  activation: Callable[[torch.Tensor], torch.Tensor] = torch.nn.functional.relu) -> None:
34 |         super(Highway, self).__init__()
35 |         self._input_dim = input_dim
36 |         self._layers = torch.nn.ModuleList([torch.nn.Linear(input_dim, input_dim * 2)
37 |                                             for _ in range(num_layers)])
38 |         self._activation = activation
39 |         for layer in self._layers:
40 |             # We should bias the highway layer to just carry its input forward.  We do that by
41 |             # setting the bias on `B(x)` to be positive, because that means `g` will be biased to
42 |             # be high, to we will carry the input forward.  The bias on `B(x)` is the second half
43 |             # of the bias vector in each Linear layer.
44 |             layer.bias[input_dim:].data.fill_(1)
45 | 
46 |     @overrides
47 |     def forward(self, inputs: torch.Tensor) -> torch.Tensor:  # pylint: disable=arguments-differ
48 |         current_input = inputs
49 |         for layer in self._layers:
50 |             projected_input = layer(current_input)
51 |             linear_part = current_input
52 |             # NOTE: if you modify this, think about whether you should modify the initialization
53 |             # above, too.
54 |             nonlinear_part = projected_input[:, (0 * self._input_dim):(1 * self._input_dim)]
55 |             gate = projected_input[:, (1 * self._input_dim):(2 * self._input_dim)]
56 |             nonlinear_part = self._activation(nonlinear_part)
57 |             gate = torch.sigmoid(gate)
58 |             current_input = gate * linear_part + (1 - gate) * nonlinear_part
59 |         return current_input
60 | 


--------------------------------------------------------------------------------
/elmoformanylangs/modules/lstm.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from __future__ import unicode_literals
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | from torch.autograd import Variable
 7 | import copy
 8 | 
 9 | 
10 | class LstmbiLm(nn.Module):
11 |   def __init__(self, config, use_cuda=False):
12 |     super(LstmbiLm, self).__init__()
13 |     self.config = config
14 |     self.use_cuda = use_cuda
15 |     
16 |     self.encoder = nn.LSTM(self.config['encoder']['projection_dim'],
17 |                            self.config['encoder']['dim'],
18 |                            num_layers=self.config['encoder']['n_layers'], 
19 |                            bidirectional=True,
20 |                            batch_first=True, 
21 |                            dropout=self.config['dropout'])
22 |     self.projection = nn.Linear(self.config['encoder']['dim'], self.config['encoder']['projection_dim'], bias=True)
23 | 
24 |   def forward(self, inputs):
25 |     forward, backward = self.encoder(inputs)[0].split(self.config['encoder']['dim'], 2)
26 |     return torch.cat([self.projection(forward), self.projection(backward)], dim=2)
27 | 


--------------------------------------------------------------------------------
/elmoformanylangs/modules/lstm_cell_with_projection.py:
--------------------------------------------------------------------------------
  1 | """
  2 | An LSTM with Recurrent Dropout, a hidden_state which is projected and
  3 | clipping on both the hidden state and the memory state of the LSTM.
  4 | """
  5 | 
  6 | from typing import Optional, Tuple, List
  7 | 
  8 | import torch
  9 | from torch.autograd import Variable
 10 | 
 11 | from .util import block_orthogonal, get_dropout_mask
 12 | 
 13 | class LstmCellWithProjection(torch.nn.Module):
 14 |     """
 15 |     An LSTM with Recurrent Dropout and a projected and clipped hidden state and
 16 |     memory. Note: this implementation is slower than the native Pytorch LSTM because
 17 |     it cannot make use of CUDNN optimizations for stacked RNNs due to and
 18 |     variational dropout and the custom nature of the cell state.
 19 |     Parameters
 20 |     ----------
 21 |     input_size : ``int``, required.
 22 |         The dimension of the inputs to the LSTM.
 23 |     hidden_size : ``int``, required.
 24 |         The dimension of the outputs of the LSTM.
 25 |     cell_size : ``int``, required.
 26 |         The dimension of the memory cell used for the LSTM.
 27 |     go_forward: ``bool``, optional (default = True)
 28 |         The direction in which the LSTM is applied to the sequence.
 29 |         Forwards by default, or backwards if False.
 30 |     recurrent_dropout_probability: ``float``, optional (default = 0.0)
 31 |         The dropout probability to be used in a dropout scheme as stated in
 32 |         `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks
 33 |         <https://arxiv.org/abs/1512.05287>`_ . Implementation wise, this simply
 34 |         applies a fixed dropout mask per sequence to the recurrent connection of the
 35 |         LSTM.
 36 |     state_projection_clip_value: ``float``, optional, (default = None)
 37 |         The magnitude with which to clip the hidden_state after projecting it.
 38 |     memory_cell_clip_value: ``float``, optional, (default = None)
 39 |         The magnitude with which to clip the memory cell.
 40 |     Returns
 41 |     -------
 42 |     output_accumulator : ``torch.FloatTensor``
 43 |         The outputs of the LSTM for each timestep. A tensor of shape
 44 |         (batch_size, max_timesteps, hidden_size) where for a given batch
 45 |         element, all outputs past the sequence length for that batch are
 46 |         zero tensors.
 47 |     final_state: ``Tuple[torch.FloatTensor, torch.FloatTensor]``
 48 |         The final (state, memory) states of the LSTM, with shape
 49 |         (1, batch_size, hidden_size) and  (1, batch_size, cell_size)
 50 |         respectively. The first dimension is 1 in order to match the Pytorch
 51 |         API for returning stacked LSTM states.
 52 |     """
 53 |     def __init__(self,
 54 |                  input_size: int,
 55 |                  hidden_size: int,
 56 |                  cell_size: int,
 57 |                  go_forward: bool = True,
 58 |                  recurrent_dropout_probability: float = 0.0,
 59 |                  memory_cell_clip_value: Optional[float] = None,
 60 |                  state_projection_clip_value: Optional[float] = None) -> None:
 61 |         super(LstmCellWithProjection, self).__init__()
 62 |         # Required to be wrapped with a :class:`PytorchSeq2SeqWrapper`.
 63 |         self.input_size = input_size
 64 |         self.hidden_size = hidden_size
 65 |         self.cell_size = cell_size
 66 | 
 67 |         self.go_forward = go_forward
 68 |         self.state_projection_clip_value = state_projection_clip_value
 69 |         self.memory_cell_clip_value = memory_cell_clip_value
 70 |         self.recurrent_dropout_probability = recurrent_dropout_probability
 71 | 
 72 |         # We do the projections for all the gates all at once.
 73 |         self.input_linearity = torch.nn.Linear(input_size, 4 * cell_size, bias=False)
 74 |         self.state_linearity = torch.nn.Linear(hidden_size, 4 * cell_size, bias=True)
 75 | 
 76 |         # Additional projection matrix for making the hidden state smaller.
 77 |         self.state_projection = torch.nn.Linear(cell_size, hidden_size, bias=False)
 78 |         self.reset_parameters()
 79 | 
 80 |     def reset_parameters(self):
 81 |         # Use sensible default initializations for parameters.
 82 |         block_orthogonal(self.input_linearity.weight.data, [self.cell_size, self.input_size])
 83 |         block_orthogonal(self.state_linearity.weight.data, [self.cell_size, self.hidden_size])
 84 | 
 85 |         self.state_linearity.bias.data.fill_(0.0)
 86 |         # Initialize forget gate biases to 1.0 as per An Empirical
 87 |         # Exploration of Recurrent Network Architectures, (Jozefowicz, 2015).
 88 |         self.state_linearity.bias.data[self.cell_size:2 * self.cell_size].fill_(1.0)
 89 | 
 90 |     def forward(self,  # pylint: disable=arguments-differ
 91 |                 inputs: torch.FloatTensor,
 92 |                 batch_lengths: List[int],
 93 |                 initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None):
 94 |         """
 95 |         Parameters
 96 |         ----------
 97 |         inputs : ``torch.FloatTensor``, required.
 98 |             A tensor of shape (batch_size, num_timesteps, input_size)
 99 |             to apply the LSTM over.
100 |         batch_lengths : ``List[int]``, required.
101 |             A list of length batch_size containing the lengths of the sequences in batch.
102 |         initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None)
103 |             A tuple (state, memory) representing the initial hidden state and memory
104 |             of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and the
105 |             ``memory`` has shape (1, batch_size, cell_size).
106 |         Returns
107 |         -------
108 |         output_accumulator : ``torch.FloatTensor``
109 |             The outputs of the LSTM for each timestep. A tensor of shape
110 |             (batch_size, max_timesteps, hidden_size) where for a given batch
111 |             element, all outputs past the sequence length for that batch are
112 |             zero tensors.
113 |         final_state : ``Tuple[``torch.FloatTensor, torch.FloatTensor]``
114 |             A tuple (state, memory) representing the initial hidden state and memory
115 |             of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and the
116 |             ``memory`` has shape (1, batch_size, cell_size).
117 |         """
118 |         batch_size = inputs.size()[0]
119 |         total_timesteps = inputs.size()[1]
120 | 
121 |         # We have to use this '.data.new().fill_' pattern to create tensors with the correct
122 |         # type - forward has no knowledge of whether these are torch.Tensors or torch.cuda.Tensors.
123 |         output_accumulator = Variable(inputs.data.new(batch_size,
124 |                                                       total_timesteps,
125 |                                                       self.hidden_size).fill_(0))
126 |         if initial_state is None:
127 |             full_batch_previous_memory = Variable(inputs.data.new(batch_size,
128 |                                                                   self.cell_size).fill_(0))
129 |             full_batch_previous_state = Variable(inputs.data.new(batch_size,
130 |                                                                  self.hidden_size).fill_(0))
131 |         else:
132 |             full_batch_previous_state = initial_state[0].squeeze(0)
133 |             full_batch_previous_memory = initial_state[1].squeeze(0)
134 | 
135 |         current_length_index = batch_size - 1 if self.go_forward else 0
136 |         if self.recurrent_dropout_probability > 0.0 and self.training:
137 |             dropout_mask = get_dropout_mask(self.recurrent_dropout_probability,
138 |                                             full_batch_previous_state)
139 |         else:
140 |             dropout_mask = None
141 | 
142 |         for timestep in range(total_timesteps):
143 |             # The index depends on which end we start.
144 |             index = timestep if self.go_forward else total_timesteps - timestep - 1
145 | 
146 |             # What we are doing here is finding the index into the batch dimension
147 |             # which we need to use for this timestep, because the sequences have
148 |             # variable length, so once the index is greater than the length of this
149 |             # particular batch sequence, we no longer need to do the computation for
150 |             # this sequence. The key thing to recognise here is that the batch inputs
151 |             # must be _ordered_ by length from longest (first in batch) to shortest
152 |             # (last) so initially, we are going forwards with every sequence and as we
153 |             # pass the index at which the shortest elements of the batch finish,
154 |             # we stop picking them up for the computation.
155 |             if self.go_forward:
156 |                 while batch_lengths[current_length_index] <= index:
157 |                     current_length_index -= 1
158 |             # If we're going backwards, we are _picking up_ more indices.
159 |             else:
160 |                 # First conditional: Are we already at the maximum number of elements in the batch?
161 |                 # Second conditional: Does the next shortest sequence beyond the current batch
162 |                 # index require computation use this timestep?
163 |                 while current_length_index < (len(batch_lengths) - 1) and \
164 |                                 batch_lengths[current_length_index + 1] > index:
165 |                     current_length_index += 1
166 | 
167 |             # Actually get the slices of the batch which we
168 |             # need for the computation at this timestep.
169 |             # shape (batch_size, cell_size)
170 |             previous_memory = full_batch_previous_memory[0: current_length_index + 1].clone()
171 |             # Shape (batch_size, hidden_size)
172 |             previous_state = full_batch_previous_state[0: current_length_index + 1].clone()
173 |             # Shape (batch_size, input_size)
174 |             timestep_input = inputs[0: current_length_index + 1, index]
175 | 
176 |             # Do the projections for all the gates all at once.
177 |             # Both have shape (batch_size, 4 * cell_size)
178 |             projected_input = self.input_linearity(timestep_input)
179 |             projected_state = self.state_linearity(previous_state)
180 | 
181 |             # Main LSTM equations using relevant chunks of the big linear
182 |             # projections of the hidden state and inputs.
183 |             input_gate = torch.sigmoid(projected_input[:, (0 * self.cell_size):(1 * self.cell_size)] +
184 |                                        projected_state[:, (0 * self.cell_size):(1 * self.cell_size)])
185 |             forget_gate = torch.sigmoid(projected_input[:, (1 * self.cell_size):(2 * self.cell_size)] +
186 |                                         projected_state[:, (1 * self.cell_size):(2 * self.cell_size)])
187 |             memory_init = torch.tanh(projected_input[:, (2 * self.cell_size):(3 * self.cell_size)] +
188 |                                      projected_state[:, (2 * self.cell_size):(3 * self.cell_size)])
189 |             output_gate = torch.sigmoid(projected_input[:, (3 * self.cell_size):(4 * self.cell_size)] +
190 |                                         projected_state[:, (3 * self.cell_size):(4 * self.cell_size)])
191 |             memory = input_gate * memory_init + forget_gate * previous_memory
192 | 
193 |             # Here is the non-standard part of this LSTM cell; first, we clip the
194 |             # memory cell, then we project the output of the timestep to a smaller size
195 |             # and again clip it.
196 | 
197 |             if self.memory_cell_clip_value:
198 |                 # pylint: disable=invalid-unary-operand-type
199 |                 memory = torch.clamp(memory, -self.memory_cell_clip_value, self.memory_cell_clip_value)
200 | 
201 |             # shape (current_length_index, cell_size)
202 |             pre_projection_timestep_output = output_gate * torch.tanh(memory)
203 | 
204 |             # shape (current_length_index, hidden_size)
205 |             timestep_output = self.state_projection(pre_projection_timestep_output)
206 |             if self.state_projection_clip_value:
207 |                 # pylint: disable=invalid-unary-operand-type
208 |                 timestep_output = torch.clamp(timestep_output,
209 |                                               -self.state_projection_clip_value,
210 |                                               self.state_projection_clip_value)
211 | 
212 |             # Only do dropout if the dropout prob is > 0.0 and we are in training mode.
213 |             if dropout_mask is not None:
214 |                 timestep_output = timestep_output * dropout_mask[0: current_length_index + 1]
215 | 
216 |             # We've been doing computation with less than the full batch, so here we create a new
217 |             # variable for the the whole batch at this timestep and insert the result for the
218 |             # relevant elements of the batch into it.
219 |             full_batch_previous_memory = Variable(full_batch_previous_memory.data.clone())
220 |             full_batch_previous_state = Variable(full_batch_previous_state.data.clone())
221 |             full_batch_previous_memory[0:current_length_index + 1] = memory
222 |             full_batch_previous_state[0:current_length_index + 1] = timestep_output
223 |             output_accumulator[0:current_length_index + 1, index] = timestep_output
224 | 
225 |         # Mimic the pytorch API by returning state in the following shape:
226 |         # (num_layers * num_directions, batch_size, ...). As this
227 |         # LSTM cell cannot be stacked, the first dimension here is just 1.
228 |         final_state = (full_batch_previous_state.unsqueeze(0),
229 |                        full_batch_previous_memory.unsqueeze(0))
230 | 
231 |         return output_accumulator, final_state
232 | 


--------------------------------------------------------------------------------
/elmoformanylangs/modules/token_embedder.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | from __future__ import unicode_literals
  3 | import torch
  4 | import torch.nn as nn
  5 | import torch.nn.functional as F
  6 | from torch.autograd import Variable
  7 | import copy
  8 | from .highway import Highway
  9 | 
 10 | 
 11 | class LstmTokenEmbedder(nn.Module):
 12 |   def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda=False):
 13 |     super(LstmTokenEmbedder, self).__init__()
 14 |     self.config = config
 15 |     self.use_cuda = use_cuda
 16 |     self.word_emb_layer = word_emb_layer
 17 |     self.char_emb_layer = char_emb_layer
 18 |     self.output_dim = config['encoder']['projection_dim']
 19 |     emb_dim = 0
 20 |     if word_emb_layer is not None:
 21 |       emb_dim += word_emb_layer.n_d
 22 | 
 23 |     if char_emb_layer is not None:
 24 |       emb_dim += char_emb_layer.n_d * 2
 25 |       self.char_lstm = nn.LSTM(char_emb_layer.n_d, char_emb_layer.n_d, num_layers=1, bidirectional=True,
 26 |                                batch_first=True, dropout=config['dropout'])
 27 | 
 28 |     self.projection = nn.Linear(emb_dim, self.output_dim, bias=True)
 29 | 
 30 |   def forward(self, word_inp, chars_inp, shape):
 31 |     embs = []
 32 |     batch_size, seq_len = shape
 33 |     if self.word_emb_layer is not None:
 34 |       word_emb = self.word_emb_layer(Variable(word_inp).cuda() if self.use_cuda else Variable(word_inp))
 35 |       embs.append(word_emb)
 36 | 
 37 |     if self.char_emb_layer is not None:
 38 |       chars_inp = chars_inp.view(batch_size * seq_len, -1)
 39 |       chars_emb = self.char_emb_layer(Variable(chars_inp).cuda() if self.use_cuda else Variable(chars_inp))
 40 |       _, (chars_outputs, __) = self.char_lstm(chars_emb)
 41 |       chars_outputs = chars_outputs.contiguous().view(-1, self.config['token_embedder']['char_dim'] * 2)
 42 |       embs.append(chars_outputs)
 43 | 
 44 |     token_embedding = torch.cat(embs, dim=2)
 45 | 
 46 |     return self.projection(token_embedding)
 47 | 
 48 | 
 49 | class ConvTokenEmbedder(nn.Module):
 50 |   def __init__(self, config, word_emb_layer, char_emb_layer, use_cuda):
 51 |     super(ConvTokenEmbedder, self).__init__()
 52 |     self.config = config
 53 |     self.use_cuda = use_cuda
 54 | 
 55 |     self.word_emb_layer = word_emb_layer
 56 |     self.char_emb_layer = char_emb_layer
 57 | 
 58 |     self.output_dim = config['encoder']['projection_dim']
 59 |     self.emb_dim = 0
 60 |     if word_emb_layer is not None:
 61 |       self.emb_dim += word_emb_layer.n_d
 62 | 
 63 |     if char_emb_layer is not None:
 64 |       self.convolutions = []
 65 |       cnn_config = config['token_embedder']
 66 |       filters = cnn_config['filters']
 67 |       char_embed_dim = cnn_config['char_dim']
 68 | 
 69 |       for i, (width, num) in enumerate(filters):
 70 |         conv = torch.nn.Conv1d(
 71 |           in_channels=char_embed_dim,
 72 |           out_channels=num,
 73 |           kernel_size=width,
 74 |           bias=True
 75 |         )
 76 |         self.convolutions.append(conv)
 77 | 
 78 |       self.convolutions = nn.ModuleList(self.convolutions)
 79 |       
 80 |       self.n_filters = sum(f[1] for f in filters)
 81 |       self.n_highway = cnn_config['n_highway']
 82 | 
 83 |       self.highways = Highway(self.n_filters, self.n_highway, activation=torch.nn.functional.relu)
 84 |       self.emb_dim += self.n_filters
 85 | 
 86 |     self.projection = nn.Linear(self.emb_dim, self.output_dim, bias=True)
 87 |     
 88 |   def forward(self, word_inp, chars_inp, shape):
 89 |     embs = []
 90 |     batch_size, seq_len = shape
 91 |     if self.word_emb_layer is not None:
 92 |       batch_size, seq_len = word_inp.size(0), word_inp.size(1)
 93 |       word_emb = self.word_emb_layer(Variable(word_inp).cuda() if self.use_cuda else Variable(word_inp))
 94 |       embs.append(word_emb)
 95 | 
 96 |     if self.char_emb_layer is not None:
 97 |       chars_inp = chars_inp.view(batch_size * seq_len, -1)
 98 | 
 99 |       character_embedding = self.char_emb_layer(Variable(chars_inp).cuda() if self.use_cuda else Variable(chars_inp))
100 | 
101 |       character_embedding = torch.transpose(character_embedding, 1, 2)
102 | 
103 |       cnn_config = self.config['token_embedder']
104 |       if cnn_config['activation'] == 'tanh':
105 |         activation = torch.nn.functional.tanh
106 |       elif cnn_config['activation'] == 'relu':
107 |         activation = torch.nn.functional.relu
108 |       else:
109 |         raise Exception("Unknown activation")
110 | 
111 |       convs = []
112 |       for i in range(len(self.convolutions)):
113 |         convolved = self.convolutions[i](character_embedding)
114 |         # (batch_size * sequence_length, n_filters for this width)
115 |         convolved, _ = torch.max(convolved, dim=-1)
116 |         convolved = activation(convolved)
117 |         convs.append(convolved)
118 |       char_emb = torch.cat(convs, dim=-1)
119 |       char_emb = self.highways(char_emb)
120 | 
121 |       embs.append(char_emb.view(batch_size, -1, self.n_filters))
122 |       
123 |     token_embedding = torch.cat(embs, dim=2)
124 | 
125 |     return self.projection(token_embedding)
126 | 


--------------------------------------------------------------------------------
/elmoformanylangs/modules/util.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Assorted utilities for working with neural networks in AllenNLP.
  3 | """
  4 | from collections import defaultdict
  5 | from typing import Dict, List, Optional, Any, Tuple, Callable
  6 | import itertools
  7 | import math
  8 | import torch
  9 | from torch.autograd import Variable
 10 | 
 11 | def get_lengths_from_binary_sequence_mask(mask: torch.Tensor):
 12 |     """
 13 |     Compute sequence lengths for each batch element in a tensor using a
 14 |     binary mask.
 15 |     Parameters
 16 |     ----------
 17 |     mask : torch.Tensor, required.
 18 |         A 2D binary mask of shape (batch_size, sequence_length) to
 19 |         calculate the per-batch sequence lengths from.
 20 |     Returns
 21 |     -------
 22 |     A torch.LongTensor of shape (batch_size,) representing the lengths
 23 |     of the sequences in the batch.
 24 |     """
 25 |     return mask.long().sum(-1)
 26 | 
 27 | 
 28 | def sort_batch_by_length(tensor: torch.autograd.Variable,
 29 |                          sequence_lengths: torch.autograd.Variable):
 30 |     """
 31 |     Sort a batch first tensor by some specified lengths.
 32 |     Parameters
 33 |     ----------
 34 |     tensor : Variable(torch.FloatTensor), required.
 35 |         A batch first Pytorch tensor.
 36 |     sequence_lengths : Variable(torch.LongTensor), required.
 37 |         A tensor representing the lengths of some dimension of the tensor which
 38 |         we want to sort by.
 39 |     Returns
 40 |     -------
 41 |     sorted_tensor : Variable(torch.FloatTensor)
 42 |         The original tensor sorted along the batch dimension with respect to sequence_lengths.
 43 |     sorted_sequence_lengths : Variable(torch.LongTensor)
 44 |         The original sequence_lengths sorted by decreasing size.
 45 |     restoration_indices : Variable(torch.LongTensor)
 46 |         Indices into the sorted_tensor such that
 47 |         ``sorted_tensor.index_select(0, restoration_indices) == original_tensor``
 48 |     permuation_index : Variable(torch.LongTensor)
 49 |         The indices used to sort the tensor. This is useful if you want to sort many
 50 |         tensors using the same ordering.
 51 |     """
 52 | 
 53 |     if not isinstance(tensor, Variable) or not isinstance(sequence_lengths, Variable):
 54 |         raise Exception("Both the tensor and sequence lengths must be torch.autograd.Variables.")
 55 | 
 56 |     sorted_sequence_lengths, permutation_index = sequence_lengths.sort(0, descending=True)
 57 |     sorted_tensor = tensor.index_select(0, permutation_index)
 58 | 
 59 |     # This is ugly, but required - we are creating a new variable at runtime, so we
 60 |     # must ensure it has the correct CUDA vs non-CUDA type. We do this by cloning and
 61 |     # refilling one of the inputs to the function.
 62 |     index_range = sequence_lengths.data.clone().copy_(torch.arange(0, len(sequence_lengths)))
 63 |     # This is the equivalent of zipping with index, sorting by the original
 64 |     # sequence lengths and returning the now sorted indices.
 65 |     index_range = Variable(index_range.long())
 66 |     _, reverse_mapping = permutation_index.sort(0, descending=False)
 67 |     restoration_indices = index_range.index_select(0, reverse_mapping)
 68 |     return sorted_tensor, sorted_sequence_lengths, restoration_indices, permutation_index
 69 | 
 70 | 
 71 | def get_final_encoder_states(encoder_outputs: torch.Tensor,
 72 |                              mask: torch.Tensor,
 73 |                              bidirectional: bool = False) -> torch.Tensor:
 74 |     """
 75 |     Given the output from a ``Seq2SeqEncoder``, with shape ``(batch_size, sequence_length,
 76 |     encoding_dim)``, this method returns the final hidden state for each element of the batch,
 77 |     giving a tensor of shape ``(batch_size, encoding_dim)``.  This is not as simple as
 78 |     ``encoder_outputs[:, -1]``, because the sequences could have different lengths.  We use the
 79 |     mask (which has shape ``(batch_size, sequence_length)``) to find the final state for each batch
 80 |     instance.
 81 |     Additionally, if ``bidirectional`` is ``True``, we will split the final dimension of the
 82 |     ``encoder_outputs`` into two and assume that the first half is for the forward direction of the
 83 |     encoder and the second half is for the backward direction.  We will concatenate the last state
 84 |     for each encoder dimension, giving ``encoder_outputs[:, -1, :encoding_dim/2]`` concated with
 85 |     ``encoder_outputs[:, 0, encoding_dim/2:]``.
 86 |     """
 87 |     # These are the indices of the last words in the sequences (i.e. length sans padding - 1).  We
 88 |     # are assuming sequences are right padded.
 89 |     # Shape: (batch_size,)
 90 |     last_word_indices = mask.sum(1).long() - 1
 91 |     batch_size, _, encoder_output_dim = encoder_outputs.size()
 92 |     expanded_indices = last_word_indices.view(-1, 1, 1).expand(batch_size, 1, encoder_output_dim)
 93 |     # Shape: (batch_size, 1, encoder_output_dim)
 94 |     final_encoder_output = encoder_outputs.gather(1, expanded_indices)
 95 |     final_encoder_output = final_encoder_output.squeeze(1)  # (batch_size, encoder_output_dim)
 96 |     if bidirectional:
 97 |         final_forward_output = final_encoder_output[:, :(encoder_output_dim // 2)]
 98 |         final_backward_output = encoder_outputs[:, 0, (encoder_output_dim // 2):]
 99 |         final_encoder_output = torch.cat([final_forward_output, final_backward_output], dim=-1)
100 |     return final_encoder_output
101 | 
102 | 
103 | def get_dropout_mask(dropout_probability: float, tensor_for_masking: torch.autograd.Variable):
104 |     """
105 |     Computes and returns an element-wise dropout mask for a given tensor, where
106 |     each element in the mask is dropped out with probability dropout_probability.
107 |     Note that the mask is NOT applied to the tensor - the tensor is passed to retain
108 |     the correct CUDA tensor type for the mask.
109 |     Parameters
110 |     ----------
111 |     dropout_probability : float, required.
112 |         Probability of dropping a dimension of the input.
113 |     tensor_for_masking : torch.Variable, required.
114 |     Returns
115 |     -------
116 |     A torch.FloatTensor consisting of the binary mask scaled by 1/ (1 - dropout_probability).
117 |     This scaling ensures expected values and variances of the output of applying this mask
118 |      and the original tensor are the same.
119 |     """
120 |     binary_mask = tensor_for_masking.clone()
121 |     binary_mask.data.copy_(torch.rand(tensor_for_masking.size()) > dropout_probability)
122 |     # Scale mask by 1/keep_prob to preserve output statistics.
123 |     dropout_mask = binary_mask.float().div(1.0 - dropout_probability)
124 |     return dropout_mask
125 |     
126 | def block_orthogonal(tensor: torch.Tensor,
127 |                      split_sizes: List[int],
128 |                      gain: float = 1.0) -> None:
129 |         """
130 |         An initializer which allows initializing model parameters in "blocks". This is helpful
131 |         in the case of recurrent models which use multiple gates applied to linear projections,
132 |         which can be computed efficiently if they are concatenated together. However, they are
133 |         separate parameters which should be initialized independently.
134 |         Parameters
135 |         ----------
136 |         tensor : ``torch.Tensor``, required.
137 |             A tensor to initialize.
138 |         split_sizes : List[int], required.
139 |             A list of length ``tensor.ndim()`` specifying the size of the
140 |             blocks along that particular dimension. E.g. ``[10, 20]`` would
141 |             result in the tensor being split into chunks of size 10 along the
142 |             first dimension and 20 along the second.
143 |         gain : float, optional (default = 1.0)
144 |             The gain (scaling) applied to the orthogonal initialization.
145 |         """
146 | 
147 |         if isinstance(tensor, Variable):
148 |         # in pytorch 4.0, Variable equals Tensor
149 |         #    block_orthogonal(tensor.data, split_sizes, gain)
150 |         #else:
151 |             sizes = list(tensor.size())
152 |             if any([a % b != 0 for a, b in zip(sizes, split_sizes)]):
153 |                 raise ConfigurationError("tensor dimensions must be divisible by their respective "
154 |                                          "split_sizes. Found size: {} and split_sizes: {}".format(sizes, split_sizes))
155 |             indexes = [list(range(0, max_size, split))
156 |                        for max_size, split in zip(sizes, split_sizes)]
157 |             # Iterate over all possible blocks within the tensor.
158 |             for block_start_indices in itertools.product(*indexes):
159 |                 # A list of tuples containing the index to start at for this block
160 |                 # and the appropriate step size (i.e split_size[i] for dimension i).
161 |                 index_and_step_tuples = zip(block_start_indices, split_sizes)
162 |                 # This is a tuple of slices corresponding to:
163 |                 # tensor[index: index + step_size, ...]. This is
164 |                 # required because we could have an arbitrary number
165 |                 # of dimensions. The actual slices we need are the
166 |                 # start_index: start_index + step for each dimension in the tensor.
167 |                 block_slice = tuple([slice(start_index, start_index + step)
168 |                                      for start_index, step in index_and_step_tuples])
169 |                 tensor[block_slice] = torch.nn.init.orthogonal_(tensor[block_slice].contiguous(), gain=gain)
170 | 


--------------------------------------------------------------------------------
/elmoformanylangs/utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from __future__ import unicode_literals
 3 | import collections
 4 | import itertools
 5 | 
 6 | 
 7 | def flatten(lst):
 8 |   return list(itertools.chain.from_iterable(lst))
 9 | 
10 | 
11 | def deep_iter(x):
12 |   if isinstance(x, list) or isinstance(x, tuple):
13 |     for u in x:
14 |       for v in deep_iter(u):
15 |         yield v
16 |   else:
17 |     yield
18 | 
19 | 
20 | def dict2namedtuple(dic):
21 |   return collections.namedtuple('Namespace', dic.keys())(**dic)
22 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import setuptools
 3 | 
 4 | # read the contents of your README file
 5 | from os import path
 6 | this_directory = path.abspath(path.dirname(__file__))
 7 | with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
 8 |     long_description = f.read()
 9 | 
10 | 
11 | setuptools.setup(
12 |   name="elmoformanylangs",
13 |   version="0.0.4.post2",
14 |   packages=setuptools.find_packages(),
15 |   install_requires=[
16 |     "torch",
17 |     "h5py",
18 |     "numpy",
19 |     "overrides",
20 |   ],
21 |   package_data={'configs': ['elmoformanylangs/configs/*.json']},
22 |   include_package_data=True,
23 |   author="Research Center for Social Computing and Information Retrieval",
24 |   description="ELMo, updated to be usable with models for many languages",
25 |   long_description=long_description,
26 |   long_description_content_type='text/markdown',
27 |   url="https://github.com/HIT-SCIR/ELMoForManyLangs",
28 |   classifiers=[
29 |     "Development Status :: 3 - Alpha",
30 |     "Programming Language :: Python",
31 |     "Programming Language :: Python :: 3.6",
32 |   ],
33 | )
34 | 


--------------------------------------------------------------------------------