├── .gitignore ├── LICENSE ├── README.md ├── data └── get_evaluation.sh ├── demo.ipynb ├── evaluate.py ├── outline_all.png ├── src ├── __init__.py ├── dico_builder.py ├── dictionary.py ├── evaluation │ ├── __init__.py │ ├── evaluator.py │ ├── sent_translation.py │ ├── word_translation.py │ └── wordsim.py ├── logger.py ├── models.py ├── trainer.py └── utils.py ├── supervised.py └── unsupervised.py /.gitignore: -------------------------------------------------------------------------------- 1 | # MUSE data and result folders 2 | 3 | data/crosslingual/ 4 | data/monolingual/ 5 | dumped/ 6 | 7 | # Created by https://www.gitignore.io/api/vim,python 8 | 9 | ### Python ### 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | .hypothesis/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | 112 | ### Vim ### 113 | # swap 114 | [._]*.s[a-v][a-z] 115 | [._]*.sw[a-p] 116 | [._]s[a-v][a-z] 117 | [._]sw[a-p] 118 | # session 119 | Session.vim 120 | # temporary 121 | .netrwhist 122 | *~ 123 | # auto-generated tag files 124 | tags 125 | 126 | # End of https://www.gitignore.io/api/vim,python 127 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. 
Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial 4.0 International Public 58 | License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial 4.0 International Public License ("Public 63 | License"). To the extent this Public License may be interpreted as a 64 | contract, You are granted the Licensed Rights in consideration of Your 65 | acceptance of these terms and conditions, and the Licensor grants You 66 | such rights in consideration of benefits the Licensor receives from 67 | making the Licensed Material available under these terms and 68 | conditions. 69 | 70 | Section 1 -- Definitions. 71 | 72 | a. 
Adapted Material means material subject to Copyright and Similar 73 | Rights that is derived from or based upon the Licensed Material 74 | and in which the Licensed Material is translated, altered, 75 | arranged, transformed, or otherwise modified in a manner requiring 76 | permission under the Copyright and Similar Rights held by the 77 | Licensor. For purposes of this Public License, where the Licensed 78 | Material is a musical work, performance, or sound recording, 79 | Adapted Material is always produced where the Licensed Material is 80 | synched in timed relation with a moving image. 81 | 82 | b. Adapter's License means the license You apply to Your Copyright 83 | and Similar Rights in Your contributions to Adapted Material in 84 | accordance with the terms and conditions of this Public License. 85 | 86 | c. Copyright and Similar Rights means copyright and/or similar rights 87 | closely related to copyright including, without limitation, 88 | performance, broadcast, sound recording, and Sui Generis Database 89 | Rights, without regard to how the rights are labeled or 90 | categorized. For purposes of this Public License, the rights 91 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 92 | Rights. 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. NonCommercial means not primarily intended for or directed towards 116 | commercial advantage or monetary compensation. For purposes of 117 | this Public License, the exchange of the Licensed Material for 118 | other material subject to Copyright and Similar Rights by digital 119 | file-sharing or similar means is NonCommercial provided there is 120 | no payment of monetary compensation in connection with the 121 | exchange. 122 | 123 | j. Share means to provide material to the public by any means or 124 | process that requires permission under the Licensed Rights, such 125 | as reproduction, public display, public performance, distribution, 126 | dissemination, communication, or importation, and to make material 127 | available to the public including in ways that members of the 128 | public may access the material from a place and at a time 129 | individually chosen by them. 130 | 131 | k. Sui Generis Database Rights means rights other than copyright 132 | resulting from Directive 96/9/EC of the European Parliament and of 133 | the Council of 11 March 1996 on the legal protection of databases, 134 | as amended and/or succeeded, as well as other essentially 135 | equivalent rights anywhere in the world. 
136 | 137 | l. You means the individual or entity exercising the Licensed Rights 138 | under this Public License. Your has a corresponding meaning. 139 | 140 | Section 2 -- Scope. 141 | 142 | a. License grant. 143 | 144 | 1. Subject to the terms and conditions of this Public License, 145 | the Licensor hereby grants You a worldwide, royalty-free, 146 | non-sublicensable, non-exclusive, irrevocable license to 147 | exercise the Licensed Rights in the Licensed Material to: 148 | 149 | a. reproduce and Share the Licensed Material, in whole or 150 | in part, for NonCommercial purposes only; and 151 | 152 | b. produce, reproduce, and Share Adapted Material for 153 | NonCommercial purposes only. 154 | 155 | 2. Exceptions and Limitations. For the avoidance of doubt, where 156 | Exceptions and Limitations apply to Your use, this Public 157 | License does not apply, and You do not need to comply with 158 | its terms and conditions. 159 | 160 | 3. Term. The term of this Public License is specified in Section 161 | 6(a). 162 | 163 | 4. Media and formats; technical modifications allowed. The 164 | Licensor authorizes You to exercise the Licensed Rights in 165 | all media and formats whether now known or hereafter created, 166 | and to make technical modifications necessary to do so. The 167 | Licensor waives and/or agrees not to assert any right or 168 | authority to forbid You from making technical modifications 169 | necessary to exercise the Licensed Rights, including 170 | technical modifications necessary to circumvent Effective 171 | Technological Measures. For purposes of this Public License, 172 | simply making modifications authorized by this Section 2(a) 173 | (4) never produces Adapted Material. 174 | 175 | 5. Downstream recipients. 176 | 177 | a. Offer from the Licensor -- Licensed Material. Every 178 | recipient of the Licensed Material automatically 179 | receives an offer from the Licensor to exercise the 180 | Licensed Rights under the terms and conditions of this 181 | Public License. 182 | 183 | b. No downstream restrictions. You may not offer or impose 184 | any additional or different terms or conditions on, or 185 | apply any Effective Technological Measures to, the 186 | Licensed Material if doing so restricts exercise of the 187 | Licensed Rights by any recipient of the Licensed 188 | Material. 189 | 190 | 6. No endorsement. Nothing in this Public License constitutes or 191 | may be construed as permission to assert or imply that You 192 | are, or that Your use of the Licensed Material is, connected 193 | with, or sponsored, endorsed, or granted official status by, 194 | the Licensor or others designated to receive attribution as 195 | provided in Section 3(a)(1)(A)(i). 196 | 197 | b. Other rights. 198 | 199 | 1. Moral rights, such as the right of integrity, are not 200 | licensed under this Public License, nor are publicity, 201 | privacy, and/or other similar personality rights; however, to 202 | the extent possible, the Licensor waives and/or agrees not to 203 | assert any such rights held by the Licensor to the limited 204 | extent necessary to allow You to exercise the Licensed 205 | Rights, but not otherwise. 206 | 207 | 2. Patent and trademark rights are not licensed under this 208 | Public License. 209 | 210 | 3. 
To the extent possible, the Licensor waives any right to 211 | collect royalties from You for the exercise of the Licensed 212 | Rights, whether directly or through a collecting society 213 | under any voluntary or waivable statutory or compulsory 214 | licensing scheme. In all other cases the Licensor expressly 215 | reserves any right to collect such royalties, including when 216 | the Licensed Material is used other than for NonCommercial 217 | purposes. 218 | 219 | Section 3 -- License Conditions. 220 | 221 | Your exercise of the Licensed Rights is expressly made subject to the 222 | following conditions. 223 | 224 | a. Attribution. 225 | 226 | 1. If You Share the Licensed Material (including in modified 227 | form), You must: 228 | 229 | a. retain the following if it is supplied by the Licensor 230 | with the Licensed Material: 231 | 232 | i. identification of the creator(s) of the Licensed 233 | Material and any others designated to receive 234 | attribution, in any reasonable manner requested by 235 | the Licensor (including by pseudonym if 236 | designated); 237 | 238 | ii. a copyright notice; 239 | 240 | iii. a notice that refers to this Public License; 241 | 242 | iv. a notice that refers to the disclaimer of 243 | warranties; 244 | 245 | v. a URI or hyperlink to the Licensed Material to the 246 | extent reasonably practicable; 247 | 248 | b. indicate if You modified the Licensed Material and 249 | retain an indication of any previous modifications; and 250 | 251 | c. indicate the Licensed Material is licensed under this 252 | Public License, and include the text of, or the URI or 253 | hyperlink to, this Public License. 254 | 255 | 2. You may satisfy the conditions in Section 3(a)(1) in any 256 | reasonable manner based on the medium, means, and context in 257 | which You Share the Licensed Material. For example, it may be 258 | reasonable to satisfy the conditions by providing a URI or 259 | hyperlink to a resource that includes the required 260 | information. 261 | 262 | 3. If requested by the Licensor, You must remove any of the 263 | information required by Section 3(a)(1)(A) to the extent 264 | reasonably practicable. 265 | 266 | 4. If You Share Adapted Material You produce, the Adapter's 267 | License You apply must not prevent recipients of the Adapted 268 | Material from complying with this Public License. 269 | 270 | Section 4 -- Sui Generis Database Rights. 271 | 272 | Where the Licensed Rights include Sui Generis Database Rights that 273 | apply to Your use of the Licensed Material: 274 | 275 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 276 | to extract, reuse, reproduce, and Share all or a substantial 277 | portion of the contents of the database for NonCommercial purposes 278 | only; 279 | 280 | b. if You include all or a substantial portion of the database 281 | contents in a database in which You have Sui Generis Database 282 | Rights, then the database in which You have Sui Generis Database 283 | Rights (but not its individual contents) is Adapted Material; and 284 | 285 | c. You must comply with the conditions in Section 3(a) if You Share 286 | all or a substantial portion of the contents of the database. 287 | 288 | For the avoidance of doubt, this Section 4 supplements and does not 289 | replace Your obligations under this Public License where the Licensed 290 | Rights include other Copyright and Similar Rights. 291 | 292 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 293 | 294 | a. 
UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 295 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 296 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 297 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 298 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 299 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 300 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 301 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 302 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 303 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 304 | 305 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 306 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 307 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 308 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 309 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 310 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 311 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 312 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 313 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 314 | 315 | c. The disclaimer of warranties and limitation of liability provided 316 | above shall be interpreted in a manner that, to the extent 317 | possible, most closely approximates an absolute disclaimer and 318 | waiver of all liability. 319 | 320 | Section 6 -- Term and Termination. 321 | 322 | a. This Public License applies for the term of the Copyright and 323 | Similar Rights licensed here. However, if You fail to comply with 324 | this Public License, then Your rights under this Public License 325 | terminate automatically. 326 | 327 | b. Where Your right to use the Licensed Material has terminated under 328 | Section 6(a), it reinstates: 329 | 330 | 1. automatically as of the date the violation is cured, provided 331 | it is cured within 30 days of Your discovery of the 332 | violation; or 333 | 334 | 2. upon express reinstatement by the Licensor. 335 | 336 | For the avoidance of doubt, this Section 6(b) does not affect any 337 | right the Licensor may have to seek remedies for Your violations 338 | of this Public License. 339 | 340 | c. For the avoidance of doubt, the Licensor may also offer the 341 | Licensed Material under separate terms or conditions or stop 342 | distributing the Licensed Material at any time; however, doing so 343 | will not terminate this Public License. 344 | 345 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 346 | License. 347 | 348 | Section 7 -- Other Terms and Conditions. 349 | 350 | a. The Licensor shall not be bound by any additional or different 351 | terms or conditions communicated by You unless expressly agreed. 352 | 353 | b. Any arrangements, understandings, or agreements regarding the 354 | Licensed Material not stated herein are separate from and 355 | independent of the terms and conditions of this Public License. 356 | 357 | Section 8 -- Interpretation. 358 | 359 | a. For the avoidance of doubt, this Public License does not, and 360 | shall not be interpreted to, reduce, limit, restrict, or impose 361 | conditions on any use of the Licensed Material that could lawfully 362 | be made without permission under this Public License. 363 | 364 | b. 
To the extent possible, if any provision of this Public License is 365 | deemed unenforceable, it shall be automatically reformed to the 366 | minimum extent necessary to make it enforceable. If the provision 367 | cannot be reformed, it shall be severed from this Public License 368 | without affecting the enforceability of the remaining terms and 369 | conditions. 370 | 371 | c. No term or condition of this Public License will be waived and no 372 | failure to comply consented to unless expressly agreed to by the 373 | Licensor. 374 | 375 | d. Nothing in this Public License constitutes or may be interpreted 376 | as a limitation upon, or waiver of, any privileges and immunities 377 | that apply to the Licensor or You, including from the legal 378 | processes of any jurisdiction or authority. 379 | 380 | ======================================================================= 381 | 382 | Creative Commons is not a party to its public 383 | licenses. Notwithstanding, Creative Commons may elect to apply one of 384 | its public licenses to material it publishes and in those instances 385 | will be considered the “Licensor.” The text of the Creative Commons 386 | public licenses is dedicated to the public domain under the CC0 Public 387 | Domain Dedication. Except for the limited purpose of indicating that 388 | material is shared under a Creative Commons public license or as 389 | otherwise permitted by the Creative Commons policies published at 390 | creativecommons.org/policies, Creative Commons does not authorize the 391 | use of the trademark "Creative Commons" or any other trademark or logo 392 | of Creative Commons without its prior written consent including, 393 | without limitation, in connection with any unauthorized modifications 394 | to any of its public licenses or any other arrangements, 395 | understandings, or agreements concerning use of licensed material. For 396 | the avoidance of doubt, this paragraph does not form part of the 397 | public licenses. 398 | 399 | Creative Commons may be contacted at creativecommons.org. 400 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## MUSE: Multilingual Unsupervised and Supervised Embeddings 2 |  3 | 4 | MUSE is a Python library for *multilingual word embeddings*, whose goal is to provide the community with: 5 | * state-of-the-art multilingual word embeddings ([fastText](https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md) embeddings aligned in a common space) 6 | * large-scale high-quality bilingual dictionaries for training and evaluation 7 | 8 | We include two methods, one *supervised* that uses a bilingual dictionary or identical character strings, and one *unsupervised* that does not use any parallel data (see [Word Translation without Parallel Data](https://arxiv.org/pdf/1710.04087.pdf) for more details). 9 | 10 | ## Dependencies 11 | * Python 2/3 with [NumPy](http://www.numpy.org/)/[SciPy](https://www.scipy.org/) 12 | * [PyTorch](http://pytorch.org/) 13 | * [Faiss](https://github.com/facebookresearch/faiss) (recommended) for fast nearest neighbor search (CPU or GPU). 14 | 15 | MUSE is available on CPU or GPU, in Python 2 or 3. Faiss is *optional* for GPU users - though Faiss-GPU will greatly speed up nearest neighbor search - and *highly recommended* for CPU users. Faiss can be installed using "conda install faiss-cpu -c pytorch" or "conda install faiss-gpu -c pytorch". 
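As a quick sanity check that the dependencies above are importable, you can run a small snippet like the following (illustration only, not part of the repository; Faiss is optional, so its absence only means slower nearest neighbor search):

```python
# Minimal dependency check: verifies NumPy/SciPy/PyTorch and (optionally) Faiss are importable.
import numpy    # noqa: F401
import scipy    # noqa: F401
import torch

print("PyTorch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
try:
    import faiss  # optional, but highly recommended on CPU
    print("Faiss available:", getattr(faiss, "__version__", "version unknown"))
except ImportError:
    print("Faiss not found: nearest neighbor search will be slower.")
```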
16 | 17 | ## Get evaluation datasets 18 | To download monolingual and cross-lingual word embedding evaluation datasets: 19 | * Our 110 [bilingual dictionaries](https://github.com/facebookresearch/MUSE#ground-truth-bilingual-dictionaries) 20 | * 28 monolingual word similarity tasks for 6 languages, and the English word analogy task 21 | * Cross-lingual word similarity tasks from [SemEval2017](http://alt.qcri.org/semeval2017/task2/) 22 | * Sentence translation retrieval with [Europarl](http://www.statmt.org/europarl/) corpora 23 | 24 | You can simply run: 25 | 26 | ```bash 27 | cd data/ 28 | wget https://dl.fbaipublicfiles.com/arrival/vectors.tar.gz 29 | wget https://dl.fbaipublicfiles.com/arrival/wordsim.tar.gz 30 | wget https://dl.fbaipublicfiles.com/arrival/dictionaries.tar.gz 31 | ``` 32 | 33 | Alternatively, you can download the data with: 34 | 35 | ```bash 36 | cd data/ 37 | ./get_evaluation.sh 38 | ``` 39 | 40 | *Note: Requires bash 4. The Europarl download is disabled by default (it is slow); you can enable it [here](https://github.com/facebookresearch/MUSE/blob/master/data/get_evaluation.sh#L99-L100).* 41 | 42 | ## Get monolingual word embeddings 43 | For pre-trained monolingual word embeddings, we highly recommend [fastText Wikipedia embeddings](https://fasttext.cc/docs/en/pretrained-vectors.html), or using [fastText](https://github.com/facebookresearch/fastText) to train your own word embeddings from your corpus. 44 | 45 | You can download the English (en) and Spanish (es) embeddings this way: 46 | ```bash 47 | # English fastText Wikipedia embeddings 48 | curl -Lo data/wiki.en.vec https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec 49 | # Spanish fastText Wikipedia embeddings 50 | curl -Lo data/wiki.es.vec https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.es.vec 51 | ``` 52 | 53 | ## Align monolingual word embeddings 54 | This project includes two ways to obtain cross-lingual word embeddings: 55 | * **Supervised**: using a bilingual training dictionary (or identical character strings as anchor points), learn a mapping from the source to the target space using (iterative) [Procrustes](https://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem) alignment. 56 | * **Unsupervised**: without any parallel data or anchor points, learn a mapping from the source to the target space using adversarial training and (iterative) Procrustes refinement. 57 | 58 | For more details on these approaches, please check [here](https://arxiv.org/pdf/1710.04087.pdf). 59 | 60 | ### The supervised way: iterative Procrustes (CPU|GPU) 61 | To learn a mapping between the source and the target space, simply run: 62 | ```bash 63 | python supervised.py --src_lang en --tgt_lang es --src_emb data/wiki.en.vec --tgt_emb data/wiki.es.vec --n_refinement 5 --dico_train default 64 | ``` 65 | By default, *dico_train* points to our ground-truth dictionaries (downloaded above); when set to "identical_char", it uses identical character strings between the source and target languages to form a seed vocabulary. Logs and embeddings will be saved in the `dumped/` directory.
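For intuition, the closed-form step behind this supervised mode can be sketched in a few lines of NumPy (a simplified illustration with random data, not the actual implementation in the `src/` package): given row-wise matrices X and Y holding the source and target embeddings of the training dictionary pairs, the orthogonal mapping W minimizing ||XW - Y||_F is U @ Vt, where U, S, Vt = SVD(X^T Y). Roughly speaking, `--n_refinement 5` repeats this step: after each solve, a new synthetic dictionary is induced from the aligned embeddings and the mapping is re-estimated.

```python
# Simplified sketch of one Procrustes step (illustration only, not MUSE's actual code).
import numpy as np

def procrustes_step(X, Y):
    """X, Y: (n_pairs, dim) source/target embeddings of the seed dictionary pairs."""
    U, _, Vt = np.linalg.svd(X.T @ Y)   # SVD of the cross-covariance matrix
    return U @ Vt                       # best orthogonal mapping for ||X W - Y||_F

# Hypothetical usage with random vectors standing in for 300-d fastText embeddings:
rng = np.random.RandomState(0)
X = rng.randn(5000, 300)   # source-side dictionary entries
Y = rng.randn(5000, 300)   # target-side dictionary entries
W = procrustes_step(X, Y)
X_aligned = X @ W          # source embeddings mapped into the target space
```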
66 | 67 | ### The unsupervised way: adversarial training and refinement (CPU|GPU) 68 | To learn a mapping using adversarial training and iterative Procrustes refinement, run: 69 | ```bash 70 | python unsupervised.py --src_lang en --tgt_lang es --src_emb data/wiki.en.vec --tgt_emb data/wiki.es.vec --n_refinement 5 71 | ``` 72 | By default, the validation metric is the mean cosine similarity of word pairs from a synthetic dictionary built with CSLS (Cross-domain Similarity Local Scaling). For some language pairs (e.g. En-Zh), 73 | we recommend centering the embeddings using `--normalize_embeddings center`. 74 | 75 | ### Evaluate monolingual or cross-lingual embeddings (CPU|GPU) 76 | We also include a simple script to evaluate the quality of monolingual or cross-lingual word embeddings on several tasks: 77 | 78 | **Monolingual** 79 | ```bash 80 | python evaluate.py --src_lang en --src_emb data/wiki.en.vec --max_vocab 200000 81 | ``` 82 | 83 | **Cross-lingual** 84 | ```bash 85 | python evaluate.py --src_lang en --tgt_lang es --src_emb data/wiki.en-es.en.vec --tgt_emb data/wiki.en-es.es.vec --max_vocab 200000 86 | ``` 87 | 88 | ## Word embedding format 89 | By default, the aligned embeddings are exported to a text format at the end of experiments: `--export txt`. Exporting embeddings to a text file can take a while if you have a lot of embeddings. For a very fast export, you can set `--export pth` to export the embeddings in a PyTorch binary file, or simply disable the export (`--export ""`). 90 | 91 | When loading embeddings, the model can load: 92 | * PyTorch binary files previously generated by MUSE (.pth files) 93 | * fastText binary files previously generated by fastText (.bin files) 94 | * text files (one word embedding per line) 95 | 96 | The first two options are very fast and can load 1 million embeddings in a few seconds, while loading text files can take a while. 97 | 98 | ## Download 99 | We provide multilingual embeddings and ground-truth bilingual dictionaries. These embeddings are fastText embeddings that have been aligned in a common space. 100 | 101 | ### Multilingual word embeddings 102 | We release fastText Wikipedia **supervised** word embeddings for **30** languages, aligned in a **single vector space**.
103 | 104 | | | | | | | | 105 | |---|---|---|---|---|---| 106 | | Arabic: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.ar.vec) | Bulgarian: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.bg.vec) | Catalan: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.ca.vec) | Croatian: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.hr.vec) | Czech: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.cs.vec) | Danish: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.da.vec) 107 | | Dutch: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.nl.vec) | English: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.en.vec) | Estonian: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.et.vec) | Finnish: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.fi.vec) | French: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.fr.vec) | German: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.de.vec) 108 | | Greek: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.el.vec) | Hebrew: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.he.vec) | Hungarian: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.hu.vec) | Indonesian: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.id.vec) | Italian: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.it.vec) | Macedonian: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.mk.vec) 109 | | Norwegian: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.no.vec) | Polish: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.pl.vec) | Portuguese: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.pt.vec) | Romanian: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.ro.vec) | Russian: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.ru.vec) | Slovak: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.sk.vec) 110 | | Slovenian: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.sl.vec) | Spanish: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.es.vec) | Swedish: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.sv.vec) | Turkish: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.tr.vec) | Ukrainian: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.uk.vec) | Vietnamese: [*text*](https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.vi.vec) 111 | 112 | You can visualize crosslingual nearest neighbors using [**demo.ipynb**](https://github.com/facebookresearch/MUSE/blob/master/demo.ipynb). 113 | 114 | 115 | ### Ground-truth bilingual dictionaries 116 | We created **110 large-scale ground-truth bilingual dictionaries** using an internal translation tool. The dictionaries handle well the polysemy of words. We provide a train and test split of 5000 and 1500 unique source words, as well as a larger set of up to 100k pairs. Our goal is to *ease the development and the evaluation of cross-lingual word embeddings and multilingual NLP*. 
117 | 118 | **European languages in every direction** 119 | 120 | | src-tgt | German | English | Spanish | French | Italian | Portuguese | 121 | |:----------:|:------:|:-------:|:-------:|:------:|:-------:|:----------:| 122 | | German | - |[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-en.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-es.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-es.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-es.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-fr.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-fr.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-fr.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-it.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-it.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-it.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-pt.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-pt.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-pt.5000-6500.txt)| 123 | | English |[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-de.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-de.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-de.5000-6500.txt)| - |[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-es.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-es.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-es.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-fr.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-fr.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-fr.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-it.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-it.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-it.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-pt.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-pt.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-pt.5000-6500.txt)| 124 | | Spanish |[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-de.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-de.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-de.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-en.5000-6500.txt)| - |[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-fr.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-fr.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-fr.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-it.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-it.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-it.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-pt.txt) 
[train](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-pt.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-pt.5000-6500.txt)| 125 | | French |[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-de.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-de.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-de.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-en.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-es.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-es.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-es.5000-6500.txt)| - |[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-it.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-it.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-it.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-pt.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-pt.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-pt.5000-6500.txt)| 126 | | Italian |[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-de.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-de.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-de.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-en.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-es.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-es.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-es.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-fr.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-fr.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-fr.5000-6500.txt)| - |[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-pt.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-pt.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-pt.5000-6500.txt)| 127 | | Portuguese |[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-de.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-de.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-de.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-en.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-es.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-es.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-es.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-fr.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-fr.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-fr.5000-6500.txt)|[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-it.txt) 
[train](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-it.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-it.5000-6500.txt)| - | 128 | 129 | 130 | **Other languages to English (e.g. {fr,es}-en)** 131 | 132 | ||||| 133 | |-|-|-|-| 134 | | Afrikaans: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/af-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/af-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/af-en.5000-6500.txt) | Albanian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/sq-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/sq-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/sq-en.5000-6500.txt) | Arabic: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/ar-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/ar-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/ar-en.5000-6500.txt) | Bengali: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/bn-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/bn-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/bn-en.5000-6500.txt) 135 | | Bosnian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/bs-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/bs-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/bs-en.5000-6500.txt) | Bulgarian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/bg-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/bg-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/bg-en.5000-6500.txt) | Catalan: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/ca-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/ca-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/ca-en.5000-6500.txt) | Chinese: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/zh-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/zh-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/zh-en.5000-6500.txt) 136 | | Croatian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/hr-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/hr-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/hr-en.5000-6500.txt) | Czech: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/cs-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/cs-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/cs-en.5000-6500.txt) | Danish: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/da-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/da-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/da-en.5000-6500.txt) | Dutch: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/nl-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/nl-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/nl-en.5000-6500.txt) 137 | | English: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-en.5000-6500.txt) | Estonian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/et-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/et-en.0-5000.txt) 
[test](https://dl.fbaipublicfiles.com/arrival/dictionaries/et-en.5000-6500.txt) | Filipino: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/tl-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/tl-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/tl-en.5000-6500.txt) | Finnish: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/fi-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/fi-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/fi-en.5000-6500.txt) 138 | | French: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/fr-en.5000-6500.txt) | German: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/de-en.5000-6500.txt) | Greek: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/el-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/el-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/el-en.5000-6500.txt) | Hebrew: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/he-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/he-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/he-en.5000-6500.txt) 139 | | Hindi: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/hi-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/hi-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/hi-en.5000-6500.txt) | Hungarian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/hu-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/hu-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/hu-en.5000-6500.txt) | Indonesian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/id-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/id-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/id-en.5000-6500.txt) | Italian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/it-en.5000-6500.txt) 140 | | Japanese: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/ja-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/ja-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/ja-en.5000-6500.txt) | Korean: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/ko-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/ko-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/ko-en.5000-6500.txt) | Latvian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/lv-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/lv-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/lv-en.5000-6500.txt) | Littuanian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/lt-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/lt-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/lt-en.5000-6500.txt) 141 | | Macedonian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/mk-en.txt) 
[train](https://dl.fbaipublicfiles.com/arrival/dictionaries/mk-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/mk-en.5000-6500.txt) | Malay: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/ms-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/ms-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/ms-en.5000-6500.txt) | Norwegian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/no-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/no-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/no-en.5000-6500.txt) | Persian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/fa-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/fa-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/fa-en.5000-6500.txt) 142 | | Polish: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/pl-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/pl-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/pl-en.5000-6500.txt) | Portuguese: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/pt-en.5000-6500.txt) | Romanian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/ro-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/ro-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/ro-en.5000-6500.txt) | Russian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/ru-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/ru-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/ru-en.5000-6500.txt) 143 | | Slovak: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/sk-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/sk-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/sk-en.5000-6500.txt) | Slovenian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/sl-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/sl-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/sl-en.5000-6500.txt) | Spanish: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/es-en.5000-6500.txt) | Swedish: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/sv-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/sv-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/sv-en.5000-6500.txt) 144 | | Tamil: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/ta-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/ta-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/ta-en.5000-6500.txt) | Thai: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/th-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/th-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/th-en.5000-6500.txt) | Turkish: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/tr-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/tr-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/tr-en.5000-6500.txt) | Ukrainian: 
[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/uk-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/uk-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/uk-en.5000-6500.txt) 145 | | Vietnamese: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/vi-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/vi-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/vi-en.5000-6500.txt) 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | **English to other languages (e.g. en-{fr,es})** 156 | 157 | ||||| 158 | |-|-|-|-| 159 | | Afrikaans: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-af.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-af.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-af.5000-6500.txt) | Albanian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-sq.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-sq.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-sq.5000-6500.txt) | Arabic: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ar.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ar.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ar.5000-6500.txt) | Bengali: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-bn.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-bn.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-bn.5000-6500.txt) 160 | | Bosnian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-bs.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-bs.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-bs.5000-6500.txt) | Bulgarian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-bg.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-bg.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-bg.5000-6500.txt) | Catalan: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ca.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ca.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ca.5000-6500.txt) | Chinese: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-zh.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-zh.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-zh.5000-6500.txt) 161 | | Croatian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hr.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hr.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hr.5000-6500.txt) | Czech: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-cs.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-cs.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-cs.5000-6500.txt) | Danish: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-da.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-da.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-da.5000-6500.txt) | Dutch: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-nl.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-nl.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-nl.5000-6500.txt) 162 | | English: 
[full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-en.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-en.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-en.5000-6500.txt) | Estonian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-et.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-et.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-et.5000-6500.txt) | Filipino: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-tl.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-tl.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-tl.5000-6500.txt) | Finnish: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-fi.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-fi.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-fi.5000-6500.txt) 163 | | French: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-fr.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-fr.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-fr.5000-6500.txt) | German: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-de.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-de.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-de.5000-6500.txt) | Greek: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-el.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-el.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-el.5000-6500.txt) | Hebrew: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-he.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-he.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-he.5000-6500.txt) 164 | | Hindi: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.5000-6500.txt) | Hungarian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hu.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hu.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hu.5000-6500.txt) | Indonesian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-id.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-id.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-id.5000-6500.txt) | Italian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-it.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-it.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-it.5000-6500.txt) 165 | | Japanese: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ja.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ja.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ja.5000-6500.txt) | Korean: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ko.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ko.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ko.5000-6500.txt) | Latvian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-lv.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-lv.0-5000.txt) 
[test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-lv.5000-6500.txt) | Lithuanian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-lt.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-lt.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-lt.5000-6500.txt) 166 | | Macedonian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-mk.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-mk.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-mk.5000-6500.txt) | Malay: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ms.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ms.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ms.5000-6500.txt) | Norwegian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-no.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-no.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-no.5000-6500.txt) | Persian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-fa.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-fa.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-fa.5000-6500.txt) 167 | | Polish: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-pl.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-pl.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-pl.5000-6500.txt) | Portuguese: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-pt.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-pt.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-pt.5000-6500.txt) | Romanian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ro.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ro.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ro.5000-6500.txt) | Russian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ru.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ru.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ru.5000-6500.txt) 168 | | Slovak: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-sk.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-sk.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-sk.5000-6500.txt) | Slovenian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-sl.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-sl.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-sl.5000-6500.txt) | Spanish: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-es.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-es.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-es.5000-6500.txt) | Swedish: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-sv.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-sv.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-sv.5000-6500.txt) 169 | | Tamil: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ta.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ta.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ta.5000-6500.txt) | Thai: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-th.txt) 
[train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-th.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-th.5000-6500.txt) | Turkish: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-tr.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-tr.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-tr.5000-6500.txt) | Ukrainian: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-uk.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-uk.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-uk.5000-6500.txt) 170 | | Vietnamese: [full](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-vi.txt) [train](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-vi.0-5000.txt) [test](https://dl.fbaipublicfiles.com/arrival/dictionaries/en-vi.5000-6500.txt) 171 | 172 | 173 | 174 | ## References 175 | Please cite [[1]](https://arxiv.org/pdf/1710.04087.pdf) if you found the resources in this repository useful. 176 | 177 | ### Word Translation Without Parallel Data 178 | 179 | [1] A. Conneau\*, G. Lample\*, L. Denoyer, MA. Ranzato, H. Jégou, [*Word Translation Without Parallel Data*](https://arxiv.org/pdf/1710.04087.pdf) 180 | 181 | \* Equal contribution. Order has been determined with a coin flip. 182 | ``` 183 | @article{conneau2017word, 184 | title={Word Translation Without Parallel Data}, 185 | author={Conneau, Alexis and Lample, Guillaume and Ranzato, Marc'Aurelio and Denoyer, Ludovic and J{\'e}gou, Herv{\'e}}, 186 | journal={arXiv preprint arXiv:1710.04087}, 187 | year={2017} 188 | } 189 | ``` 190 | 191 | MUSE is the project at the origin of the work on unsupervised machine translation with monolingual data only [[2]](https://arxiv.org/abs/1711.00043). 192 | 193 | ### Unsupervised Machine Translation With Monolingual Data Only 194 | 195 | [2] G. Lample, A. Conneau, L. Denoyer, MA. Ranzato [*Unsupervised Machine Translation With Monolingual Data Only*](https://arxiv.org/abs/1711.00043) 196 | 197 | ``` 198 | @article{lample2017unsupervised, 199 | title={Unsupervised Machine Translation Using Monolingual Corpora Only}, 200 | author={Lample, Guillaume and Conneau, Alexis and Denoyer, Ludovic and Ranzato, Marc'Aurelio}, 201 | journal={arXiv preprint arXiv:1711.00043}, 202 | year={2017} 203 | } 204 | ``` 205 | 206 | ### Related work 207 | * [T. Mikolov, Q. V Le, I. Sutskever - Exploiting similarities among languages for machine translation, 2013](https://arxiv.org/abs/1309.4168) 208 | * [G. Dinu, A. Lazaridou, M. Baroni - Improving zero-shot learning by mitigating the hubness problem, 2015](https://arxiv.org/abs/1412.6568) 209 | * [S. L Smith, D. HP Turban, S. Hamblin, N. Y Hammerla - Offline bilingual word vectors, orthogonal transformations and the inverted softmax, 2017](https://arxiv.org/abs/1702.03859) 210 | * [M. Artetxe, G. Labaka, E. Agirre - Learning bilingual word embeddings with (almost) no bilingual data, 2017](https://aclanthology.coli.uni-saarland.de/papers/P17-1042/p17-1042) 211 | * [M. Zhang, Y. Liu, H. Luan, and M. Sun - Adversarial training for unsupervised bilingual lexicon induction, 2017](https://aclanthology.coli.uni-saarland.de/papers/P17-1179/p17-1179) 212 | * [Y. Hoshen, L. Wolf - An Iterative Closest Point Method for Unsupervised Word Translation, 2018](https://arxiv.org/abs/1801.06126) 213 | * [A. Joulin, P. Bojanowski, T. Mikolov, E. 
Grave - Improving supervised bilingual mapping of word embeddings, 2018](https://arxiv.org/abs/1804.07745) 214 | * [E. Grave, A. Joulin, Q. Berthet - Unsupervised Alignment of Embeddings with Wasserstein Procrustes, 2018](https://arxiv.org/abs/1805.11222) 215 | 216 | Contact: [gl@fb.com](mailto:gl@fb.com) [aconneau@fb.com](mailto:aconneau@fb.com) 217 | -------------------------------------------------------------------------------- /data/get_evaluation.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | en_analogy='https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip' 9 | dl_path='https://dl.fbaipublicfiles.com/arrival' 10 | semeval_2017='http://alt.qcri.org/semeval2017/task2/data/uploads' 11 | europarl='http://www.statmt.org/europarl/v7' 12 | 13 | declare -A wordsim_lg 14 | wordsim_lg=(["en"]="EN_MC-30.txt EN_MTurk-287.txt EN_RG-65.txt EN_VERB-143.txt EN_WS-353-REL.txt EN_YP-130.txt EN_MEN-TR-3k.txt EN_MTurk-771.txt EN_RW-STANFORD.txt EN_SIMLEX-999.txt EN_WS-353-ALL.txt EN_WS-353-SIM.txt" ["es"]="ES_MC-30.txt ES_RG-65.txt ES_WS-353.txt" ["de"]="DE_GUR350.txt DE_GUR65.txt DE_SIMLEX-999.txt DE_WS-353.txt DE_ZG222.txt" ["fr"]="FR_RG-65.txt" ["it"]="IT_SIMLEX-999.txt IT_WS-353.txt") 15 | 16 | mkdir monolingual crosslingual 17 | 18 | ## English word analogy task 19 | curl -Lo source-archive.zip $en_analogy 20 | mkdir -p monolingual/en/ 21 | unzip -p source-archive.zip word2vec/trunk/questions-words.txt > monolingual/en/questions-words.txt 22 | rm source-archive.zip 23 | 24 | 25 | ## Downloading en-{} or {}-en dictionaries 26 | lgs="af ar bg bn bs ca cs da de el en es et fa fi fr he hi hr hu id it ja ko lt lv mk ms nl no pl pt ro ru sk sl sq sv ta th tl tr uk vi zh" 27 | mkdir -p crosslingual/dictionaries/ 28 | for lg in ${lgs} 29 | do 30 | for suffix in .txt .0-5000.txt .5000-6500.txt 31 | do 32 | fname=en-$lg$suffix 33 | curl -Lo crosslingual/dictionaries/$fname $dl_path/dictionaries/$fname 34 | fname=$lg-en$suffix 35 | curl -Lo crosslingual/dictionaries/$fname $dl_path/dictionaries/$fname 36 | done 37 | done 38 | 39 | ## Download European dictionaries 40 | for src_lg in de es fr it pt 41 | do 42 | for tgt_lg in de es fr it pt 43 | do 44 | if [ $src_lg != $tgt_lg ] 45 | then 46 | for suffix in .txt .0-5000.txt .5000-6500.txt 47 | do 48 | fname=$src_lg-$tgt_lg$suffix 49 | curl -Lo crosslingual/dictionaries/$fname $dl_path/dictionaries/$fname 50 | done 51 | fi 52 | done 53 | done 54 | 55 | ## Download Dinu et al. 
dictionaries 56 | for fname in OPUS_en_it_europarl_train_5K.txt OPUS_en_it_europarl_test.txt 57 | do 58 | echo $fname 59 | curl -Lo crosslingual/dictionaries/$fname $dl_path/dictionaries/$fname 60 | done 61 | 62 | ## Monolingual wordsim tasks 63 | for lang in "${!wordsim_lg[@]}" 64 | do 65 | echo $lang 66 | mkdir monolingual/$lang 67 | for wsim in ${wordsim_lg[$lang]} 68 | do 69 | echo $wsim 70 | curl -Lo monolingual/$lang/$wsim $dl_path/$lang/$wsim 71 | done 72 | done 73 | 74 | ## SemEval 2017 monolingual and cross-lingual wordsim tasks 75 | # 1) Task1: monolingual 76 | curl -Lo semeval2017-task2.zip $semeval_2017/semeval2017-task2.zip 77 | unzip semeval2017-task2.zip 78 | 79 | fdir='SemEval17-Task2/test/subtask1-monolingual' 80 | for lang in en es de fa it 81 | do 82 | mkdir -p monolingual/$lang 83 | uplang=`echo $lang | awk '{print toupper($0)}'` 84 | paste $fdir/data/$lang.test.data.txt $fdir/keys/$lang.test.gold.txt > monolingual/$lang/${uplang}_SEMEVAL17.txt 85 | done 86 | 87 | # 2) Task2: cross-lingual 88 | mkdir -p crosslingual/wordsim 89 | fdir='SemEval17-Task2/test/subtask2-crosslingual' 90 | for lg_pair in de-es de-fa de-it en-de en-es en-fa en-it es-fa es-it it-fa 91 | do 92 | echo $lg_pair 93 | paste $fdir/data/$lg_pair.test.data.txt $fdir/keys/$lg_pair.test.gold.txt > crosslingual/wordsim/$lg_pair-SEMEVAL17.txt 94 | done 95 | rm semeval2017-task2.zip 96 | rm -r SemEval17-Task2/ 97 | 98 | ## Europarl for sentence retrieval 99 | # TODO: set to true to activate download of Europarl (slow) 100 | if false; then 101 | mkdir -p crosslingual/europarl 102 | # Tokenize EUROPARL with MOSES 103 | echo 'Cloning Moses github repository (for tokenization scripts)...' 104 | git clone https://github.com/moses-smt/mosesdecoder.git 105 | SCRIPTS=mosesdecoder/scripts 106 | TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl 107 | 108 | for lg_pair in it-en # es-en etc 109 | do 110 | curl -Lo $lg_pair.tgz $europarl/$lg_pair.tgz 111 | tar -xvf $lg_pair.tgz 112 | rm $lg_pair.tgz 113 | lgs=(${lg_pair//-/ }) 114 | for lg in ${lgs[0]} ${lgs[1]} 115 | do 116 | cat europarl-v7.$lg_pair.$lg | $TOKENIZER -threads 8 -l $lg -no-escape > euro.$lg.txt 117 | rm europarl-v7.$lg_pair.$lg 118 | done 119 | 120 | paste euro.${lgs[0]}.txt euro.${lgs[1]}.txt | shuf > euro.paste.txt 121 | rm euro.${lgs[0]}.txt euro.${lgs[1]}.txt 122 | 123 | cut -f1 euro.paste.txt > crosslingual/europarl/europarl-v7.$lg_pair.${lgs[0]} 124 | cut -f2 euro.paste.txt > crosslingual/europarl/europarl-v7.$lg_pair.${lgs[1]} 125 | rm euro.paste.txt 126 | done 127 | 128 | rm -rf mosesdecoder 129 | fi 130 | -------------------------------------------------------------------------------- /demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Loading word embeddings" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import io\n", 19 | "import numpy as np" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "def load_vec(emb_path, nmax=50000):\n", 31 | " vectors = []\n", 32 | " word2id = {}\n", 33 | " with io.open(emb_path, 'r', encoding='utf-8', newline='\\n', errors='ignore') as f:\n", 34 | " next(f)\n", 35 | " for i, line in enumerate(f):\n", 36 | " word, vect = line.rstrip().split(' ', 
1)\n", 37 | " vect = np.fromstring(vect, sep=' ')\n", 38 | " assert word not in word2id, 'word found twice'\n", 39 | " vectors.append(vect)\n", 40 | " word2id[word] = len(word2id)\n", 41 | " if len(word2id) == nmax:\n", 42 | " break\n", 43 | " id2word = {v: k for k, v in word2id.items()}\n", 44 | " embeddings = np.vstack(vectors)\n", 45 | " return embeddings, id2word, word2id" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": { 52 | "collapsed": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "src_path = '/private/home/aconneau/projects/MUSE/dumped/debug/8gqhcyvoto/vectors-en.txt'\n", 57 | "tgt_path = '/private/home/aconneau/projects/MUSE/dumped/debug/8gqhcyvoto/vectors-es.txt'\n", 58 | "nmax = 50000 # maximum number of word embeddings to load\n", 59 | "\n", 60 | "src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)\n", 61 | "tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "# Get nearest neighbors" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "def get_nn(word, src_emb, src_id2word, tgt_emb, tgt_id2word, K=5):\n", 80 | " print(\"Nearest neighbors of \\\"%s\\\":\" % word)\n", 81 | " word2id = {v: k for k, v in src_id2word.items()}\n", 82 | " word_emb = src_emb[word2id[word]]\n", 83 | " scores = (tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]).dot(word_emb / np.linalg.norm(word_emb))\n", 84 | " k_best = scores.argsort()[-K:][::-1]\n", 85 | " for i, idx in enumerate(k_best):\n", 86 | " print('%.4f - %s' % (scores[idx], tgt_id2word[idx]))" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "Nearest neighbors of \"cat\":\n", 99 | "1.0000 - cat\n", 100 | "0.7322 - cats\n", 101 | "0.6453 - kitten\n", 102 | "0.6381 - dog\n", 103 | "0.6218 - kittens\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "# printing nearest neighbors in the source space\n", 109 | "src_word = 'cat'\n", 110 | "get_nn(src_word, src_embeddings, src_id2word, src_embeddings, src_id2word, K=5)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 6, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "Nearest neighbors of \"cat\":\n", 123 | "0.6201 - gato\n", 124 | "0.5380 - perro\n", 125 | "0.4922 - gorila\n", 126 | "0.4809 - mapache\n", 127 | "0.4751 - conejo\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "# printing nearest neighbors in the target space\n", 133 | "src_word = 'cat'\n", 134 | "get_nn(src_word, src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "# Visualize multilingual embeddings" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 7, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "Variance explained: 0.07\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "from sklearn.decomposition import PCA\n", 159 | "pca = PCA(n_components=2, whiten=True) # TSNE(n_components=2, n_iter=3000, verbose=2)\n", 160 | "pca.fit(np.vstack([src_embeddings, 
tgt_embeddings]))\n", 161 | "print('Variance explained: %.2f' % pca.explained_variance_ratio_.sum())" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 8, 167 | "metadata": { 168 | "collapsed": true 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "import matplotlib.pyplot as plt\n", 173 | "\n", 174 | "\n", 175 | "def plot_similar_word(src_words, src_word2id, src_emb, tgt_words, tgt_word2id, tgt_emb, pca):\n", 176 | "\n", 177 | " Y = []\n", 178 | " word_labels = []\n", 179 | " for sw in src_words:\n", 180 | " Y.append(src_emb[src_word2id[sw]])\n", 181 | " word_labels.append(sw)\n", 182 | " for tw in tgt_words:\n", 183 | " Y.append(tgt_emb[tgt_word2id[tw]])\n", 184 | " word_labels.append(tw)\n", 185 | "\n", 186 | " # find tsne coords for 2 dimensions\n", 187 | " Y = pca.transform(Y)\n", 188 | " x_coords = Y[:, 0]\n", 189 | " y_coords = Y[:, 1]\n", 190 | "\n", 191 | " # display scatter plot\n", 192 | " plt.figure(figsize=(10, 8), dpi=80)\n", 193 | " plt.scatter(x_coords, y_coords, marker='x')\n", 194 | "\n", 195 | " for k, (label, x, y) in enumerate(zip(word_labels, x_coords, y_coords)):\n", 196 | " color = 'blue' if k < len(src_words) else 'red' # src words in blue / tgt words in red\n", 197 | " plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points', fontsize=19,\n", 198 | " color=color, weight='bold')\n", 199 | "\n", 200 | " plt.xlim(x_coords.min() - 0.2, x_coords.max() + 0.2)\n", 201 | " plt.ylim(y_coords.min() - 0.2, y_coords.max() + 0.2)\n", 202 | " plt.title('Visualization of the multilingual word embedding space')\n", 203 | "\n", 204 | " plt.show()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 9, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAqkAAAIWCAYAAAB0uhRnAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAMTQAADE0B0s6tTgAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJzs3XecFdX9//H3h6VJU2QBkVWKAgZB\nkNhQUcQu9orGguWnftUokcQYo0hETUzEWKIxVowmggVjI2pQMCq2KCiKCiogICCIKEVxgfP743Ov\nM3P33i2ysLPL6/l4zGPvzJw5c2bu3d3PPW0shCAAAAAgTerVdAEAAACAXASpAAAASB2CVAAAAKQO\nQSoAAABShyAVAAAAqUOQCgAAgNQhSAUAAEDqEKSizjKzUWb2wAY612Azm7uhzp17vppgZo3M7CEz\n+8bMKj3hspkNN7OX12fZqouZzTKzsyqbxsw6mlkws20z6zX+PuWTKeN+NV2OODN7wMxGbeBz9s/c\ni/rVnO/LZja8nP37xX9natPvBLAhEaSi1jGzp83soQL7/mRm72VWL5J0/oYrWUK1nbvAP+8xknas\njvzXwQmS+kvaXVK7fAkq+mddm5jZtpmApmPOrp0l/aPAYWl4n5B+10s6vKYLAaRNtX57BDaQ+yTd\nZ2abhRCWZjeaWT1JJ0m6UZJCCF/XUPnW+7lDCN9K+nZ9nqMSOkv6IITwXoUp67AQwqJy9qXhfUoN\nMzNJ9UMIpTVdljQJISyv6TIAaURNKmqjxyV9J+m4nO37SWqrTK1WbpO7mQ0xs5lmtsrM5mZr+HKb\naDPbEs2AZra7mU0ws6VmtsjMHjSz4kIFjJ870+Qb8iyDM/vPNLMpZrbCzGab2YjYeYdL+pmk07LH\nxfKMdy+ob2Z/NLMvzOxbM/uPmXWJ7R+eqdW8wMzmm9niTHor5xqamtldZvaVmS03s0fNrG32+iRd\nKWmvTLkm5rsHkvaQdGUmzayc/QXLYmatzewfmfu9OPO6VTllzV7fRZk8l5rZZZkuCbdnuiR8bGb7\n5x5T6H3LY0bm58zM9QzPHFOwS0Ce92lUpmb8ajNbYmafm9nFOcfsZ2YfZd7Hp8zs1/F7Z2YTzezq\nnGPiXQ4amdnfzWxO5jP1lpkNKHTv8pT5ZjO7L7Z+r5mVmlnTzPqumc9D9jO6i5m9mvm9mmNml+Tk\nF8zsLDN7Xh6wH5HZ/nMzW2hmX5vZSEkFP4uxvC40s0/NbKWZvWlm/WP7Bpv/Xg8y/z1fbma3mFlR\n5nfqy8z+k/NkfYCZTc/c87Fmtlks3+zxc81sWeb+7xDbb2Z2Teb3ZJGZ/SpPuXua2f/M7LvMZ65j\nzv7EZzFzjj+a2d8y55xlZoNyjjnJzD7LvMf3mdn1luf3MJa+T+Z3ZEWmrC9mrzPzufyHmY3MvB8L\nzezC2LEVfqbMrE3ms70kc+9fMbNtKvPeAYUQpKLWCSGskjejnpqz61RJ/wkhfJ57jJntLOl3ks6V\n1EXS8ZI+rsJpm0n6q6SdJB0saStJt1Xy2DHy5vDscq78n/Vbmf31JP1SUo/MvrMknZ3Zd72kRyU9\nFDs+n0sknSbpdHnz87eSnjCzoliaHTL7BmTOMUTSoeWU+8+S9pYHFXtJai/p/sy+i+Q11q9mynR0\nnuMvkvSGpJGZNDtXoSyPZH72k3cp2ExSRX18d5DUW9I+mfyukfSEpPcl/VTSs5L+bmYNK8inkL6Z\nn7vIr+f6H5nP4ZIaSNpN0nBJI7NBj5m1lDRW0jPya3lC0q+rmH99SdMlHSapVyaPx82sTSWPf0l+\n37P6Sfo6U97s+qshhNVm1lzSOPk97i3/HF5pZifl5Dlc/vvTXdJLZra3pBvkX3R2lbSJKmjuNrMz\n5J+p8+S/K3+XNM6S3S9ayVtTDpN3Rzlb0r/lv2N9M2W408xa52T/O/nvzz6StlOmNSbjSkmHSDpR\n3nXjFUn/MbMWmf2nSrowc67+mfP0ipW7SP6efib/HN4kaUR515pxjqQPM+ccJene7HtoZt0y1/9X\nSX3k7/fZ+bP5wQOZsveUtKfKdlE5XP4+7CrpCknXxwLJynymxkraJpNmR0l/yxxX2fcOKCuEwMJS\n6xZ5P8i1kjpl1ptJWi5pUCzNKEkPZF4fI+kjeVNjbl4dJQVJ28a29c9sK5M+s383SaWSijLrgyXN\nzXfunOO2k//DP7mca7tU0gux9QckjcpJk3u+BZLOi61vLmmlpIGZ9eGSlkhqHEvzrKTrC5Sheeb6\nDskpe5C0fWb9akkTK3ifXpY0PGdbuWWRB8QL4vde0paZc5cUOE82z0axbR9Keiq2vkUmj56xY17O\nySfxvkmaJemszOttM8d3zDkmnibxWSrwuXg/5/iPJF2QeX2+pNmS6sX2/1PSrNj6RElXFypDgfvz\noaRTY+tB0n4F0mbvU/vMff9K/kXjd5n9T0galnl9rqR5Oe/VHyS9mXOuK3POMUbS6Nh6fUlzlfM5\nzznmU0mH5mx7TtLlsXu9VlLb2P5nJL0XWy+S/504LOf3/KBYmv3kn/3NJDWW/x71yDnvdGV+hyW9\nLukPsX0tM8cMz6wfnFlvmXOPQs7n9+XY+kRJ43Luz4rs9Uv6k8p+diepnN9HScsk9Suwb1Tm/sff\nxwckPVKZz5Q8uF8lqf2Pee9YWAot1KSiVgohTJLXhJ6S2XSMpDXyrgD5jJf/M/rEvPl3oFnhpu5c\nZlZiZvdnmquWSXpe/o9jiyrk0ULSvyTdHUKId0PY3cyeM7N5ZrZc/g9rqyrku6m8m8Nr2W0hhCXy\n4KdbLOmMEMJ3sfUFkgrVrnWWX188zw8lLc3J88cqryw9JbWWtDTTbLhcHhRky1Venqti6wvlNXzx\ndWXyrkm5fXjj195F0pQQwtrY/v9V9QRm9kszezfb9JrJt1KfqRDCAvnvVr/M8oo8aOqX+Z3ZQ17b\nKvln4a0QwupYFq+q7Gdkcs56N3kte/acqyW9Xc71NJPUSdKY7Gcic137KPmZWBRCWBhbT3wGQghr\nJH2psp+BN3Je15fXCm4jr118Lee828TOm3stXynZStNN0seZ7fnOV8jUWJ6rJS1W8nPyVk76ij4n\nf5H0nJn9y8zOt7Ldld7OeR/fUOx9rOAz1UP++zcv96RVeO+AMhg4hdrs7/ImuqvkwepDwQeqlBFC\n+DrTpLqfpIMk3SOvATlcXvsiJfvENcjJYpSkhvImtbnyP7rj8qTLK/PP/e/yWqdfxbY3l/S0vDl/\nmLw28CR5rVB1yx2sEuQ1S/lUOoBfD2VpJv8nPzDPcWX+CVaQ5w/bQggh870k++V8rcpeZ4PMcetT\nvnJmy2SVOH+hcnsG3udymKSfS5oir4F7TJX8rGbEm/xfkteIPyhvxm0u/93JlrcyVuasV+Y645pm\nfp6k5BcPyWsIs8r9DMS25VbQhAKvm2V+9pd/QYtbUuCY
XFW91qx1/ZwkDw7hN2b2D3m3mlMkXWVm\nu4UQsn2tC+ZXic9UeZ+Dyr53QBnUpKI2u1/SNmZ2nPxb+X3lJQ4hfB9CGBdCuFDeb+qwTJ+q7Ojs\neK1oz5zDd5N0QwhhfKZGseCgqQKukPfZOyFTm5PVTd6s+OsQwmshhOkqW+NVqsLBpILPJLBQUZ9B\nmdnmmbw/rGI5sz6RtDonz+0yZa1KnuWWvYB3JG0t6ZsQwsc5S3WOlF+ksjXhue97XDZoqOr1VMV0\nSTuaz1SR9dOcNIlyZ/pXxq9jN3l3kftCCO/Ia2q3rmI5XpL3W+wn6b+ZWsBPJV0srznNBp0fSvqp\nJecZ7auKPyMfyfv2Zq+hSOVP1fWFMteR5zOxsJzjKmuXnNer5b8DH0j6XlK7POfNBqnTc65lM3nX\nkKyPJHWx2GAsJftn/xjTVfZzkbteRgjhvRDCH0IIu8nv51Gx3X1y+rDvLC+7VPFnaqr8GrfMc9r1\n/d6hDiNIRa0VQpgtb4a8Q9LMEELBybDN7NBME1dPM+ssH1SxWNKXmcDnf5J+Y2bdzOwweQf/uE8k\nnWJmXczsIEmXVbacZnaApN9K+j9J9c1si8yyiXwwRamk88yss5mdK+nInCxmywOXjnma6LJukg9Y\nOcTMtpfX/M6W9/WsshDCMnlt841m1s/M+mTy/E8IYVoVspotaTcza58ZFFQZz8n/6Y3NnLuzme1v\nZndU5Roq4SVJnc3s/zLv67XKGXWdY4E8YDnAfPaBJtVcHsn7n7aUD6bqamZnymv+47Vc/5V0tJkN\nMLOeku6S9wfM+kTS7pl7t738y1tV/9a/JG/C7ayoGfklSYMUNfVLPvimkaS/mtl2ZnaivLYtPvAo\nn79KOtbMzs4MArpJ/gUorxBCkHStpBFmdrqZbWNmO5nZpVaFmQvKMcJ81oJdM2X5ZwhhaQjhG3kz\n+V/N7Bgz62Rmfc3s2sy9zV7L+WZ2rJl1l78f8S+iz0qaL+kuM+tuZsfKW4DWxV2S+prP/NDVzC6V\nf8HKWxtqZpuYz9rQz8w6mNkh8iDzo1iyzSTdlPkbeJb8b+StmX3lfqZCCBMkvSnpUTPbI/P+/MzM\num2A9w51GEEqarv75H9c768g3VL5H92XJL0rr/k4NFareaa8v9dkSUPlXQjizpLXjkyVj8y9vApl\n3F3eVWCc/J9VdjkhhPCFvAvBeZm8D5APqoi7U960OE1RrW+uP8nvxSh5UNFE0uE5tbZVNVR+v56U\nB0bzFPUBrqzr5SOuP1XZfol5ZfpjHiT/BzpW3kR4i8o2t66T4PO7/kJey/2m/O/hY+WkXyXvqjFM\nXnN9SaG061Cmr+T9qwfKa5SPknSzkkHoXfL7MlbeVeRBeW1V1u3yPtPjJP1H/h6+U8VyfCy/xjdC\nNKfpf+W1yC/H0i2Tj3zvmTnHn+QDrP5ZQf4T5DNaXC2/96vlA7LKO+YW+T2/RF7D+aT897i8LiCV\nNUIecL8o72oyJLbvV/KZPK6XfyYfkrd2fJnZP0oezN0lv0f/U+x+Z34Hj5Z3EZosr40evi6FDSF8\nJA90L8jk2V3+N3BVgUPWyP++PSivhf2LpKtCCPE+/E/I34c35EHlJZn3SarcZ+po+QC+cfIuAecq\n0/qwnt871GHmX3IAAGlkZnfJm5vz9dEFJElmNl7SRyGEKj/pznxO4/ohhHxzyAI1hoFTAJAi5g95\n+FBea76/vPZ6cA0WCSlkZufLp51aLp/3eYC8lh+oMwhSASBdtpY3gxdLminpohDCgzVbJKRQD3lQ\n2lzehH9MZmo+oM6guR8AAACpw8ApAAAApA5BKgAAAFKnzvRJbdSoUWjduqafdggAAIBC5s2b930I\noVFl0taZILV169aaO3duTRcDAAAABZhZofm+y6C5HwAAAKlDkAoAAIDUIUgFAABA6hCkAgAAIHUI\nUgEAAJA6BKkAAABIHYJUAAAApA5BKgAAAFKHIBUAAACpQ5AKAACA1CFIBQAAQOoQpAIAACB1CFIB\nAACQOgSpAAAASB2CVAAAAKQOQSoAAABShyAVAAAAqUOQCgAAgNQhSAUAAEDqEKQCAAAgdQhSAQAA\nkDoEqQAAAEgdglQAAACkDkEqAAAAUocgFQAAAKlDkAoAAIDUIUgFAABA6hCkAgAAIHUIUgEAAJA6\nBKkAAABIHYJUAAAApA5BKgAAAFKHIBUAAACpQ5AKANgoDB4smUXLrFk1XSIA5SFIBQAAQOoQpAIA\nACB1CFIBAACQOgSpAJASEycm+0wOHy5NmyYde6zUurVUr570r3952tJSadQo6cADpbZtpYYNpc03\nl/bZR7rnHmn16rL5r13r+/bd149p0EBq3lzq3NnzGTZMev/9ssetXCndfLPUv79UXOzHtW4tHXKI\nNHZs/muZMEH6+c+lPfeUOnb08zRs6MfttZf0pz9JK1aUPW748OQ9mDhRevJJP6ZFC9+2dGmUfvly\n6aabpAEDpDZt/BzFxVKfPtIvfyktWVL+PX/2WT+2eXNf9t9f+t//yj8GwAYSQqgTS/v27QMA1GYT\nJoQgRcuhh4bQtGly22OPhfDFFyHsumtye+7Sv38Iy5Yl8z/jjPKPkUIYMSJ5zIwZIXTtWv4xJ5wQ\nQmlp8rgzz6z4XN26hbB4cfK4K68sm3fucV995WknTw6hQ4fyzzF5cpT3aacl9517bv5jmjQJ4YMP\n1v39BFCWpLmhkrEdNakAkFJPPeW1jdtu67WWXbv69mOPlV5/PUrXrZt06KHST34SbZs4Ufp//y9a\nnzvXa1GzWrWSDjrIa1C7d5caNy57/u++8/NOnx5t69XLz9WxY7RtzBjpiivKHl+/vtSzp9fAHnGE\ntN9+XtuZ9dFH+Y+LGzPGa5B33NHLmz1+8WJfnz07Stu0qbT77l5TvMUW5ecrSbff7rXP++3nP7NW\nrpT+8IeKjwewnlU2mk37Qk0qgNoutyZVCuGaa5JpHn88uf/qq5P7hw5N7n/3Xd/+2mvJ7XPnJo9b\nuTKEp54K4cUXo2233po85oEHon1r1oRw3HHRvkaNQli0KNo/Y0bZmtwQQvj++xD22CM6rnXr5P7c\nmtRGjUJ4/vlof2mpn/vSS5Pp+vYN4fPPk3k980wIc+ZE67k1qd26ea10CCF8+mkIjRtH+zp0KFt2\nAOtOVahJrV/TQTIAIL+f/ET6zW+S2555Jrn+6qtes5q1aFFy/7//7bWZ226b3H7JJdLAgV47262b\n98ccODCZ5qmnotf16kmPPeZLVrwWc9Uq6YUXpOOP9/WOHaX775ceflh6913pyy+9ZjbXokXSV19J\nLVuW3SdJp5/ufUaz6mf+az3xRDLdvfdK7doltx14YP48sy65xPvISlKnTtJ220lTpvj6/PnlHwtg\n/SNIBYCU2nNPHygUlzsB/dNPl59HNpBs1Uo6/3zp1lt9/Z//9EXyc/ToIQ0aJA0ZIjVpUvZca9dK\njz5auXOtXes
B73PPlZ8+65tvCgep/frl3x4vW3GxB9pVteOOyfVmzaLX339f9fwAVC+CVABIqXz9\nKkOoWh7xEfQ33yztuqsHp6+/7jWY2TynTvVl8mSv/VyXcz38cDJArVdP2mUXr+msV89Hz8drYcs7\nT6G+pVUtWz65gXFR0brnCaD6MHAKAGrAjIXLVLpmbWLb6pz1enn+QscHLEnSJ5+UP4Z+1Khkfqec\n4l0AlizxJvhJk5LdBR55RFq4sOy5Gjf2ILS8cw0f7mlfeSVZxoce8m4JY8d6/tttV9HdKf8eSN48\nn7V4sQ/CAlC3EKQCwAY2Y+EyHXXbJA0ZPeWHQLV0zVr95YWPKzz2kEOS67/4hbRsWXJbaak0frx0\n4ok+ql/y+USvu076OHaKzTeX+vb1UfJx2ab0+Lm++04aOtT7nsZ9+630+OM+4j8rd47WbPcByecl\nHT++3EuslPj5JOmMM8r2I338cWnOnHU/F4CaQXM/AGxgHYubau+urfX0VI+qRh7fS0MfekevzVxV\nwZEenPXt6zWTkg8gKinxyetbtPBa0Pfei5ref/97//ndd9Kll/rSvr0f07at16bGp7MqKvLJ/SXp\nzDOlP/9ZmjnT12+/3ful9urlNavz5/u5cgPXnXeW/vrXaP3oo30y/mXLpNdeq+rdym/oUB8slR0o\nNmmS1KWL1Lu3tMkm0gcfSPPmefeFrbaqnnMC2LCqNUg1sy6S7pNULGmppMEhhGk5aS6VNCi2qbOk\nu0IIF5tZf0njJMVm5VPfEMK31VlOAKhJDYrq6cZBvSVJT0+d/0OwulunLnq8gmPNfIT94YdLb7zh\n2775xudFzSdfP8t583zJ5ze/iUa8N2niXQMOPzyaK3XRovw1ofFm+ZNOkm65xQNEyQPkbB/VHXeU\nttnGm/3XRZs2PtPBkUdGtaUrVpTtagCg9qru5v6/SbojhNBV0h8l3Z2bIITwhxBC7xBCb0m7SPpe\n0j9iSaZl92cWAlQAdU6DonoaeXyvxLbz9tmmUse2bevB2P33e81qu3b+ONBGjbzW8MADpWuvlWbM\niGoRN9tMeuAB6ZxzvNY1fszWW0tHHeWPHx0xInmubt082Lzttmgy/vr1vbayUyfpsMO8tjXerN6o\nkU9Hdf75fp4GDaQOHaSLL5b++1+fdL869OnjNbk33CDtvbfPYFC/vndj6N3bz7f11tVzLgAbnoXq\nGCIpyczayGtAi0MIq83MJM2XtFsIYVaBY46XdFkmYFWmJvX6EMJOVT1/SUlJmJvtfAUAKVe6Zq2G\njJ7yQy2qJA3s2U43DuqtBkUMFwBQN5nZvBBCSWXSVudfwq0kfR5CWC1JmacKfCapvO+xZ6psbWs3\nM3vbzN40s/OqsXwAkArxAHVgz3b6cMRBGtiznZ6eOj8xmAoANmbVPXAqt1rW8qaSZGZbSdpT0omx\nzW9LKgkhfG1mJZLGmdniEMJDeY6/WNLF2fVNN910nQoOABvKrMUr9OL0RYma02wf1RenL9KsxSvU\npW3zGi4lANSs6m7unyGpVWWa+81smKSfhBBOzN0XS/MbSVuGEH5e0flp7gdQm8xYuEwdi5smmvZL\n16wlQAVQp9VIc38I4QtJkyWdnNl0jKRZBQJUkzRYOU39ZtbOzOplXjeXdGgmTwCoU7q0bV6m72mD\nonoEqACQUd2988+RdI6ZTZd0qbzPqcxsnJnFB0MNkHcFeD7n+GMkTTWzdyS9Juk/ku6t5jICAAAg\n5aqtub+m0dwPAACQbjU1uh8AAACoFgSpAAAASJ3qnoIKAIDqs3atNHKkP/O0eXPpoov8sVIA6jx+\n0wEA6fX730uXX+7PYR03jgAV2IjQ3A+g5k2cKJlFy/DhNV2idKkL92fWrOQ1DB5c8TGTJvm1Nmwo\nPfqo1L//+i0jgFQhSAWAmvJjAreNxdKl0kknSSFI//yndPDBNV0iABsY7SYAal7r1tIxx0Tr3bvX\nXFnSqC7cn6ZNk9ew887lp3/3XQ/ad9pJOvTQ9Vo0AOnEPKkAUFNmzZI6dYrWTztNGjWqpkoDAOsd\n86QCqF3K63MZ396/v4/yvvJKqUsXqVEjqX176cILpWXLomNefjl53E03lT3ne+8l0wwbltw/b550\n6aXSjjtKm27q59p6a+mUU6S3385/HQsWSL/6ldSrl49Eb9DAa0F79JBOPlm65Rbpu+88bceOyQBV\nku67L/99qEyf1GnTpKOOklq2lJo1k3bbTRozpuIuBf/6l3TOOdKuu0pbbeU1no0aSVtsIe2/v/S3\nv0mrV+e/Xkl64w3pzDOlbt38vJts4td21FHS449H6SrTteH776W77pIOOEBq08b7orZs6WUbMUJa\nsqTsMaNGJfMdNcrf2+OPl4qLpcaNpd69vcsAgNolhFAnlvbt2wcAtdSECSF470Nfrrwy2hffvv32\nIfTokdyWXfbdN4S1a6PjunSJ9u26a9lzXnZZtN8shI8/jvY99VQILVrkP48UQr16Idx4YzK/BQtC\naNeu8DHZZc4cT9+hQ8Vps/ehvPsTQgivvhpCs2b58xg8OLl+2mnJY/fdt+Jy7LlnCN9+mzxu7doQ\nhgwp/7gjjojSz5xZfjnmzw+hT5/y89tiixBefz153L33JtMcdVQIDRvmP/6++8p+DgBsUJLmhkrG\ndtSkAqg93n/fa8m6d5f69fOas6znn5defDFaP+206PXrr0uffprMa8yY6PVee0nbbBOd47jjpG++\n8fX69f1cBx/stXqSz935i19IzzwT5XHXXdL8+dF6jx7S4YdLe+4pdehQ9loOOaTsYKAOHbzfZnap\nTN/T77/3AUbLl0fb2rb1WtA2bSrXfSBb2zhggHTEEdI++0ibbRbtf/ll6eabk8dcc410443Jbdts\n49e0ww5ei1xZIUhHH52soS4u9hrVzp2jbQsWeP/URYsK5/XYY/652Hvv5LGS18ADqD0qG82mfaEm\nFajFKluTKoUwdGi079prCx83Z47XeGb3XXNNtO+NN5LHjRoV7TvuuGh7ixYhTJsW7fvqqxC22Sba\n/9OfRvvOPTfavt9+Za/xs89CuP32EJYujbZVVLtYmfvz4IPJfTvvHMI330Tl7dWr/HNMm1a2ljSE\nEJYtC6Fz52S+WUuWhNCkSbTPLIR77kkev2hRCI89VrlrfeKJ5L6ddvKyhxDCmjWeNr7/t7+Njs2t\nSW3UKIQ33/R9334bwg47JPfPnJn/HgPYIERNKoA6qUmTZG3YIYck98drMktKpP32i9ZHj87/ulkz\n6dhj/fXatdK//x3t22QT6YorfP+xx0pnnZXsn/nWW9IXX/jrbbeNtr/5ptc0PvWUNGOGtGaN9/c8\n5xzv31qdXnghuX7ppd4fVvLa0EsuKf/4jh293+mAAVK7dt4f1czziNc+f/RR9Hr8eGnlymj9hBOk\n009P5ltcLB15ZOWuIX7PJemyy6Ka3Hr1fEL/uHHjCud1/PE+I4DkNcQD
BiT3xz8jAFKNKagA1B7b\nbhsFYJIHmHGrViXXBw+WnnvOX0+d6k353btLDz0UpTn+eB8sJEmLFyebzRcu9EnkyzN7tjern366\nD9CaM0f6+mt/SlJWkybe/HzBBWUD63U1Z05yvUeP5Pr22xc+dsUKaY89pHfeqfg82e4Pkg+Cittz\nz4qPL09ufrllbtfOu1p89ZWvz55dOK8dd0yuV/QZAZBa1KQCqD2yfUKziorKT3/UUcm+lQ8+KL30\nkhSfri5eAxhC1cu0YoX/3Hxzr0G97DLv39mwYZRm5UqvLRw40GtXq1Numevl/FmP99vN9Ze/JAPU\nhg29f+7RR3uf2OLiyp2zuuUrc2XPWdXPCIDUIkgFsEHNWLhMpWvWJratzlmvNo0bS4MGReujRyeb\n+rfdNlkLWFwc1apKHrCV7RWbXOKP6mzb1pv5J0/2wHT2bGnsWO96kHXrrdHr8gLIytp66+T6Bx8k\n1997r/Cxr7ySXJ80yQefPfqo9MgjUqtW+Y/LnTrr5ZcrV9ZCcgeWvf9+cn3+fH8CVaH0AOokglQA\nG8yMhct01G2TNGT0lB8C1dI1a/WXFz5efyeNz8f5ySfSPfdE67n9KIuKpAMPjNZfesnnLs21aJF0\nxx0+P2vW8897AJydr7WoyAPII49MjjKPN21vskky33nzKnNFSbl9Lq+7Luov+vXX0h//WPjY3PlP\nmzSJXt99d7Ifaty++ybLPmYuQRjOAAAgAElEQVRM2VkEFixIdqsoT+4sB9de62WXvJ/wb39bfnoA\ndRJBKoANpmNxU+3dtbWenjpfQ0ZP0XelazRk9BS9NvPL9XfSXXdNTuWU7ZNYr5506qll0w8b5oOH\nJK8pHTzYa1wHDvTgaLvtfKL7c87xR3dmTZ4snXii18Zut50Hj4cfLnXtKv33v1G6Ll2i161bJ7sj\njB8v7b57NFArt79pPkcfnazZfOUVP8eBB/q5y+tvmvto0t128+vccUcfJFaopnfzzZMDskLwgL9L\nF+9z26ePD8iq7AT6hx0m7bJLtP7mm57XQQf5Ndx7b7SvVSvpoosqly+AWo2BUwA2mAZF9XTjoN6S\npKenztfTU32k9ZBOBZqVq8vgwWVHue+/f7IZPqtXL68ZPPXUaLDQJ5/4kqt+nj+h33/vNZD5aiFb\ntJB+97toPfvkpfh8o6++Gr0ePtxnBShPw4YeDO63X9Q/9vPPfZGkc8+Vbr89mT7rwgs9AMwGw998\nE42cP/hgrxUu1JR/5ZXSl196v9asjz/2parMfH7TgQOlKVN826JF0rPPJtO1aeNPyGrTpurnAFDr\nUJMKYINqUFRPI4/vldh23j7brN+TnnJK2YAy32M5s444wvt2Xn651zZutpk33zdv7iPPTz5Zuv/+\n5GM/jztOGjnSB2t16+a1jUVFPrq8Z0+v/XvnnbKjz6+7zqe56tIlGUBWxW67+eNJjzzSp7hq0sS3\njR3r5YrbYovodatWHhSffLK/btTIay5HjPBrK2/QkZk/5nXSJL+XXbr4eRs39m4ORxxR/j3OteWW\n0muv+XRY++7rNdL163tgv9NOHrC//77Ut28VbgyA2szC+h6luYGUlJSEufERuwBSqXTNWg0ZPeWH\nWlRJGtiznW4c1FsNivje/KN8+aUHh/FBX5JUWupdDuJPxpowITnYCwA2IDObF0LI04xVFs39ADaY\neIA6sGc7jTy+l4Y+9M4PASuB6o80YYJ3T9hnHx+k1bKlD1x65plkv9a99iJABVBrEKQC2GBmLV6h\nF6cvStScZvuovjh9kWYtXqEubZtXkAvy+vbb8p/EtNtuFT+YAABShOZ+ABvUjIXL1LG4aaLGtHTN\nWgLUdTFnjvTXv/ocp7Nm+ZOziop83taf/tSfqnXssWUn+geADawqzf0EqQAAANggqhKk8rUaAAAA\nqUOQCgAAgNQhSAUAAEDqEKQCAAAgdQhSAQAAkDoEqQAAAEgdglQAAACkDkEqAAAAUocgFQAAAKlD\nkAoAAIDUIUgFAABA6hCkAgAAIHUIUgEAAJA6BKkAAABIHYJUAAAApA5BKgAAAFKHIBUAAACpQ5AK\nAACA1CFIBQAAQOoQpAIAACB1CFIBAACQOgSpAAAASB2CVAAAAKQOQSoAAABShyAVAAAAqUOQCgAA\ngNQhSAUAAEDqEKQCAAAgdQhSAQAAkDrVGqSaWRczm2Rm083sDTPrnifNYDNbamZTMsuEnP2Xm9kn\nmWVEdZYPAAAAtUN116T+TdIdIYSukv4o6e4C6caHEHpnln2yG81sL0knStpBUndJB5vZgdVcRgAA\nAKRctQWpZtZGUh9JD2Q2PSqpk5l1rEI2J0gaFUJYEUJYJekeedAKAACAjUh11qRuJenzEMJqSQoh\nBEmfSdo6T9q9M039r5jZsbHtW0uaHVufVeB4mdnFZjY3uyxfvrxaLgIAAAA1r7qb+0POuuVJ85Sk\nDiGE3pLOkvRnM9utQB75jvdEIdwQQijJLs2aNfvRhQYAAEC6VGeQOkdSiZnVlyQzM3nt6mfxRCGE\nxSGElZnXH0gaJ2mPzO7PJHWMJe+QezwAAADqvmoLUkMIX0iaLOnkzKZjJM0KIcyKpzOz9rHXbSUN\nyBwnSQ9LOs3MmppZI0lnSBpdXWUEAABA7VC/mvM7R9IoM7tM0jeSTpMkMxsnaVgI4X+SzjezIySV\nyoPkP4cQXpCkEMJEM3tI0tRMfqNDCM9UcxkBAACQcubjm2q/kpKSMHfu3JouBgAAAAows3khhJLK\npOWJUwAAAEgdglQAAACkDkEqAAAAUocgFQAAAKlDkAoAAIDUIUgFAABA6hCkAgAAIHUIUgEAAJA6\nBKkAAABIHYJUAAAApA5BKgAAAFKHIBUAAACpQ5AKAACA1CFIBQAAQOoQpAIAACB1CFIBAACQOgSp\nAAAASB2CVAAAAKQOQSoAAABShyAVAAAAqUOQCgAAgNQhSAUAAEDqEKQCAAAgdQhSAQAAkDoEqQAA\nAEgdglQAAACkDkEqAAAAUocgFQAAAKlDkAoAAIDUIUgFAABA6hCkAgAAIHUIUgEAAJA6BKkAAABI\nHYJUAAAApA5BKgAAAFKHIBUAAACpQ5AKAACA1CFIBQAAQOoQpAIAACB1CFIBAACQOgSpAAAASB2C\nVAAAAKQOQSoAAABShyAVAAAAqUOQCgAAgNQhSAUAAEDqEKQCAAAgdQhSAQAAkDoEqQAAAEgdglQA\nAACkDkEqAAAAUocgFQAAAKlDkAoAAIDUIUgFAABA6lRrkGpmXcxskplNN7M3zKx7njQnmNlkM3vP\nzKaa2c9j+/qb2UozmxJbNqnOMgIAACD96ldzfn+TdEcIYZSZHSvpbkl9c9LMlXRwCGGBmW0q6S0z\nezuE8Epm/7QQwk7VXC4AAADUItVWk2pmbST1kfRAZtOjkjqZWcd4uhDCKyGEBZnXX0v6UFKn6ioH\nAAAAar/qbO7fStLnIYTVkhR
CCJI+k7R1oQMy3QH6Snohtrmbmb1tZm+a2XnlHHuxmc3NLsuXL6+e\nqwAAAECNq+7m/pCzboUSmlmJpMclnRtC+Dyz+W1JJSGErzP7x5nZ4hDCQ2VOFMINkm7IrpeUlOSe\nGwAAALVUddakzpFUYmb1JcnMTF67+lluQjPbUtJ4SVeHEB7Obg8hfJPpAqAQwlxJD0rqV41lBAAA\nQC1QbUFqCOELSZMlnZzZdIykWSGEWfF0ZtZO0vOSrgsh3Je7z8zqZV43l3RoJk8AAABsRKp7ntRz\nJJ1jZtMlXSrpTEkys3Fmlh2xf5W8n+pFsWmmTs/sO0bSVDN7R9Jrkv4j6d5qLiMAAABSznx8U+1X\nUlIS5s6dW9PFAAAAQAFmNi+EUFKZtDxxCgAAAKlDkAoAAIDUIUgFAABA6hCkAgAAIHUIUgEAAJA6\nBKkAAABIHYJUAAAApA5BKgAAAFKHIBUAAACpQ5AKAACA1CFIBQAAQOoQpAIAACB1CFIBAACQOgSp\nAAAASB2CVAAAAKQOQSoAAABShyAVQLWYOFEyi5bhw9fvcQCAuo0gFUCdMmtWMugdPLimSwQA+DHq\n13QBAGzcWreWjjkmWu/evebKAgBID4JUADVq++2lRx6p6VIAANKG5n4A6838+dLZZ0tbbik1aiRt\nt510003JNBX1SV2wQPrVr6RevaTmzaUGDbz2tUcP6eSTpVtukb77ztN27Ch16pQ8/r77ys//s8+i\n/Fu0kBo29PIefrj06KNSCGWvq3//ZJ5r1kg33OBlatxY6t1b+tnPov1NmkhLl5bN59xzozQNGkgL\nF1b61gJAnUdNKoD1YupUD/wWLYq2ffSRNGSI9M030hVXVJzHwoVSnz4e7MYtXuzL++9L//iHdNRR\nUklJ1cv42GPSKadIK1Ykt8+fLz35pC8DB0oPPyxtsknhfH72M2nMmOS2IUOkf/7TX3/7rXT//dLP\nfx7tX71aGjs2Wj/sMKlt26pfAwDUVdSkAlgvxo6VliyRdt/dm/TjrrtOWr684jzuuisZoPbo4TWc\ne+4pdehQNv0hh0gHH5zc1qGD93nNLtk+r++8I514YjJA7dFD2ndfr7HNevpp6bzzyi/nmDFeW7rX\nXtI++/jrnXeW+vaN0txxR/KYF15IBvBnnln+OQBgY0NNKoD15rHHvIZw7VqvkXzmGd++YoX0v/95\ns3l55s6NXu+3n/Sf/yT3z5kjjRsXBZW33eaj++NN/v37S6NGlc37qqukVaui9REjpMsv99effebB\n9bx5vn7ffdJvfiN17Zq/nF26SM89590NJOn77/3nRRdJr77qr997T5o0yfOVkjWvW24pHXRQ/rwB\nYGNFTSpqLybmLGvUqOS15YvONpB+/TxAlaR69coGYblN+Plsu230+s03pWuukZ56Spoxw/uBbrWV\ndM450qabVq1sa9d6UJlVXCz9+tfR+tZbS+efH62HEAXY+YwYEQWokvdrlbzmNt4N4W9/85+lpR7A\nZw0eLBUVVe0aAKCuI0gF1hUTc+a1447J9WbNkuvxWsxCTj/dA1FJ+vprr+k87DCv0WzRwpv3x42r\netkWL052N+jSxQcuxeV2UZg9u3B+/frl316/frKrwMMP+wCq556TvvrKt5n5dQIAkmjux8anLk/M\n2bFj8tri1XsbWMuWyfUfU1O4+eZeg3rzzR6MTpsWNaWvXCn9+9++PPmkdOihP76sZmW35RvVX8gW\nWxTed/bZXtP67be+/P3v3tUha++9kzXGAABHkIqNT12emLN//4o7elaDGQuXqWNxUzUoihpjVq9Z\nq/XRONO2rTfzX3ONN/HPmye99ZZ04YVRn9Vbb42C1HwBZ65WraSmTaNBU9OnexN8vDZ12rTkMfkG\namXVK+eyW7XyqbLuvNPXb7896usqMWAKAAqhuR91y8Y8MadUcZ/UG27w+ZJ22EFq187vUZMmfg3H\nHefVkhWYsXCZjrptkoaMnqLSNWslSaVr1uovL3xc4bFV9fzz0ujR0rJlvl5U5P1FjzxS6tw5Sjdr\nVvQ6d6qoeECYVVQkHXBAtL54sXT99dH63Lk+CCvLTDrwwB99Gbrwwuj1Bx/4FFyS96WNV3wDACLU\npKLu2Ngn5qyMSy7xADfXrFm+PPKINHRoMmLL0bG4qfbu2lpPT/V7NPL4Xhr60Dt6bWYlOplW0eTJ\nHs83bOhx9JZbet/WDz6QPo7FxF26RK9bt5Y22yyaPH/8eB9Rv+WWvv7nP3s/1yuu8Omlst0HLrtM\nevBBr7l9440okJT8u0m3bj/+OrJTWz3/fHL7SSeV/zYDwMaMIBV1x9ixXkW2++4+yub996N9110n\n/eIXZUfv5Mo3MWfnzj7h55w5ZUfPHHKIB3fxGsgOHaSddorWcyfmjI8Y6tEjioqy1YXZiTnvvbdw\nObMTc+60k19ztma3MjbfXNpmG//ZqJH0xRfS229H0drIkV6ruuuueQ9vUFRPNw7ymtunp87/IVjd\nrVMXPV75UlTJ99/7942PPiq7r0UL6Xe/i9azY9duvDHalp0GSvKK7a228oFdDzzgaVeu9H1Tp/oS\nd9BB3kS/ri66qGyQSlM/ABRGkIq6ZWOfmLMib73lTf25HTenTUsOZ3/00YJBquSB6sjje/0QoErS\neftso8evrVwxKuu447y/58svexEXLfLvH5ts4rd8wACvKM8dH3bddf4WjR7t3ysK3Z7jjvNJ92+5\nxd/qmTP9LWrVyuP/U07xNJXp51qRgQP9+86nn/r6DjtIP/3puucLAHUVQSrqjnwTc8Ynt/yxE3P2\n6uVtvZ07RxNzVlVlJ+a87DJfz07MWShILTQxZ0Vat/ZzjB8vffKJ196uXl02Xb4qy5jSNWs19KF3\nEtseX/iOvl/dOzGYKm7w4Pyzc/XvX3gkfYcO0sUX+1IVDRv6d4Krrqo4bceOXnlcFRMnVi295JXd\n8S4E/+//VT0PANiYEKSi7qiuiTlvuslrTLMTc2Y1aeLzBV1wgTfzV8WGmpizPFOnevmzE3SWJx5N\n5Shds1ZDRk/R01Pna2DPdj/0Sc3Wqt44qHCgujG6/noPUJ980j8Gkve0OO20mi0XAKQdQSrqDibm\nLN8llyQD1JYtpV12iYL5Rx+tVFlmLV6hF6cv0sCe7X4ISLN9VF+cvkizFq9Ql7bNq16+OupXvyq7\nbeTIqMcIACA/glQgV22fmLOQV16JXm+5pQ+Rb9HC1xcsSAap5ejStrkeO2/3xDyp2UCVALWwZs18\nDN2ll/rkEACA8tEmh1phxsJlP8zJmbU6Z71a1JWJOfOJ9z2tXz/qx7p6tQ/SqoIubZuXadJvUFSP\nADWPEHxZtkx6/XUCVACoLIJUpN6GnDxekyf7NFHFxf4ggAEDfJL9rl2l//43SpdvYs6s7MScxx7r\ny5w5vv2KK5IDnC67zId477+/90eNzyywrhNz5rPzztHrzz7z/A891AeL3Xdf9Z4LAIB1
RJCK1ItP\nHj9k9BR9V7pGQ0ZP0Wszv1x/J81OzDlhgvc/jc8cX2hizrhXX/Xm80cf9QFYUjQxZ5MmUbqpUz2o\njQ9Uqq6JOXNde22ye8Fnn/mcrLNne9cGAABShD6pSL1Ck8cP6dSq+k9WlybmzLXHHtKLL0rDhkWz\n22+/vc/vdMIJ0fRXAACkgIWqjChOsZKSkjA33lyKOue70jXa7opo3tMPRxykxg1+xAh+AABQI8xs\nXgihUs8Vp7kftUK+yeOHPvROmcFUAACgbiBIRerlTh7/4YiDNLBnux/6qBKoAgBQ99AnFanH5PEA\nAGx86JOKWmHGwmWJyeMlr2ElQAUAoPaoSp9UalJRK+QLRJk8HgCAuos+qQAAAEgdglQAAACkDkEq\nAAAAUocgFQAAAKlDkAoAAIDUIUgFAABA6hCkAgAAIHWqNUg1sy5mNsnMppvZG2bWvUC6y83sk8wy\norL7AAAAsHGo7prUv0m6I4TQVdIfJd2dm8DM9pJ0oqQdJHWXdLCZHVjRPgAAAGw8qi1INbM2kvpI\neiCz6VFJncysY07SEySNCiGsCCGsknSPPDCtaB8AAAA2EtVZk7qVpM9DCKslKYQQJH0maeucdFtL\nmh1bnxVLU94+AAAAbCSqu7k/5KxbJdLlpilvX7TD7GIzm5tdli9fXoViAgAAIM2qM0idI6nEzOpL\nkpmZvHb1s5x0n0nqGFvvEEtT3r6EEMINIYSS7NKsWbN1vgAAAACkQ7UFqSGELyRNlnRyZtMxkmaF\nEGblJH1Y0mlm1tTMGkk6Q9LoSuwDAADARqK6m/vPkXSOmU2XdKmkMyXJzMaZ2U6SFEKYKOkhSVMl\nfSDpuRDCMxXtAwAAwMbDfHxT7VdSUhLmzp1b08UAAABAAWY2L4RQUpm0PHEKAAAAqUOQCgAAgNQh\nSAUAAEDqEKQCAAAgdQhSAQAAkDoEqQAAAEgdglQAAACkDkEqAAAAUocgFQAAAKlDkAoAAIDUIUgF\nAABA6hCkAgAAIHUIUgEAAJA6BKkAAABIHYJUAAAApA5BKgAAAFKHIBUAAACpQ5AKAACA1CFIBQAA\nQOoQpAIAACB1CFIBAACQOgSpAAAASB2CVAAAAKQOQSoAAABShyAVAAAAqUOQCgAAgNQhSAUAAEDq\nEKQCAAAgdQhSAQAAkDoEqQAAAEgdglQAAACkDkEqAAAAUocgFQAAAKlDkAoAAIDUIUgFAABA6hCk\nAgAAIHUIUgEAAJA6BKkAAABIHYJUAAAApA5BKgAAAFKHIBUAAACpQ5AKAMB69I9/SHvtJW22mVSv\nnmTmy6hRNV0yIN3q13QBAACoq+68Uzr77JouBVA7EaQCALCe3H13cn2XXaSttvLXHTtu8OIAtQpB\nKgAA68nChdHrdu2k11+vubIAtQ19UgEAtdbixdLVV0u77y61aiU1bCi1bSv17StdcYVUWhqlXbtW\nevhh6fDDpS239LSbbir17i39+tfS3Lll8584MepDaiYNHy7Nni2deaYHnY0aSdttJ910U/K4wYM9\n/axZ0bb585N5xa1cKd18s9S/v1RcLDVoILVuLR1yiDR2bP5r798/md+aNdINN0g9ekiNG/t1xU2f\nLv3851L37lKzZp5m222lc8+VZswom/+sWcn8Bw/2+z1kiNShg197p07SsGHS6tX5y7h8ud+bAQOk\nNm38nhcXS336SL/8pbRkSdljXnhBGjTIz9G4sdS8ubTjjtLvfictXZr/PKijQgh1Ymnfvn0AAGw8\nxo8PoVWrEKTCy1dfedrly0M44IDy07ZoEcJTTyXPMWFCMs1BB4Ww6ab5j7/qqui4004r/1xSlHbG\njBC6di0/7QknhFBamizb3nuXTRNf79UrSnvnnSE0bFg4/8aNQ3j44WT+M2cm0+yxRwglJfmPP+OM\nsu/P5MkhdOhQ/nVNnhylX7MmhLPOKj/91luH8P77lfp4IKUkzQ2VjO3M09d+JSUlYW6+r8EAgDrn\nww+lnXaSVqyItm22mdSrl4cz773ntXRffeXbBw2SxoyJ0rZo4f1D58+X3n8/2r7JJtJbb0k/+Ymv\nT5wo7bNP8txFRdKuu0pff508tmlTacECr6W89VZpwgTp3//2WlJJatJEOvjgKP0jj0jffSftsEOy\nJrNXL++3+t57yZrYSy+Vfv/7aL1/f+nFF5Nla9LE70tRkec9aZI0frx04IFek5y9xr59Pc0rr0Tl\na9RIeuMNL4/k5+7Uqey9/+lPfZaCN9+MtplJn3wSpV+82Gt0490dmjb1a9tkE79vCxZIkydHNb7D\nhkkjRkTp27b1Gtevv5ZefdXfV0nq3NnvzSablC0b0s/M5oUQSiqVuLLRbNoXalIBYOMxaFCyhu2I\nI0JYujTav3p1CKNHh7BiRQjvvptM26FDCHPnRmmvuCK5/2c/i/bl1qRKITzxhO9bs8ZrVuP7JkxI\nljNek9ihQ9nruPXW5PEPPBDtW7MmhOOOi/Y1ahTCokXR/tya1C5dvPYza9Uq/7nzzsmayM8/j9LM\nnh1Cy5bR/mOOifbl1qRKIdx8c7T/7LOT++69N9p36aXJfX37Js8bQgjPPBPCnDn+evFir83Npj/k\nkKj8Ifg9j+d3yy1l7yVqB1WhJpWBUwCAWmXtWunpp6P1Jk2ke+7x/qVZRUXSCSf463//O3n8RRdJ\n7dtH65dd5v0mv/kmf/q4fv2kww7z1/XqSQcdJD3zTLR//vyqXctTT0Wv69WTHnvMl6zZs6PXq1Z5\nf83jj8+f14gRyRkDGjaUvvgiWeNZVOT9UuOKiqLXzz7r97denhErHTpI558frR9yiHTHHdF6/Nqf\neCJ57L33eh/euAMPjF4//7zX/GYtXiyddFK0ntvoO26cdMEFZcuIuoUgFWWtXCndeKP09tvSBx/4\nX4uvvvK/eFtuKe22m/R//+ftRXHxkQB77y099JD029/6X6sVK7zn+4gR3kaV7eF/553+V3jLLaVT\nTvH2nvp5PpaLF3v72bhx3vt/xQqpZUtvJzrhBD+2QYPkMYMHS/fdF63PnOltd7feKk2b5unphQ/U\nOl9+KS1bFq137y5tvnnh9PEmc0nafvvkeuPG0jbbeNOz5N0Eli3zATu5dtwxud6sWXJ91apyi15u\n2daulR59tPz08aA1V79+5ecv+Z/BmTML57F8ud/f1q3L7uvVKxm8lnft8fMWF0vduhU+Z75yvvGG\nL4WUdx9QdxCkoqwlSzy4zFVa6h2nZsyQHnjAqx5yv5JnLVrknbbif3leflnaf3/pued8GOu//hXt\nmzXLA9h588pOLPjKK9JRR3mecV984Xk995x/nX/yyfx/WbMuvTTZKS1e7QKg1ljXoRS5I+urkmfL\nlsn1eC3kj1HVa4n3wc21xRbrnn/2HPn+lFbl2qt63uq8D6g7mIIKhW2xhQeaBx/s7Vt9+kR/lUKQ\nhg7NP2eL5DWVs2d7bWv37tH21aulgQM9QO3
Qweclif/HuPfe5Nf8zz/3+WLiAeo220gHHODzzWS9\n/nrhNrCsMWO8NnjXXf34zTar3H0AUONmLFym0jU+8qe4OFmLN21a/qmMsjp0SK7HBztJ3sz86afR\nesuW+WtR14d483zjxh58le0JGi3DhxfOq1ATfdypp1Y050D1PGQgPuBq8WLpo4/KT597znvuKb+M\nuTWvqJsIUlFWcbE388+fL732mjexP/GED3l98skoXWlp2Y5HcSNH+tDSKVOSf4G+/daD148+8o5I\nv/xltC+E5HDVkSOT/33OPtub+5991n9mh6FK3pQ/fnzh8rRu7e1Hr70WHQ8g9WYsXKajbpukIaOn\nqHTNWu8LenBU9bZypXTGGT4KPGvNGmnUKN8XH1EveUPO559H63/4Q9QfVSqbfn065JDo9Xff+Xf/\n3C4D334rPf64dOihVc9/iy28fiFrzBhvfMo1Z473wIqPrl8XuWU944yy/XUff9zPK0n77uuzC2Rd\nfXX+bglTpvi/jHhDHOquaglSzayJmT1oZh+b2XQzO7pAup5m9l8z+9DMpprZHWbWKLY/mNm7ZjYl\ns+TpYYP1rnFj/0p+4YXe53PTTb0G1Sz5F1Uq/PW4RQvpvPP8dYMGZTtyXXRR9Bdpr72S++J/yeIj\nGOrVk669Nqou2Hxzb8KPGzeu8HUNHeqdqrIaNiycFkBqdCxuqr27ttbTU+dryOgp+q50jerv9IGs\nQTSD/OOP+3fhffbxpV076fTTpe+/9++yxx4b5Tdzpjfw7L+/1LOnTxKf1bixD6TaUM48M1nrePvt\nPv3U/vt7A9ZOO3nN7pFHJgeLVcXVV0cNVqtW+YClHj08/wMP9Cmdtt7a/0R+8sm6X5PkecW7DEya\nJHXpIu25p19bSYlf05df+v7iYn9IQNann0pdu3p9xhFH+Hvapo3/Kxk5kuEEG4vq6pP6S0mrQgjb\nmlknSa+a2YQQwlc56b6TdEEI4V0zK5L0T0lDJV0bS7N7CGF5NZULP8bzz/vX4PhQy0Li1Q9xnTsn\nvxbn9rDfbrvC+wr1vm/bNtnEL5UdAVHVUQUAUq9BUT3dOMgn03x66nw9PdW/yA68uIEm3bmtlizx\nCGzpUm9Qyeeee3z85/PP+/rXX5dteGnWzLvb5/5ZWZ+aNPHv4ocfHjXuLFqUv1EoX3N+ZRx8sPTX\nv3q9w/ff+7b33y/b7UHKP271x2jTxmc9OPLIqLZ0xQofYlDINdf4td9zj6+vXu0NX/lUVzmRbtX1\nNp8gabAkhRBmmtl/JX5aM+EAACAASURBVB0haVQ8UQhhRuz1GjN7U9J2QrpccEEyQG3XzmtUmzTx\ntrN47Wah3u65g5Jy/7r+mEFL6zLaQco/qgBArdCgqJ5GHt/rhwBVkh4e0VnfXGy6/XZvRPnoIx+d\n3rKlf0/ebz+fQF7yPqbPPeeTjjzwgPde+vJL/y7dubN3U//5z71GcUPr1s1nFrjvPn8E6rvvei+n\nBg38z1aPHt59v6Ju9+U55xxvUr/tNp/GauZMDxqbNfMa6J139oaygQOr7bLUp49Pun/33V7T/d57\n/uWgRQu/zwMGJO93UZGnPfVU6a67fAL/zz/3nmUtW3pN7O67e+C7xx7VV06kV3UFqVtLildhzcps\nK8jMmko6S9Kvc3ZNNLMGkp6XdEUIIe8YPjO7WNLF2fVNGan9o8xYuEwdi5uqQVEmiFyyxB/lktWn\nj/+lyDaNv/Za+ZMIVrcOHaLyLFjg/1XitanTppVNX8iPrYYAUONK16zV0IfeSWwb+tA7unFQbw0b\nVk/DhlWcR716/uSpQYMqf97+/cv/Ljx4sC+FVHaAT5MmPrPf//1f5ctWqNa4kG239X6nldWxY/nX\nXtG9kTwg/cUvfKmsvff2BajUf20ze8nMFhdYtsoki39U81R5JfJrIGmMpOdCCI/HdnUIIewkaXdJ\nrSX9qVAeIYQbQggl2aVZbpMxKpQ7GEGSSld9n0zUsGE0/+jKldKVV27YQsZHMKxdK11+efRXcelS\n6Y9/LJweQJ1QumathoyeoqenztfAnu304YiDNLBnux/6qGb/fgGoWyoVpIYQ+oUQigsscyR9Jqlj\n7JAOmW1lZALUhyTNl3RRznk+y/xcIek2SXQiXI/yDUYY8sLnmtuiTZTotdf8IdYDB/rUT//5z4Yt\n5NChyamibr/de9MfeKC3/UyZEu3r18975AOoU2YtXqEXpy/SwJ7tdOOg3mrcoEg3DuqtgT3b6cXp\nizRrMZNmAnVRdTX3PyzpfEmDMwOn9pZ0bm4iM6svabSkJZLOzjzDNbuvpXzw1Uozqyfv5zq5msqH\nPAoNRuhz6i905l9+EyX86CNfzHx+kssv33CFbN/eOzMdc4xPtidJH3/sS9xOO0mPPLLhygVgg+nS\ntrkeO2/3RNek7N+vWYtXqEvbDTSpKYANqro66f1J0iZm9rGkZyWdH0JYIklmdpWZZQPWEyQdLWkn\nSZMz00zdmtm3naTXzOwdSVMltZI0RFivsoMR4n52w6/8gdJ9+/p8LC1aeOej556TfvazDV/Ivfby\nYajDhnkw2qKFD+0sLvaREXfe6UNG27SpOC8AtVKXts2jvvMZDYrqEaACdZiFqj6LLKVKSkrC3EJP\nP0JB8b5eWdkmtdx/CAAAAOvCzOaFEEoqk5YoZCPGYASgDvrHP7z1YbPNfDi7mS+jRtV0ydJl8ODo\n3pjxnE0ghZgOdyOWOxgh3kc1OxiBpjSgFrnzTn90MADUAQSpGzEGIwB1zN13J9d32cWfsSn5pJeI\n7Lyzz/yflZ31H0Bq0CcVAOqKTp2iZut27fxxPQCQIvRJBYD1YfFi6eqr/dmMrVr5wy7atvWZMK64\nwp/fmLV2rfTww/5Q9i239LSbbuqPGP71r6V8X6onTkz2kxw+XJo9WzrzTA86GzWStttOuumm5HHZ\n/pXxfpXz5yfzilu5Urr5Zp+1o7jYH9jRurU/F3Ps2PzX3r9/Mr81a/zxRT16+CwgvXsn00+f7s8Z\n7d7dn73ZuLE/8ujcc6UZM8rmP2tWMv/Bg/1+DxniT5Jr1MiD8GHD/KHu+Sxf7vdmwACf7aNhQ7++\nPn2kX/7Sn6iXe8/y9Un94gu/94cf7s8szd6j5s2l7bf3a8j34HsA1SuEUCeW9u3bBwBYb8aPD6FV\nqxD8mWf5l6++8rTLl4dwwAHlp23RIoSnnkqeY8KEZJqDDgph003zH3/VVdFxp51W/rmkKO2MGSF0\n7Vp+2hNOCKG0NFm2vfcumya+3qtXlPbOO0No2LBw/o0bh/Dww8n8Z85MptljjxBKSvIff8YZZd+f\nyZND6NCh/OuaPLnwPZs5M9r30ksV38+GDUN49NFyPjAA8pE0N1QytqNPKgBU5MMPpSOOkFbEnmy0\n2WZSr14esrz3XrKW7swzfV7hrBYtvH/o/PlRDdw33+j/t3fn4VVVZ9/HvysJ8yxBRCiDAiqIiFYp\nVs
BaFRQt4oC0DqCitLVVFMurdagWO/i0itrCU+3jUEsBEaUOCFi14IDiCEZQQBQEjCFMCigRkvX+\ncZ/D3vvkJJzAIdnA73Nd+2LPZ53FdGete63F+efDO+/Yqm7pzJwJubnWcvvll9HWuzvvtAXRGzYM\n8itnzLBWUrDF4FOXCd661VpLwy2Z3btb3uoHHwStiY89Zq2Wf/hDxXXy2GP2Gd/9rpVx61Y7/8IL\nMGKEtSQD1KtnLc25uTaf8ddf270XXWSrxx11VPr3v/aa/XrssTZLwVtvBdceftgWFenQwY7XroX+\n/aGoKLinQQP7bvXqWb198UXF36UibdvagiLNm9v3+ewze5f38O239j3797d6EJHsyzSajfumllQR\n2WOGDIm2og0c6P3GjcH17du9nzzZ+y1bvH///ei97dp5v2pVcO8tt0SvX3hhcC21JRW8f/ppu1Za\nai2r4Wv//W+0nOGWxHbtyn+PceOiz0+YEFwrLfX+/PODa3XqeF9cHFxPbUnt1Cna+lhSYr8ed1xw\nT9u23n/+eXDPihXeN2sWXD/33OBaaksqeH/ffcH1K6+MXnv44eDaDTdEr/XqFf1c772fOdP7lSuD\n48paUjdsiB6HjR8ffW769PT3iUhaqCVVRCRLyspg+vTguH59eOghyy9Nys2FCy6w/Rkzos9fc421\nxiX9+teWN/nVV+nvD+vdG846y/ZzcqzVbubM4HphYfrnKvLss8F+Tg5Mm2Zb0ooVwX5JCbz0Egwe\nnP5dY8ZEZwyoXdtyOcMtnrm5lpcalpsb7M+aZfWbk2Z4RLt2cNVVwfEZZ8ADDwTH4e/+9NPRZx9+\n2HJ4w/r1S/890mna1JZevuMOeP11a0HdssXC0lSLF1vZRCTrFKSKiFRm3TrYtCk47tIFDjig4vtT\nJ4Xv2jV6XLcuHHoovPeeHa9fb+9vlGbKtx49oscNG0aPS0oqLXqlZSsrgyeeqPz+cNCaqnfvyt8P\n8OmntlVk82ar3xYtyl/r3j0avFb23cOfm59vg512x6OPwqWXBikLlUn+sCEiWacgVUSkMulaz6oi\ndWR9Vd7ZrFn0ONwKuSuq+l3CObipDjpo99+f/Ix0QWpVvvvu/h6FlZTA1VdHA9QOHeyHjTp1oLgY\nXn55z3y2iERoCioRkZClRZuiSwLn5+PDrXiLFkUHSaVq1y56nDpV0dat8MknwXGzZulbUfeEcPd8\n3bpBF3ZF2223Vfyuirrowy65ZGdj5LOzyEByABXYIKrFi3f9XQsX2iC1pLPOgmXL4JlnYOpU+NnP\ndv3dIlIlClJFRBKWFm1i0Pi5jJw8f0egus3Du117BTd9/TVcdlk0kCkthUcesWupI+rvuy86qf4f\n/xjtIk69f08K505u3QqjRpVPGfjmG3jqKTjzzKq//6CDbE7SpMcei85ykLRypc2xOmZM1T8jndSy\nXnZZ+Xzdp56yz92Z1DlY69ULWsPXrbPfPxGpFuruFxFJaJ/fgL6dWzC9wAKcuwZ3Z9SUBSw+chDT\n579CnZLENEtPPWUtgMkJ7BcutG7gs8+2KZXOO89a3cByMrt0sWmivvjCpnpKqlvXBlJVl8svh7Fj\ngzzRv/3N8lK7d7eyFBZa+aqa6xp2xx0wYIC1kpaU2IClrl2ttfPbb236q+TnDx26+98JLNh++GH7\nPQCYOxc6dbLfn3r14MMPYfVqywNOLhNbka5dbXBcciqvKVNsYYKWLWHevOgPJyKyRylIFRFJqJWb\nwz1DLPCcXlC4I1gd8IOe5J47DS66MOjq37jRVohK56GHYMMGePFFO/7yS5s/NKxhQ5gwofzAqj2p\nfn2bTeBHP7LACyywSy0bpO/Oz8Tpp8P//q/ldX77rZ1buDD9Ck15Wfov6MADbdaDs88OWku3bAnm\nWq2KBg0szWH06ODc/Pn2a+3atuLV7bfvdpFFZOfU3S8iElIrN4e7BnePnLtrcHfyTu9vLXK33w49\ne9o0RXl5NuinZ0+46SYLcMByTJ9/HiZNslbFgw6yZTUbNrSW1uuvt6Bt4MDq/4KHHWYtiuPHwymn\nWICXl2ctjh06WA7m2LGZdY1XZMQI+37XXmuttI0b28CnJk3sePhwW351/Pjsfa9jjrFW4Lvvhr59\nbQL+vDybieHoo+G662xy/kz86leWvtG9uwWmzZrZ7+Prr9vysCJSLZzfR0YmtmnTxq9Ktxa2iEgV\nbCstY+Tk+TtaUQEGdGvFPUOOplaufq4XEdkdzrnV3vs2mdyrf3FFRBLCAeqAbq34aEx/BnRrxfSC\nwshgKhER2fOUkyoikrB87RbmLCmOtJwmc1TnLClm+dotdGpZTdNFiYjs59TdLyISsrRoE+3zG0S6\n9reVlilAFRHJgqp096slVUQkJF0gWis3RwGqiEg1U06qiIiIiMSOglQRERERiR0FqSIiIiISOwpS\nRURERCR2FKSKiIiISOwoSBURERGR2FGQKiIiIiKxoyBVRERERGJHQaqIiIiIxI6CVBERERGJHQWp\nIiIisufNng3OBdttt9V0ifac8Pc86aSqPdu+ffBs+/Z7oHBp3HZbtMyzZ1fP5+6EglQRERERiZ28\nmi6AiIiI7AdatIBzzw2Ou3SpubLsaeHv2bVrzZVjL6cgVURERPa8rl1h6tSaLkX12F++5x6m7n4R\nEZH9zSOPRHMQH3kker2iHMV0eaUrVsDll0OrVlCnDhx+ONx7b/nPrCgn9dVXo+fTPfvBB9F7br01\nen31arjhBujRA5o0sXK0bQsXXwzvvpu+DlLzRjdsgKuvhnbtIC8PRo60+8rK4KGH4Ic/hJYtoVYt\naNQIDjkE+vWzsixcWPm7U61bB1ddBa1bQ9261qp8991QWpq+rEnvvgujRsHJJ9vnN21q5TngAOjZ\nE37zG1i7tuLnH3zQ6qhePfsuQ4fCqlWVf2YNUkuqiIiI7Jp58+Cee+DLL4NzixdbgPfVV3DLLTt/\nx4knQqdOsHSpHU+aBNdcE71n0qRg3zkLrpKmT4ef/MQ+L2zlSpgwASZOtAAw9Z1hGzfCCSfARx+V\nv3bFFRakhm3ebNunn8Lzz0Pt2pl3669ZY985+X0BPvzQgs9XX608UH3uOfsuqTZsgDfftO3+++09\nHTtG7xk5MvoDwNat8OijMGOGBeAxpJZUERER2TUzZ1qwdsIJ5YO0O++0a5kIB53z5sEnn0SvP/ZY\nsN+nDxx6qO0vXAjnnx8EqHl50Ls3nH46NGtm58rK4NprrawVWbDAAtQWLeC00+D44yEnx1oZwwFq\n8+bQv7+1oHbpYq2gVXX11dEAtWFDCxIPOwymTdt5y2ZOjrVW9+kDAwdaWb7zneB6UZF9RtjMmeVb\nqI85xt7x5ZcweXLVv0c1UJAqIiIiu27aNHjtNXj/fQvgkrZsgbffzuwdQ4da8JUUDpreeguWLQuO\nL7002L/9dvjmG9tv3NjK8PLL1uL4ySdBMOs93Hxz5WUYOBC
WL4dZsyxQ/uMfLY0gbMECa3mcOdMC\n5PXr4dlnLdjLxKpV8PjjwfEBB8B778ELL1hr6ogRlT9/8cVQXGz3zpkD//63lWXFCrjgguC+WbNg\n06bgeOzY6Hvuvhveecfe8eKL0bqPkXiWSkREROKvd2846yzbz8mJBqkAhYWZvadNGzjllOA4HKSG\n9xs2hPPOs/2yMgsYk+rVs/SC886zbfhw2L49uP7OO9bVnk5eHvz1r1C/fnCudu3yXeajR1v6wNtv\nWxBYrx4MGJB5kDpnjpU7aejQ4DOcs6C7Mu3aWW7vOedAhw5WXues7sOtzWVl8PHHtr99uwXuSS1a\nwC9/GRyfeKK1HseQclJFRERk1/ToET1u2DB6XFKS+buGDbP8ToCCAmup7NIFpkwJ7hk8GBo0sP21\na6PpBEVF8MQTlX/GihVw4IHlz3foYIFyqubNbYDTuHF2PHGibWDB4ZFHwpAhlu8ZDnArsnJl9PiI\nI6LHLVtamsKGDemfHz7cBj9lIpkCsXat5Z8mdepkQXlqOSpLh6ghClJFRET2d6mDdYqLM3sumfeZ\nlJu762UYNMhGq2/caMeTJlkLXzhHM9zV733VP2PLlvTnDzqo4mfuu89Gzk+caGkAyQDSewumCwqs\nyz7cjb8nzJtXPkDt0cNaV3NzYdEiSwNI2pX6iRl194sk7E8r9onIfq527ehxasvdvHnVV5akunWt\nVTJp8uRoV3/HjtY1nZSfH7SqgnW5e1/5VtESpZXlZObkWC7ojBmWg7puHcydG6QdgM2LWlS08++Y\n2lobDirB0hEqakV97bXo8Z//bFNSTZtmn19RykF+fnSA19Kl0TQISD+rQQwoSBUREdnHLS3axLbS\nUC5kasvhlCnw9de2P26c5W/WhGHDgv1ly6Ij68OtqGCth/36BcevvAL/+Ef5dxYXwwMPlB/xnonN\nm22WgmR+J9hgp169yuffLl++8/f17RsNiB99NDqTQWWtI6mBZTi9YP58+Ne/0j+XlxcN7ouLg/QF\nsIB71qydFr0mqLtfRERkH7a0aBODxs+lb+cW3DPkaGrl5rDtmGMpq1OXOiWJXMW33rLAtU6dyieD\n39N69rQ81EWL7DiZ05qTA5dcUv7+W2+1eVJLSqyldNgwGDPGpnMqK7N5TJcutf2+fatenq1bbZGA\nG26wiffbtLG80XXroq3Nubk2uf7OfOc7tmRqMjVg3To4+mj73qtWVd6iedxx0eNf/tLeU1ZmgWZq\nEBt27bU2g0DSyJHwz39aDvHrr0cHc8WIWlJFRET2Ye3zG9C3cwumFxQycvJ8tm4rZeT0Zfzt2EHR\nGzdtsgA1P99Gj9eUcGtq0qmnph/Y1L27jWpv3Dg4t2yZTUE1c6YtLJAMwFIHC1XV6tUWmD79tHW9\nh4PCG2+0UfOZ+MtfgqmxwOr9hRcsQD35ZFu5K50f/ADOOCM4Li2F//7XZgxo1QquvLLizzzjDBsA\nFpacgqpu3WCGhphRkCqSoc2bbWq5Pn1swGetWvZr37624EpqPv4ppwT5rS1apP8h99xzg3saNIhO\na+c9PPmkTd3XurU1cDRpYr1MY8cGUwOKiFSmVm4O9ww5mgHdWjG9oJDDb5nJ9IJClvxsFKV33W2t\njrVrWwvhpZda13G3bjVX4IsvLh9QpgtckwYOtNzOm2+21samTa1ls1EjW2Dgoous1fCpp6pelqZN\nbdWqESNs8vtWrayuksuuDhoEzzxjrbeZatkS3ngDfvpTa72uXRs6d7Z3zJhRPl847Mkn4aabbLBU\nrVpWnuHDg5bwyvzlL5b2cNRRVv78fJtb9Z137LvFkPP7wOgvgDZt2vhVMV5/VuJv9mz7QTXpN78J\n0oM++simwktdBCWsc2frdUpOefevf9m/jUkzZkRTmDZtsplQkjODXHJJkE71zTe2iMr06RV/3lFH\n2TsPPjjTbygi+7Ot20o5/JZgmqGPxvSnbq3dGI0vsgucc6u992maxctTS6rITnz9tfWUhAPU1q1t\nZpRwgLhkCZx5Jnz7rR2fc060Byq89DTYQiHhqevCYwJGjIgGqO3aWZAc/mH3/fdtcOk+8nOmiOxB\n20rLGDVlQeTcqCkLooOpRGJGQarITjz4oOXeJw0YYClPs2bZr6eeGlxbvNgGa4ItRBJepS41KA3P\nrNKhQ5DTv3Ch9S4lXXmlBcjPPmu9MvfdF1x7/fXKW1tFRLaVljFy8nymFxQyoFsrPhrTf0fX/8jJ\n8xWoSmwpSBXZifCqe2BpQ3Xq2H7duuVXsXvuuWA/3Dr61VfBtfXr4T//Ca4NG2Z5qWBBZ7h1dOlS\nW2QludJfannCnycikmr52i3MWVLMgG6tuGfI0dStlbsjR3XOkmKWr61ggnuRGqYpqER2Ijz1nXM2\nO0pY167R4xUrgv1evWxMwuLFdjxpkqUBTJ0K27YF7xw6NP3ngQ3erEz480REUnVq2YhpPz+B9vkN\nqJVrbVPJwVTL126hU8tGNVxCkfTUkiqyh4UHpT77rA2YCnf1n3yy5ZwmVTXHtKJV/kREkjq1bLQj\nQE2qlZujAFViTUGqyE6kBpCpq9gtXFjx/WCj9pPLWW/dCvffb1PTJaUuotK+ffT4pZcqX+Vv9uyq\nfiMREZH4y0qQ6pyr75yb5Jz72Dm3xDmXdhZg51x759x259z80HZo6PqZzrmPEu95wjnXMBvlE0lV\nbolAYHsFgwdOPz16fOutwQj+rVvL56Sm3n/wwdHBVTffHMwt3bhx+Tmzw3M1gy10UlwcPZdcYGTE\niJpZYltERGRPy1ZL6vVAife+I9APGO+ca1bBvRu990eHtmUAiYD0QeDsxHsKgZuyVD6RHZJLBIZH\ntW4rLeOvL32c9v7hw6Oto888Y4uF9O9vc6KGlzzu2DH9yn3h1tLkKn8AQ4bYLABh3bpFZwV4802b\nM7pPH5uzundvWzr6+9+3eZk1qb+IiNC+fbA6TGqX3F4qW0HqBcA4AO/9p8DLwMAqvuN04G3vfXLh\n2vHAj7NUPpEd0i4ROHk+b3y6Lu399evbiPvw3/lVqyw4Xb06OHfooXZfcuR/2MCBFlimSu3qT3rw\nwWiL6tat8Morthrfq6/Cl18G13I1F7eISPVJBoLOwUkn1XRp9mnZGt3fFgiPMV6eOJdOY+fcW0Au\n8G/gd9770gre0do5l+O9L9cP65y7DrguedykSZPdKb/sR5KjWgGmFxQyvaAQgO916ERFi+Z17WqT\n599/v813umiRTSnVuLFdGzTI5jNtWEGCSp068OMfw7hxwbnDDoPvfS/9/Q0aWMD79NO2mt+bb0JR\nkXXz5+fbs71727Kq3bvvYkWIiMi+44wzYM0a2z/wwJotS5ZktCyqc+4V4IgKLvcAFgGHeO+LE/f/\nCdjkvf9tynvqAE2892uccwcAjwH/8d7/j3NuVOIdVyXurQ98CdRJF6Sm0rKoUlVaIlBERKosOak1\n2CosGr1aJVlfFt
V739t7n1/BthL4DGgfeqRd4lzqe0q892sS++uBh4Deicup72gPrM4kQBWpKi0R\nKCIiVXLSSdEAFWyqlnD3f3jOwfXr4Xe/sy6zZs2gdm0bSXvuuRVPgJ2aV7ptG4wda4MV6taFFi1s\nYu2iop0/G1ZWBg89BD/8IbRsCbVqQaNGcMgh0K+fjQhOnaomBrKVk/o4kGwB7QD0BZ5Ovck5d6Bz\nrlZivw5wDvBe4vJM4Djn3OGJ458Dk1PfIbK7tESgiEjNmD07GtPddltNl2gPefNNPm9+pE3nMm8e\nbNxoAWdhITz5pE2Qff31lb9j61Y47TS47jr44AMbdbt2ra29ffLJ0XW2d+aKK+Dyy21OwzVrYPt2\n2LzZ1vx+/nlbSnHatN37zntAtoLUPwH1nHMfA7OAqxItpTjnfuuc+2nivhOB95xzC4B3gS+A3wF4\n7zcBw4F/J97TGvh9lsonsoOWCBQRkSrr29daQcPy8+1ccjvuOGvlHDCAg7HxDmU4FjXqaTmjBx0U\nPHvXXYxw9+NcBRkDRUV2oUMH+MEPrCU2adGi6KowlVm1ylpRk5o3t+lp+vWzJRTr1s3sPTUgKwOn\nvPdbsBH+6a7dGtp/Eniykvc8TZoWWJFs0hKBIiI1o0WLaJyXusx0rCUnxQ53+Xftautch40ebS2e\nQCk53NZ3Njl9e9vjJSXWCjp3rr2S3/B/DMfGkqcxeDBMnGjTuEyaBD/5SXBt9uxoekFFwtPQACxY\nAK1bB8fffGMtrI3i939ftkb3i+xV0gWiWiJQRGTPShfT7XOefXbHbm7D+ozJvxcW3gvnJU5u2LDj\n+kEUcSzvAMenf9cf/hDMM5i60kthYWbl6dgxejx6NAwYAJ0721QxjRrZcQxpWVQREZH90COPRPND\nH3kkev2226LXk13S6fJKV6ywlMdWrWzKvcMPh3vvLf+ZFeWkvvpq9Hy6Zz/4IHrPrbdGr69ebSv0\n9egBTZpYOdq2hYsvhnffTV8HqVOebtgAV19tC7jk5cHIkXZfeNxR2Bvz0ow7Wr48uGHzZnjiieiW\nsrZ2O1bwgx9YGZaHJuL8yjXGHXoIzsEJJ1B+jsOSEo44Iij/6s/Tf0eaN4errgqOJ06ECy+01IQm\nTeCoo+D3v4evv67gBTVHLakiIiKyy+bNg3vuiS4ysnixBXhffQW33LLzd5x4InTqBEuX2vGkSXDN\nNdF7Jk0K9p2zQe5J06dbT/hXX0WfWbkSJkywuOzuu8u/M2zjRgsGP/qo/LUrroimdSZt3Wrjjp5/\n3lJGu3at/Hum04D04yB802aQaHR9/XUoWJRLt9D1zZujZW3UCFhfwYfcdx/07GkVMW9e0JrrPRQU\n2Pbee/D441X/AnuQWlJFRERkl82caQHTCSeUD9LuvNOuZSIcdM6bB598Er3+2GPBfp8+tsofWAvm\n+ecHAWpeni12cvrpNvMTWEvotddaWSuyYIEFfS1a2KD644+HnJzy447C6tdPM+4otI72Ctri8JzU\n14P3LPzAc965ni5HeBy2/YNh9Oljubr16wevadgwenz//dGPWVMc7NepY4vAVCgnx5qUZ8ywqbHW\nrbO82PPOC+6ZOjX91FY1SEGqiIiI7JZp0+C112xlvv79g/NbtsDbb2f2jqFDLZZKCg9ef+stWLYs\nOA4vKX377Tb2B2wVwPffh5dfhuees0A3Gcx6bzNCVWbgQOutnzXLAuXLr93Eis+i0xKW1a23Y//4\n1p+zfr2lofbpkzgZyh1tx2fcxB24xJTvyZzciwd+xY+ZyD+5aMd3mDoVDmwRfE5uDlxySXA8YUK0\nrMVrgv2zz7b7FD7u8gAAFHFJREFU09q82X5a+Pjj4NwBB0CvXtHfLIimKsSAuvtFRERkl/XuDWed\nZfs5ORb3hFssMx3f06YNnHKKdZ2DBam//nWwn9SwYdAAWFZmjYNJ9eqVTy/Yvj3Yf+cdmyY03aqh\neXnw178GrZdLizZxwf/NpWerg4Bg/enFOYdyBB8kblpKvRN6MOCQQywHocGNMGoUPPzwji71O7iF\nNfPuh/5dbRDUypWMXvghuWxnOe2ozNVXWwuq99F0CoBvQtOkXn458EYFL9m61ZJ1b7jBRvW3aWMT\n+q9bZ5F4Um6uTe4fIwpSRUREZJf16BE9TjO+J2PDhgVBakGBdeV36QJTpgT3DB4cdG2vXRtNJygq\nsrFJlVmxIn2Q2qGDxW9J7fMb0LdzC6YXrOLIU1vxwX/sob99PZx7GRncOH++bckvcOyxMH06hSec\nQyu+AODAklUwK1i6PTnh1PadhGFHHAGnnhrUSTrt2pUf0FWh1avLT0mVdOONlusQIwpSRUREhNLS\n6HFxcfr7UiXzPpNyK5jyMxODBkHTpjaICWyw1GmnWV5oUrir3/uqf8aWCtZrCc+zD8H82QDP+rdo\nXrs1WxYdzP1FP8d97bmcB+nIx9QjzcpPvXrRhUVcwd85k2fpnreIJn6jJY+2acPCvO7cu+gUpnJe\n+WdTXHNN5UHqpZdG0yTKadrUcgVeecXyJgoLrRXVOWtRPfZYuOwyOPPMnZaluilIFRER2cctLdoU\nWcAEICevjPDQlND0nUC0J7i61K0LQ4bA3/5mx5Mn2zifpI4dbSaApPx8a1VNBp59+sCcObv22ekC\nvVq5Odw1uDvTCwppeORqGh65mo/G9OfrTSNZvHgkl9wdnff1i+OgZWJ/I834E6P5E6Pp+/3oqlJT\nb4e/35amEGlyQk8/3aY0XbIEHJ6ePYPfm5yc0Hz+FeWT5uXZlFMXXljJt48nDZwSERHZhy0t2sSg\n8XMZOXk+20ptAM+20jKeWPRx5L4pU4KpMseNs/zNmhBeRGnZsujI+nArKlirbb9+wfErr8A//lH+\nncXF8MADluNZFdtKy7j60QK+fOMQtm2wZNVRUxbQqEnZbo07qlcvelxRDzxYg2e43OEfHk45JTKZ\nwD5HLakiIiL7sCC30kYw3TW4O6OmLOC9b9aQV+dQtpdY//xbb1mXd506O1b1rBE9e1oe6qJFdpzM\nac1JGe2edOutNk9qSYl1/w8bBmPG2GJKZWXw6ac2/2pZGfTtm3k5tpWWMXLyfGYsWMvGOaexcc4R\nNGj2Lf9XdwvP3f0VB9dtwptvBkukVmXcUefO0eOf/cymMK1Xz2YjuPPO6PWhQ+Gmm8oPnrr88sy/\nz95ILakiIiL7sGRu5YBurZheUMjht8xkekEhZ333QP7fr6JhwKZNFqDm58M559RQgUm/JP2pp0YH\nNiV1725zqDZuHJxbtsymoJo50xYWKEvMIpVXhaa55Wu3MGdJMad2abnj3JYNtfm2sBnL323K3Lku\nMnNAVcYdnXYaHHxwcLxpk5X3iSfgxRfL39+wYfmAtHlzmzJrX6YgVUREZB+XzK0Mu2twd8b81jF2\nrLU61q5t42guvdQGq3frVsHLqsHFF5cPKNMFrkkDB9pqozffbKt9Nm1qLZu
NGtncpBddBP/8Jzz1\nVOZl6NSyEdN+fgLjLu3GhAkwYgQccwy0auWpVdvvWHZ10CB45hlrvc1U/frw0kv2g0CLFjsZ+JTw\ni19Y13/SRRdZq/e+zPldGRoXQ23atPGrwsP/REREBAi6rpNd/gADurXiniFHRwZTSXy99ZatgpX0\n/vs1+4PErnLOrfbep2kTL085qSIiIvuwcIA6oFurHTmpyYBVgWp8rVxpqQwbNsCjjwbnTzll7wxQ\nq0pBqoiIyD4smVsZbjlNzv85Z0kxy9duoVPLRjVcSkln2TL41a+i5xo0gHvuqZnyVDcFqSIiIvuw\nZG5leJ7UZKCqAHXv0bw59Oplua9du9Z0aaqHclJFREREpFpUJSdVSSgiIiIiEjsKUkVEREQkdhSk\nioiIiEjsKEgVERERkdhRkCoiIiIisaMgVURERERiR0GqiIiIiMSOglQRERERiR0FqSIiIiISOwpS\nRURERCR2FKSKiIiISOwoSBURERGR2FGQKiIiIiKxoyBVRERERGJHQaqIiIiIxI6CVBERERGJHQWp\nIiIiIhI7ClJFREREJHYUpIqIiIhI7ChIFREREZHYUZAqIiIiIrGjIFVEREREYkdBqoiIiIjEjoJU\nEREREYkdBakiIiIiEjsKUkVEREQkdhSkioiIiEjsKEgVERERkdhRkCoiIiKyj2vfHpyzrX37mi5N\nZhSkioiIiIQkgznn4KSTaro0+6+8mi6AiIiIiOxZZ5wBa9bY/oEH1mxZMqUgVURERGQfN358TZeg\n6tTdLyIiIoJ17TsXPTdnTrT7f9iw4Nr69fC738H3vgfNmkHt2nDwwXDuufDf/6b/jNTc0G3bYOxY\n6NYN6taFFi1g6FAoKtqzz4aVlcFDD8EPfwgtW0KtWtCoERxyCPTrB7feCgsXZlaH2aSWVBEREZEq\nevNNOPtsKCyMni8shCeftG3UKPjznyt+x9atcNppMHt2cK6kBB59FN5+G955x4LPbD+b6oorLEgN\n27zZtk8/heeftwC8a9fM3pctClJFREREgL59IT8fnngiOJefb+eTjjvOWioHDIC1a+2cc3D88dC8\nObz7LnzxhZ2/6y7o1AlGjEj/eUVFtnXoYK2br70G335r1xYtgsmToy232Xo2bNWqaIDavLl9R+9h\n5Ur45BMLiGuCglQRERER4Pbb7ddwl3/XrjB1avS+0aODADUnx1oze/e245ISOPlkmDvXjn/zGxg+\nHHJz03/m4MEwcaJdnzQJfvKT4Nrs2ZUHmrvzbNLq1dHjBQugdevg+Jtv4KWXrPu/umUlSHXO1Qce\nBI4DyoAbvPdPprnvFCDc8H0g8IX3/pjEdQ8UJN4B8Evv/SvZKKOIiIhINjz7bLBfvz7ce69tSRs2\nBPtFRdb1fvzx6d/1hz8EAewZZ0SvpaYSZPPZpI4do8ejR1srcefOcNhhFpwOGJDZu7ItWy2p1wMl\n3vuOzrkOwOvOuf967zeEb/LevwAcnTx2zj0LpKYWn+C935ylcomIiIhk1fLlwf7mzdH0gHRWrEgf\npDZubIOTkho2jF4vKan4nbvzbFjz5nDVVTBunB1PnGgbWIvykUfCkCEwcqQF5NUpW6P7LwDGAXjv\nPwVeBgZW9oBz7mDgZOCfWSqDiIiISOxs2ZL+fLNm0eOKUgKy/Wyq++6zAVf9+0ff6z0UFMBNN9ms\nAdUtW0FqW2BF6Hh54lxlhgIzvPdrUs7Pds4tcM7d7ZxrUNHDzrnrnHOrktvmzWp8FRERkapZWrSJ\nbaVlO78xpF27YL9tWwvmKtsyyQ2tSTk5cPHFMGOGTau1bp3l1J53XnDP1Knpp7bao+XK5Cbn3CvO\nubUVbN9J3ObDj2Tw2kuxPNawdt777wInAC2AP1X0sPf+bu99m+TWMLWdW0RERKQSS4s2MWj8XEZO\nnr8jUN1WWkZe7SBo/fzz8s+F8z8/+wzuuMPmGg376ivrNr/ooj1R8uzZvBnuvBM+/jg4d8AB0KuX\ntayGhdMcqkNGOane+96VXXfOfQa0B4oTp9oBz1Vyfx+gPjAr5XM+S/y6xTk3Hnggk/KJiIiIVFX7\n/Ab07dyC6QU2yuiuwd0ZNWUBrumhsKYJAEuXQo8elv/pHNx4o81/+vDDwQCpW26B+++3mQByc23q\npg8/hO3bo62ucbR1K9xwg22tW0ObNjah/7p1MG9ecF9ubjQHtjpka+DU48BVwLDEwKm+wE8ruf8y\n4BHvfWnyhHOuGTb46mvnXA6W5/pelsonIiIiElErN4d7hth47ukFhTuC1eNPa8FrE5rsuG/+fNvA\nuu6PPRamT4dzzgnmRF21yrZUeXvRZJ+rV5efkirpxhttRavqlK2c1D8B9ZxzH2Oto1d579cDOOd+\n65zbEbA65xoB5wIpaxtwOPCGc24BNg1Vc2BklsonIiIiUk6t3BzuGtw9cu4/Dx7M2LE2sr2iVZt6\n9bJJ8++80+ZIbd7cWhvr17fpm84/31pX33yzGr7EbmjaFCZMsAUHjjkGWrWy1aXq1LF820GD4Jln\nYMyY6i+b897v/K69QJs2bfyqdD/CiIiIiFRgW2kZIyfP39GKCjCgWyvuGXI0tXKz1ZYnSc651d77\nNpncq9oXERGR/VI4QB3QrRUfjenPgG6tmF5QGBlMJTVjL8qUEBEREcme5Wu3MGdJcaTlNJmjOmdJ\nMcvXbqFTyxpYD1QAdfeLiIjIfmxp0Sba5zeIdO1vKy1TgLqHVKW7Xy2pIiIist9KF4jWys1RgBoD\nykkVERERkdhRkCoiIiIisaMgVURERERiR0GqiIiIiMSOglQRERERiR0FqSIiIiISOwpSRURERCR2\nFKSKiIiISOwoSBURERGR2FGQKiIiIiKxoyBVRERERGJHQaqIiIiIxI6CVBERERGJHQWpIiIiIhI7\nClJFREREJHYUpIqIiIhI7ChIFREREZHYUZAqIiIiIrGjIFVEREREYkdBqoiIiIjEjoJUEREREYkd\nBakiIiIiEjsKUkVEREQkdhSkioiIiEjsKEgVERERkdhRkCoiIiIisaMgVURERERiR0GqiIiIiMSO\nglQRERERiR0FqSIiIiISOwpSRURERCR2FKSKiIiISOwoSBURERGR2FGQKiIiIiKxoyBVRERERGJH\nQaqIiIiIxI6CVBERERGJHQWpIiIiIhI7ClJFREREJHYUpIqIiIhI7ChIFREREZHYUZAqIiIiIrGj\nIFVEREREYkdBqoiIiIjEjvPe13QZssI5VwIU13Q5qqAhsLmmC7EPUD3uPtVhdqged5/qMDtUj7tP\ndbj7KqrDFt77Opm8YJ8JUvc2zrlV3vs2NV2OvZ3qcfepDrND9bj7VIfZoXrcfarD3ZeNOlR3v4iI\niIjEjoJUEREREYkdBak15+6aLsA+QvW4+1SH2aF63H2qw+xQPe4+1eHu2+06VE6qiIiIiMSOWlJF\nREREJHYUpIqIiIhI7ChIFREREZ
HYUZBaTZxzlznnCpxz251zv9jJvWc65z5yzn3snHvCOdewusoZ\nZ865+s65SYl6WeKcO6eSe693zn3gnJvvnHvDOXdcdZY1zqpYj22dc8845xYn/kz+sjrLGldVqcPE\n/c4596Jzbm11lXFvkGk9Oue6OedeTvwZLHDOPeCcy2gy8H2Vc66Tc25uot7edM51qeC+m51zyxLb\nmOouZ5xlUofOuQucc+8l/j8p0L+BUZn+OUzc28I5V+Scm5rp+xWkVp93gMHAxMpuSgSkDwJne+87\nAoXATXu+eHuF64GSRL30A8Y755ql3uSc6w78Evie9/5o4K/AuGotabxlWo8OmAY86r0/DDgCeLxa\nSxpfGdVhyC+A5dVRsL1MpvW4FfiF9/5w4GigCTCq+ooZS/cDD3jvOwP/g/2/EeGc6wP8GDgK6AKc\n7pzrV62ljLed1iGwCjjde38kcCJwjXPu+9VYxrjLpA6TxgPPVeXlClKrifd+gff+Q6BsJ7eeDrzt\nvf8ocTwe+0dG4AISwab3/lPgZWBgBffWAhok9pti/9CIybQefwh8471/PHGv995/UW2ljLeM/yw6\n5zoBQ4A/Vlvp9h4Z1aP3fqn3/v3EfinwFnBINZYzVpxzBwLHABMSp54AOjjn2qfcegHwiPd+i/e+\nBHgI/X8CZF6H3vvXkv/uee+/BD4COlRfSeOrCn8Occ5dCBQBc6ryGQpS46ctsCJ0vBxo7ZzT71X6\nummbepP3fgE2P9unzrlVwLVYy6qYjOoRa3kpds5NTnR3TXPO7beBQYqM6jDx9/bvwFXAtmop2d4l\n0z+LOzjnGgDDgWf2XLFi7zvA59777WA/QAKfUb7uqly/+5FM63CHRFd2L+Clailh/GVUh865g4Hr\ngBuq+gF5WSikAM65V7Du0HR6eO9XVuF1++XktTurw8Sv4bpxFbynHfAj4FDvfWEiB/hfwElZKmqs\nZasesdboU7C0iYXOuSuBycDxWSlojGWxDq8HXvbez0/XurCvy2I9Jt9XC3gMeN57/9Tul3Cvlvr/\nREV1l3H97ocyrUOcc22Ap4Cfeu8/36Ol2rtkUod/B0Z77zdbFlnmFKRmife+d5Ze9Rlwcui4PbDa\ne7+zNIG93s7q0Dn3GVYfxYlT7Uif33I+8IH3vjBx/DBwn3MuN9FVuE/LYj2uAN7z3i9MHE8A/nd/\nqMcs1mEf4Cjn3CXYv7fNnHPLsR9cN2StwDGVxXpMBqhTsDz9a7JXyr3SSqCNcy7Pe789kT/+Hez/\nj7Bk/Sa1S3PP/irTOky2BL4A3JFMfxIg8zrsBTyYCFAbAvWcc7O89zvNj1YXcvzMBI5zzh2eOP45\n1nolNmjnKgDnXAegL/B0mvs+AU50wawIZwEf7uuBVRVkWo8zsFST1onj/ljwr3rMsA6992d679t6\n79tjgy42eO/b7w8BaoYyqkfnXB727+B64Eq/ny+V6L1fA7wHXJQ4dS6w3Hu/POXWx4GhzrkGidkQ\nLkP/nwCZ16FzrhXwInCn9/4f1VrImMu0Dr33ByT+3WuP9S7NyCRATT6srRq2xG/iKmALsCGx3yNx\n7bdYF0Ly3h9hydkfY6OrG9d0+eOwYQOhHkvUyxLgvNC1HXWIdTf8IVGHC4BXk3WtLfN6TBz3A+Yn\n6nEO0LWmyx+HrSp1GDrfHlhb02WP01aFv9MXYt2KCxJ/HucD42q6/DVcd4cBryfq7e3k302sJfq7\noftuxX5w/wT4fU2XO05bJnWIdVVvCf25mw9cWtNlj8uW6Z/D0P3DgKmZvt8lHhIRERERiQ1194uI\niIhI7ChIFREREZHYUZAqIiIiIrGjIFVEREREYkdBqoiIiIjEjoJUEREREYkdBakiIiIiEjsKUkVE\nREQkdv4/wadfMFHnJzwAAAAASUVORK5CYII=\n", 215 | "text/plain": [ 216 | "<matplotlib.figure.Figure at 0x7f2e98102190>" 217 | ] 218 | }, 219 | "metadata": {}, 220 | "output_type": "display_data" 221 | } 222 | ], 223 | "source": [ 224 | "# get 5 random input words\n", 225 | "src_words = ['university', 'love', 'history', 'tennis', 'research', 'conference']\n", 226 | "tgt_words = ['universidad', 'amor', 'historia', u'tenis', u'investigación', 'conferencia']\n", 227 | "\n", 228 | "# assert words in dictionaries\n", 229 | "for sw in src_words:\n", 230 | " assert sw in src_word2id, '\"%s\" not in source dictionary' % sw\n", 231 | "for tw in tgt_words:\n", 232 | " assert tw in tgt_word2id, '\"%s\" not in target dictionary' % sw\n", 233 | "\n", 234 | "plot_similar_word(src_words, src_word2id, src_embeddings, tgt_words, tgt_word2id, tgt_embeddings, pca)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": { 241 | "collapsed": true 242 | }, 243 | "outputs": [], 244 | "source": [] 245 | } 246 | ], 247 | "metadata": { 248 | "kernelspec": { 249 | "display_name": "Python 2", 250 | "language": "python", 251 | "name": "python2" 252 | }, 253 | "language_info": { 254 | "codemirror_mode": { 255 | "name": "ipython", 256 | "version": 2 257 | }, 258 | "file_extension": ".py", 259 | "mimetype": "text/x-python", 260 | "name": "python", 261 | "nbconvert_exporter": "python", 262 | "pygments_lexer": "ipython2", 263 | "version": "2.7.14" 264 | } 265 | }, 266 | "nbformat": 4, 267 | "nbformat_minor": 2 268 | } 269 | 
-------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | # python evaluate.py --crosslingual --src_lang en --tgt_lang es --src_emb data/wiki.en-es.en.vec --tgt_emb data/wiki.en-es.es.vec 9 | 10 | import os 11 | import argparse 12 | from collections import OrderedDict 13 | 14 | from src.utils import bool_flag, initialize_exp 15 | from src.models import build_model 16 | from src.trainer import Trainer 17 | from src.evaluation import Evaluator 18 | 19 | # main 20 | parser = argparse.ArgumentParser(description='Evaluation') 21 | parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)") 22 | parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models") 23 | parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name") 24 | parser.add_argument("--exp_id", type=str, default="", help="Experiment ID") 25 | parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU") 26 | # data 27 | parser.add_argument("--src_lang", type=str, default="", help="Source language") 28 | parser.add_argument("--tgt_lang", type=str, default="", help="Target language") 29 | parser.add_argument("--dico_eval", type=str, default="default", help="Path to evaluation dictionary") 30 | # reload pre-trained embeddings 31 | parser.add_argument("--src_emb", type=str, default="", help="Reload source embeddings") 32 | parser.add_argument("--tgt_emb", type=str, default="", help="Reload target embeddings") 33 | parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size (-1 to disable)") 34 | parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension") 35 | parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training") 36 | 37 | 38 | # parse parameters 39 | params = parser.parse_args() 40 | 41 | # check parameters 42 | assert params.src_lang, "source language undefined" 43 | assert os.path.isfile(params.src_emb) 44 | assert not params.tgt_lang or os.path.isfile(params.tgt_emb) 45 | assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval) 46 | 47 | # build logger / model / trainer / evaluator 48 | logger = initialize_exp(params) 49 | src_emb, tgt_emb, mapping, _ = build_model(params, False) 50 | trainer = Trainer(src_emb, tgt_emb, mapping, None, params) 51 | evaluator = Evaluator(trainer) 52 | 53 | # run evaluations 54 | to_log = OrderedDict({'n_iter': 0}) 55 | evaluator.monolingual_wordsim(to_log) 56 | # evaluator.monolingual_wordanalogy(to_log) 57 | if params.tgt_lang: 58 | evaluator.crosslingual_wordsim(to_log) 59 | evaluator.word_translation(to_log) 60 | evaluator.sent_translation(to_log) 61 | # evaluator.dist_mean_cosine(to_log) 62 | -------------------------------------------------------------------------------- /outline_all.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/MUSE/3159355b93f5c3c4883808ba785ba9d18d7f5e81/outline_all.png -------------------------------------------------------------------------------- /src/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/MUSE/3159355b93f5c3c4883808ba785ba9d18d7f5e81/src/__init__.py -------------------------------------------------------------------------------- /src/dico_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | from logging import getLogger 9 | import torch 10 | 11 | from .utils import get_nn_avg_dist 12 | 13 | 14 | logger = getLogger() 15 | 16 | 17 | def get_candidates(emb1, emb2, params): 18 | """ 19 | Get best translation pairs candidates. 20 | """ 21 | bs = 128 22 | 23 | all_scores = [] 24 | all_targets = [] 25 | 26 | # number of source words to consider 27 | n_src = emb1.size(0) 28 | if params.dico_max_rank > 0 and not params.dico_method.startswith('invsm_beta_'): 29 | n_src = min(params.dico_max_rank, n_src) 30 | 31 | # nearest neighbors 32 | if params.dico_method == 'nn': 33 | 34 | # for every source word 35 | for i in range(0, n_src, bs): 36 | 37 | # compute target words scores 38 | scores = emb2.mm(emb1[i:min(n_src, i + bs)].transpose(0, 1)).transpose(0, 1) 39 | best_scores, best_targets = scores.topk(2, dim=1, largest=True, sorted=True) 40 | 41 | # update scores / potential targets 42 | all_scores.append(best_scores.cpu()) 43 | all_targets.append(best_targets.cpu()) 44 | 45 | all_scores = torch.cat(all_scores, 0) 46 | all_targets = torch.cat(all_targets, 0) 47 | 48 | # inverted softmax 49 | elif params.dico_method.startswith('invsm_beta_'): 50 | 51 | beta = float(params.dico_method[len('invsm_beta_'):]) 52 | 53 | # for every target word 54 | for i in range(0, emb2.size(0), bs): 55 | 56 | # compute source words scores 57 | scores = emb1.mm(emb2[i:i + bs].transpose(0, 1)) 58 | scores.mul_(beta).exp_() 59 | scores.div_(scores.sum(0, keepdim=True).expand_as(scores)) 60 | 61 | best_scores, best_targets = scores.topk(2, dim=1, largest=True, sorted=True) 62 | 63 | # update scores / potential targets 64 | all_scores.append(best_scores.cpu()) 65 | all_targets.append((best_targets + i).cpu()) 66 | 67 | all_scores = torch.cat(all_scores, 1) 68 | all_targets = torch.cat(all_targets, 1) 69 | 70 | all_scores, best_targets = all_scores.topk(2, dim=1, largest=True, sorted=True) 71 | all_targets = all_targets.gather(1, best_targets) 72 | 73 | # contextual dissimilarity measure 74 | elif params.dico_method.startswith('csls_knn_'): 75 | 76 | knn = params.dico_method[len('csls_knn_'):] 77 | assert knn.isdigit() 78 | knn = int(knn) 79 | 80 | # average distances to k nearest neighbors 81 | average_dist1 = torch.from_numpy(get_nn_avg_dist(emb2, emb1, knn)) 82 | average_dist2 = torch.from_numpy(get_nn_avg_dist(emb1, emb2, knn)) 83 | average_dist1 = average_dist1.type_as(emb1) 84 | average_dist2 = average_dist2.type_as(emb2) 85 | 86 | # for every source word 87 | for i in range(0, n_src, bs): 88 | 89 | # compute target words scores 90 | scores = emb2.mm(emb1[i:min(n_src, i + bs)].transpose(0, 1)).transpose(0, 1) 91 | scores.mul_(2) 92 | scores.sub_(average_dist1[i:min(n_src, i + bs)][:, None] + average_dist2[None, :]) 93 | best_scores, best_targets = scores.topk(2, dim=1, largest=True, sorted=True) 94 | 95 | # update scores / potential targets 96 | all_scores.append(best_scores.cpu()) 97 | 
all_targets.append(best_targets.cpu()) 98 | 99 | all_scores = torch.cat(all_scores, 0) 100 | all_targets = torch.cat(all_targets, 0) 101 | 102 | all_pairs = torch.cat([ 103 | torch.arange(0, all_targets.size(0)).long().unsqueeze(1), 104 | all_targets[:, 0].unsqueeze(1) 105 | ], 1) 106 | 107 | # sanity check 108 | assert all_scores.size() == all_pairs.size() == (n_src, 2) 109 | 110 | # sort pairs by score confidence 111 | diff = all_scores[:, 0] - all_scores[:, 1] 112 | reordered = diff.sort(0, descending=True)[1] 113 | all_scores = all_scores[reordered] 114 | all_pairs = all_pairs[reordered] 115 | 116 | # max dico words rank 117 | if params.dico_max_rank > 0: 118 | selected = all_pairs.max(1)[0] <= params.dico_max_rank 119 | mask = selected.unsqueeze(1).expand_as(all_scores).clone() 120 | all_scores = all_scores.masked_select(mask).view(-1, 2) 121 | all_pairs = all_pairs.masked_select(mask).view(-1, 2) 122 | 123 | # max dico size 124 | if params.dico_max_size > 0: 125 | all_scores = all_scores[:params.dico_max_size] 126 | all_pairs = all_pairs[:params.dico_max_size] 127 | 128 | # min dico size 129 | diff = all_scores[:, 0] - all_scores[:, 1] 130 | if params.dico_min_size > 0: 131 | diff[:params.dico_min_size] = 1e9 132 | 133 | # confidence threshold 134 | if params.dico_threshold > 0: 135 | mask = diff > params.dico_threshold 136 | logger.info("Selected %i / %i pairs above the confidence threshold." % (mask.sum(), diff.size(0))) 137 | mask = mask.unsqueeze(1).expand_as(all_pairs).clone() 138 | all_pairs = all_pairs.masked_select(mask).view(-1, 2) 139 | 140 | return all_pairs 141 | 142 | 143 | def build_dictionary(src_emb, tgt_emb, params, s2t_candidates=None, t2s_candidates=None): 144 | """ 145 | Build a training dictionary given current embeddings / mapping. 146 | """ 147 | logger.info("Building the train dictionary ...") 148 | s2t = 'S2T' in params.dico_build 149 | t2s = 'T2S' in params.dico_build 150 | assert s2t or t2s 151 | 152 | if s2t: 153 | if s2t_candidates is None: 154 | s2t_candidates = get_candidates(src_emb, tgt_emb, params) 155 | if t2s: 156 | if t2s_candidates is None: 157 | t2s_candidates = get_candidates(tgt_emb, src_emb, params) 158 | t2s_candidates = torch.cat([t2s_candidates[:, 1:], t2s_candidates[:, :1]], 1) 159 | 160 | if params.dico_build == 'S2T': 161 | dico = s2t_candidates 162 | elif params.dico_build == 'T2S': 163 | dico = t2s_candidates 164 | else: 165 | s2t_candidates = set([(a, b) for a, b in s2t_candidates.numpy()]) 166 | t2s_candidates = set([(a, b) for a, b in t2s_candidates.numpy()]) 167 | if params.dico_build == 'S2T|T2S': 168 | final_pairs = s2t_candidates | t2s_candidates 169 | else: 170 | assert params.dico_build == 'S2T&T2S' 171 | final_pairs = s2t_candidates & t2s_candidates 172 | if len(final_pairs) == 0: 173 | logger.warning("Empty intersection ...") 174 | return None 175 | dico = torch.LongTensor(list([[int(a), int(b)] for (a, b) in final_pairs])) 176 | 177 | logger.info('New train dictionary of %i pairs.' % dico.size(0)) 178 | return dico.cuda() if params.cuda else dico 179 | -------------------------------------------------------------------------------- /src/dictionary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | from logging import getLogger 9 | 10 | 11 | logger = getLogger() 12 | 13 | 14 | class Dictionary(object): 15 | 16 | def __init__(self, id2word, word2id, lang): 17 | assert len(id2word) == len(word2id) 18 | self.id2word = id2word 19 | self.word2id = word2id 20 | self.lang = lang 21 | self.check_valid() 22 | 23 | def __len__(self): 24 | """ 25 | Returns the number of words in the dictionary. 26 | """ 27 | return len(self.id2word) 28 | 29 | def __getitem__(self, i): 30 | """ 31 | Returns the word of the specified index. 32 | """ 33 | return self.id2word[i] 34 | 35 | def __contains__(self, w): 36 | """ 37 | Returns whether a word is in the dictionary. 38 | """ 39 | return w in self.word2id 40 | 41 | def __eq__(self, y): 42 | """ 43 | Compare the dictionary with another one. 44 | """ 45 | self.check_valid() 46 | y.check_valid() 47 | if len(self.id2word) != len(y): 48 | return False 49 | return self.lang == y.lang and all(self.id2word[i] == y[i] for i in range(len(y))) 50 | 51 | def check_valid(self): 52 | """ 53 | Check that the dictionary is valid. 54 | """ 55 | assert len(self.id2word) == len(self.word2id) 56 | for i in range(len(self.id2word)): 57 | assert self.word2id[self.id2word[i]] == i 58 | 59 | def index(self, word): 60 | """ 61 | Returns the index of the specified word. 62 | """ 63 | return self.word2id[word] 64 | 65 | def prune(self, max_vocab): 66 | """ 67 | Limit the vocabulary size. 68 | """ 69 | assert max_vocab >= 1 70 | self.id2word = {k: v for k, v in self.id2word.items() if k < max_vocab} 71 | self.word2id = {v: k for k, v in self.id2word.items()} 72 | self.check_valid() 73 | -------------------------------------------------------------------------------- /src/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .wordsim import get_wordsim_scores, get_crosslingual_wordsim_scores, get_wordanalogy_scores 2 | from .word_translation import get_word_translation_accuracy 3 | from .sent_translation import get_sent_translation_accuracy, load_europarl_data 4 | from .evaluator import Evaluator 5 | -------------------------------------------------------------------------------- /src/evaluation/evaluator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | from logging import getLogger 9 | from copy import deepcopy 10 | import numpy as np 11 | from torch.autograd import Variable 12 | from torch import Tensor as torch_tensor 13 | 14 | from . import get_wordsim_scores, get_crosslingual_wordsim_scores, get_wordanalogy_scores 15 | from . import get_word_translation_accuracy 16 | from . import load_europarl_data, get_sent_translation_accuracy 17 | from ..dico_builder import get_candidates, build_dictionary 18 | from src.utils import get_idf 19 | 20 | 21 | logger = getLogger() 22 | 23 | 24 | class Evaluator(object): 25 | 26 | def __init__(self, trainer): 27 | """ 28 | Initialize evaluator. 29 | """ 30 | self.src_emb = trainer.src_emb 31 | self.tgt_emb = trainer.tgt_emb 32 | self.src_dico = trainer.src_dico 33 | self.tgt_dico = trainer.tgt_dico 34 | self.mapping = trainer.mapping 35 | self.discriminator = trainer.discriminator 36 | self.params = trainer.params 37 | 38 | def monolingual_wordsim(self, to_log): 39 | """ 40 | Evaluation on monolingual word similarity. 
41 | """ 42 | src_ws_scores = get_wordsim_scores( 43 | self.src_dico.lang, self.src_dico.word2id, 44 | self.mapping(self.src_emb.weight).data.cpu().numpy() 45 | ) 46 | tgt_ws_scores = get_wordsim_scores( 47 | self.tgt_dico.lang, self.tgt_dico.word2id, 48 | self.tgt_emb.weight.data.cpu().numpy() 49 | ) if self.params.tgt_lang else None 50 | if src_ws_scores is not None: 51 | src_ws_monolingual_scores = np.mean(list(src_ws_scores.values())) 52 | logger.info("Monolingual source word similarity score average: %.5f" % src_ws_monolingual_scores) 53 | to_log['src_ws_monolingual_scores'] = src_ws_monolingual_scores 54 | to_log.update({'src_' + k: v for k, v in src_ws_scores.items()}) 55 | if tgt_ws_scores is not None: 56 | tgt_ws_monolingual_scores = np.mean(list(tgt_ws_scores.values())) 57 | logger.info("Monolingual target word similarity score average: %.5f" % tgt_ws_monolingual_scores) 58 | to_log['tgt_ws_monolingual_scores'] = tgt_ws_monolingual_scores 59 | to_log.update({'tgt_' + k: v for k, v in tgt_ws_scores.items()}) 60 | if src_ws_scores is not None and tgt_ws_scores is not None: 61 | ws_monolingual_scores = (src_ws_monolingual_scores + tgt_ws_monolingual_scores) / 2 62 | logger.info("Monolingual word similarity score average: %.5f" % ws_monolingual_scores) 63 | to_log['ws_monolingual_scores'] = ws_monolingual_scores 64 | 65 | def monolingual_wordanalogy(self, to_log): 66 | """ 67 | Evaluation on monolingual word analogy. 68 | """ 69 | src_analogy_scores = get_wordanalogy_scores( 70 | self.src_dico.lang, self.src_dico.word2id, 71 | self.mapping(self.src_emb.weight).data.cpu().numpy() 72 | ) 73 | if self.params.tgt_lang: 74 | tgt_analogy_scores = get_wordanalogy_scores( 75 | self.tgt_dico.lang, self.tgt_dico.word2id, 76 | self.tgt_emb.weight.data.cpu().numpy() 77 | ) 78 | if src_analogy_scores is not None: 79 | src_analogy_monolingual_scores = np.mean(list(src_analogy_scores.values())) 80 | logger.info("Monolingual source word analogy score average: %.5f" % src_analogy_monolingual_scores) 81 | to_log['src_analogy_monolingual_scores'] = src_analogy_monolingual_scores 82 | to_log.update({'src_' + k: v for k, v in src_analogy_scores.items()}) 83 | if self.params.tgt_lang and tgt_analogy_scores is not None: 84 | tgt_analogy_monolingual_scores = np.mean(list(tgt_analogy_scores.values())) 85 | logger.info("Monolingual target word analogy score average: %.5f" % tgt_analogy_monolingual_scores) 86 | to_log['tgt_analogy_monolingual_scores'] = tgt_analogy_monolingual_scores 87 | to_log.update({'tgt_' + k: v for k, v in tgt_analogy_scores.items()}) 88 | 89 | def crosslingual_wordsim(self, to_log): 90 | """ 91 | Evaluation on cross-lingual word similarity. 92 | """ 93 | src_emb = self.mapping(self.src_emb.weight).data.cpu().numpy() 94 | tgt_emb = self.tgt_emb.weight.data.cpu().numpy() 95 | # cross-lingual wordsim evaluation 96 | src_tgt_ws_scores = get_crosslingual_wordsim_scores( 97 | self.src_dico.lang, self.src_dico.word2id, src_emb, 98 | self.tgt_dico.lang, self.tgt_dico.word2id, tgt_emb, 99 | ) 100 | if src_tgt_ws_scores is None: 101 | return 102 | ws_crosslingual_scores = np.mean(list(src_tgt_ws_scores.values())) 103 | logger.info("Cross-lingual word similarity score average: %.5f" % ws_crosslingual_scores) 104 | to_log['ws_crosslingual_scores'] = ws_crosslingual_scores 105 | to_log.update({'src_tgt_' + k: v for k, v in src_tgt_ws_scores.items()}) 106 | 107 | def word_translation(self, to_log): 108 | """ 109 | Evaluation on word translation. 
110 | """ 111 | # mapped word embeddings 112 | src_emb = self.mapping(self.src_emb.weight).data 113 | tgt_emb = self.tgt_emb.weight.data 114 | 115 | for method in ['nn', 'csls_knn_10']: 116 | results = get_word_translation_accuracy( 117 | self.src_dico.lang, self.src_dico.word2id, src_emb, 118 | self.tgt_dico.lang, self.tgt_dico.word2id, tgt_emb, 119 | method=method, 120 | dico_eval=self.params.dico_eval 121 | ) 122 | to_log.update([('%s-%s' % (k, method), v) for k, v in results]) 123 | 124 | def sent_translation(self, to_log): 125 | """ 126 | Evaluation on sentence translation. 127 | Only available on Europarl, for en - {de, es, fr, it} language pairs. 128 | """ 129 | lg1 = self.src_dico.lang 130 | lg2 = self.tgt_dico.lang 131 | 132 | # parameters 133 | n_keys = 200000 134 | n_queries = 2000 135 | n_idf = 300000 136 | 137 | # load europarl data 138 | if not hasattr(self, 'europarl_data'): 139 | self.europarl_data = load_europarl_data( 140 | lg1, lg2, n_max=(n_keys + 2 * n_idf) 141 | ) 142 | 143 | # if no Europarl data for this language pair 144 | if not self.europarl_data: 145 | return 146 | 147 | # mapped word embeddings 148 | src_emb = self.mapping(self.src_emb.weight).data 149 | tgt_emb = self.tgt_emb.weight.data 150 | 151 | # get idf weights 152 | idf = get_idf(self.europarl_data, lg1, lg2, n_idf=n_idf) 153 | 154 | for method in ['nn', 'csls_knn_10']: 155 | 156 | # source <- target sentence translation 157 | results = get_sent_translation_accuracy( 158 | self.europarl_data, 159 | self.src_dico.lang, self.src_dico.word2id, src_emb, 160 | self.tgt_dico.lang, self.tgt_dico.word2id, tgt_emb, 161 | n_keys=n_keys, n_queries=n_queries, 162 | method=method, idf=idf 163 | ) 164 | to_log.update([('tgt_to_src_%s-%s' % (k, method), v) for k, v in results]) 165 | 166 | # target <- source sentence translation 167 | results = get_sent_translation_accuracy( 168 | self.europarl_data, 169 | self.tgt_dico.lang, self.tgt_dico.word2id, tgt_emb, 170 | self.src_dico.lang, self.src_dico.word2id, src_emb, 171 | n_keys=n_keys, n_queries=n_queries, 172 | method=method, idf=idf 173 | ) 174 | to_log.update([('src_to_tgt_%s-%s' % (k, method), v) for k, v in results]) 175 | 176 | def dist_mean_cosine(self, to_log): 177 | """ 178 | Mean-cosine model selection criterion. 
179 | """ 180 | # get normalized embeddings 181 | src_emb = self.mapping(self.src_emb.weight).data 182 | tgt_emb = self.tgt_emb.weight.data 183 | src_emb = src_emb / src_emb.norm(2, 1, keepdim=True).expand_as(src_emb) 184 | tgt_emb = tgt_emb / tgt_emb.norm(2, 1, keepdim=True).expand_as(tgt_emb) 185 | 186 | # build dictionary 187 | for dico_method in ['nn', 'csls_knn_10']: 188 | dico_build = 'S2T' 189 | dico_max_size = 10000 190 | # temp params / dictionary generation 191 | _params = deepcopy(self.params) 192 | _params.dico_method = dico_method 193 | _params.dico_build = dico_build 194 | _params.dico_threshold = 0 195 | _params.dico_max_rank = 10000 196 | _params.dico_min_size = 0 197 | _params.dico_max_size = dico_max_size 198 | s2t_candidates = get_candidates(src_emb, tgt_emb, _params) 199 | t2s_candidates = get_candidates(tgt_emb, src_emb, _params) 200 | dico = build_dictionary(src_emb, tgt_emb, _params, s2t_candidates, t2s_candidates) 201 | # mean cosine 202 | if dico is None: 203 | mean_cosine = -1e9 204 | else: 205 | mean_cosine = (src_emb[dico[:dico_max_size, 0]] * tgt_emb[dico[:dico_max_size, 1]]).sum(1).mean() 206 | mean_cosine = mean_cosine.item() if isinstance(mean_cosine, torch_tensor) else mean_cosine 207 | logger.info("Mean cosine (%s method, %s build, %i max size): %.5f" 208 | % (dico_method, _params.dico_build, dico_max_size, mean_cosine)) 209 | to_log['mean_cosine-%s-%s-%i' % (dico_method, _params.dico_build, dico_max_size)] = mean_cosine 210 | 211 | def all_eval(self, to_log): 212 | """ 213 | Run all evaluations. 214 | """ 215 | self.monolingual_wordsim(to_log) 216 | self.crosslingual_wordsim(to_log) 217 | self.word_translation(to_log) 218 | self.sent_translation(to_log) 219 | self.dist_mean_cosine(to_log) 220 | 221 | def eval_dis(self, to_log): 222 | """ 223 | Evaluate discriminator predictions and accuracy. 224 | """ 225 | bs = 128 226 | src_preds = [] 227 | tgt_preds = [] 228 | 229 | self.discriminator.eval() 230 | 231 | for i in range(0, self.src_emb.num_embeddings, bs): 232 | emb = Variable(self.src_emb.weight[i:i + bs].data, volatile=True) 233 | preds = self.discriminator(self.mapping(emb)) 234 | src_preds.extend(preds.data.cpu().tolist()) 235 | 236 | for i in range(0, self.tgt_emb.num_embeddings, bs): 237 | emb = Variable(self.tgt_emb.weight[i:i + bs].data, volatile=True) 238 | preds = self.discriminator(emb) 239 | tgt_preds.extend(preds.data.cpu().tolist()) 240 | 241 | src_pred = np.mean(src_preds) 242 | tgt_pred = np.mean(tgt_preds) 243 | logger.info("Discriminator source / target predictions: %.5f / %.5f" 244 | % (src_pred, tgt_pred)) 245 | 246 | src_accu = np.mean([x >= 0.5 for x in src_preds]) 247 | tgt_accu = np.mean([x < 0.5 for x in tgt_preds]) 248 | dis_accu = ((src_accu * self.src_emb.num_embeddings + tgt_accu * self.tgt_emb.num_embeddings) / 249 | (self.src_emb.num_embeddings + self.tgt_emb.num_embeddings)) 250 | logger.info("Discriminator source / target / global accuracy: %.5f / %.5f / %.5f" 251 | % (src_accu, tgt_accu, dis_accu)) 252 | 253 | to_log['dis_accu'] = dis_accu 254 | to_log['dis_src_pred'] = src_pred 255 | to_log['dis_tgt_pred'] = tgt_pred 256 | -------------------------------------------------------------------------------- /src/evaluation/sent_translation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import os 9 | import io 10 | from logging import getLogger 11 | import numpy as np 12 | import torch 13 | 14 | from src.utils import bow_idf, get_nn_avg_dist 15 | 16 | 17 | EUROPARL_DIR = 'data/crosslingual/europarl' 18 | 19 | 20 | logger = getLogger() 21 | 22 | 23 | def load_europarl_data(lg1, lg2, n_max=1e10, lower=True): 24 | """ 25 | Load data parallel sentences 26 | """ 27 | if not (os.path.isfile(os.path.join(EUROPARL_DIR, 'europarl-v7.%s-%s.%s' % (lg1, lg2, lg1))) or 28 | os.path.isfile(os.path.join(EUROPARL_DIR, 'europarl-v7.%s-%s.%s' % (lg2, lg1, lg1)))): 29 | return None 30 | 31 | if os.path.isfile(os.path.join(EUROPARL_DIR, 'europarl-v7.%s-%s.%s' % (lg2, lg1, lg1))): 32 | lg1, lg2 = lg2, lg1 33 | 34 | # load sentences 35 | data = {lg1: [], lg2: []} 36 | for lg in [lg1, lg2]: 37 | fname = os.path.join(EUROPARL_DIR, 'europarl-v7.%s-%s.%s' % (lg1, lg2, lg)) 38 | 39 | with io.open(fname, 'r', encoding='utf-8') as f: 40 | for i, line in enumerate(f): 41 | if i >= n_max: 42 | break 43 | line = line.lower() if lower else line 44 | data[lg].append(line.rstrip().split()) 45 | 46 | # get only unique sentences for each language 47 | assert len(data[lg1]) == len(data[lg2]) 48 | data[lg1] = np.array(data[lg1]) 49 | data[lg2] = np.array(data[lg2]) 50 | data[lg1], indices = np.unique(data[lg1], return_index=True) 51 | data[lg2] = data[lg2][indices] 52 | data[lg2], indices = np.unique(data[lg2], return_index=True) 53 | data[lg1] = data[lg1][indices] 54 | 55 | # shuffle sentences 56 | rng = np.random.RandomState(1234) 57 | perm = rng.permutation(len(data[lg1])) 58 | data[lg1] = data[lg1][perm] 59 | data[lg2] = data[lg2][perm] 60 | 61 | logger.info("Loaded europarl %s-%s (%i sentences)." % (lg1, lg2, len(data[lg1]))) 62 | return data 63 | 64 | 65 | def get_sent_translation_accuracy(data, lg1, word2id1, emb1, lg2, word2id2, emb2, 66 | n_keys, n_queries, method, idf): 67 | 68 | """ 69 | Given parallel sentences from Europarl, evaluate the 70 | sentence translation accuracy using the precision@k. 
71 | """ 72 | # get word vectors dictionaries 73 | emb1 = emb1.cpu().numpy() 74 | emb2 = emb2.cpu().numpy() 75 | word_vec1 = dict([(w, emb1[word2id1[w]]) for w in word2id1]) 76 | word_vec2 = dict([(w, emb2[word2id2[w]]) for w in word2id2]) 77 | word_vect = {lg1: word_vec1, lg2: word_vec2} 78 | lg_keys = lg2 79 | lg_query = lg1 80 | 81 | # get n_keys pairs of sentences 82 | keys = data[lg_keys][:n_keys] 83 | keys = bow_idf(keys, word_vect[lg_keys], idf_dict=idf[lg_keys]) 84 | 85 | # get n_queries query pairs from these n_keys pairs 86 | rng = np.random.RandomState(1234) 87 | idx_query = rng.choice(range(n_keys), size=n_queries, replace=False) 88 | queries = data[lg_query][idx_query] 89 | queries = bow_idf(queries, word_vect[lg_query], idf_dict=idf[lg_query]) 90 | 91 | # normalize embeddings 92 | queries = torch.from_numpy(queries).float() 93 | queries = queries / queries.norm(2, 1, keepdim=True).expand_as(queries) 94 | keys = torch.from_numpy(keys).float() 95 | keys = keys / keys.norm(2, 1, keepdim=True).expand_as(keys) 96 | 97 | # nearest neighbors 98 | if method == 'nn': 99 | scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1) 100 | scores = scores.cpu() 101 | 102 | # inverted softmax 103 | elif method.startswith('invsm_beta_'): 104 | beta = float(method[len('invsm_beta_'):]) 105 | scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1) 106 | scores.mul_(beta).exp_() 107 | scores.div_(scores.sum(0, keepdim=True).expand_as(scores)) 108 | scores = scores.cpu() 109 | 110 | # contextual dissimilarity measure 111 | elif method.startswith('csls_knn_'): 112 | knn = method[len('csls_knn_'):] 113 | assert knn.isdigit() 114 | knn = int(knn) 115 | # average distances to k nearest neighbors 116 | knn = method[len('csls_knn_'):] 117 | assert knn.isdigit() 118 | knn = int(knn) 119 | average_dist_keys = torch.from_numpy(get_nn_avg_dist(queries, keys, knn)) 120 | average_dist_queries = torch.from_numpy(get_nn_avg_dist(keys, queries, knn)) 121 | # scores 122 | scores = keys.mm(queries.transpose(0, 1)).transpose(0, 1) 123 | scores.mul_(2) 124 | scores.sub_(average_dist_queries[:, None].float() + average_dist_keys[None, :].float()) 125 | scores = scores.cpu() 126 | 127 | results = [] 128 | top_matches = scores.topk(10, 1, True)[1] 129 | for k in [1, 5, 10]: 130 | top_k_matches = (top_matches[:, :k] == torch.from_numpy(idx_query)[:, None]).sum(1) 131 | precision_at_k = 100 * top_k_matches.float().numpy().mean() 132 | logger.info("%i queries (%s) - %s - Precision at k = %i: %f" % 133 | (len(top_k_matches), lg_query.upper(), method, k, precision_at_k)) 134 | results.append(('sent-precision_at_%i' % k, precision_at_k)) 135 | 136 | return results 137 | -------------------------------------------------------------------------------- /src/evaluation/word_translation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | import os 9 | import io 10 | from logging import getLogger 11 | import numpy as np 12 | import torch 13 | 14 | from ..utils import get_nn_avg_dist 15 | 16 | 17 | DIC_EVAL_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', 'data', 'crosslingual', 'dictionaries') 18 | 19 | 20 | logger = getLogger() 21 | 22 | 23 | def load_identical_char_dico(word2id1, word2id2): 24 | """ 25 | Build a dictionary of identical character strings. 26 | """ 27 | pairs = [(w1, w1) for w1 in word2id1.keys() if w1 in word2id2] 28 | if len(pairs) == 0: 29 | raise Exception("No identical character strings were found. " 30 | "Please specify a dictionary.") 31 | 32 | logger.info("Found %i pairs of identical character strings." % len(pairs)) 33 | 34 | # sort the dictionary by source word frequencies 35 | pairs = sorted(pairs, key=lambda x: word2id1[x[0]]) 36 | dico = torch.LongTensor(len(pairs), 2) 37 | for i, (word1, word2) in enumerate(pairs): 38 | dico[i, 0] = word2id1[word1] 39 | dico[i, 1] = word2id2[word2] 40 | 41 | return dico 42 | 43 | 44 | def load_dictionary(path, word2id1, word2id2): 45 | """ 46 | Return a torch tensor of size (n, 2) where n is the size of the 47 | loader dictionary, and sort it by source word frequency. 48 | """ 49 | assert os.path.isfile(path) 50 | 51 | pairs = [] 52 | not_found = 0 53 | not_found1 = 0 54 | not_found2 = 0 55 | 56 | with io.open(path, 'r', encoding='utf-8') as f: 57 | for index, line in enumerate(f): 58 | assert line == line.lower() 59 | parts = line.rstrip().split() 60 | if len(parts) < 2: 61 | logger.warning("Could not parse line %s (%i)", line, index) 62 | continue 63 | word1, word2 = parts 64 | if word1 in word2id1 and word2 in word2id2: 65 | pairs.append((word1, word2)) 66 | else: 67 | not_found += 1 68 | not_found1 += int(word1 not in word2id1) 69 | not_found2 += int(word2 not in word2id2) 70 | 71 | logger.info("Found %i pairs of words in the dictionary (%i unique). " 72 | "%i other pairs contained at least one unknown word " 73 | "(%i in lang1, %i in lang2)" 74 | % (len(pairs), len(set([x for x, _ in pairs])), 75 | not_found, not_found1, not_found2)) 76 | 77 | # sort the dictionary by source word frequencies 78 | pairs = sorted(pairs, key=lambda x: word2id1[x[0]]) 79 | dico = torch.LongTensor(len(pairs), 2) 80 | for i, (word1, word2) in enumerate(pairs): 81 | dico[i, 0] = word2id1[word1] 82 | dico[i, 1] = word2id2[word2] 83 | 84 | return dico 85 | 86 | 87 | def get_word_translation_accuracy(lang1, word2id1, emb1, lang2, word2id2, emb2, method, dico_eval): 88 | """ 89 | Given source and target word embeddings, and a dictionary, 90 | evaluate the translation accuracy using the precision@k. 
91 | """ 92 | if dico_eval == 'default': 93 | path = os.path.join(DIC_EVAL_PATH, '%s-%s.5000-6500.txt' % (lang1, lang2)) 94 | else: 95 | path = dico_eval 96 | dico = load_dictionary(path, word2id1, word2id2) 97 | dico = dico.cuda() if emb1.is_cuda else dico 98 | 99 | assert dico[:, 0].max() < emb1.size(0) 100 | assert dico[:, 1].max() < emb2.size(0) 101 | 102 | # normalize word embeddings 103 | emb1 = emb1 / emb1.norm(2, 1, keepdim=True).expand_as(emb1) 104 | emb2 = emb2 / emb2.norm(2, 1, keepdim=True).expand_as(emb2) 105 | 106 | # nearest neighbors 107 | if method == 'nn': 108 | query = emb1[dico[:, 0]] 109 | scores = query.mm(emb2.transpose(0, 1)) 110 | 111 | # inverted softmax 112 | elif method.startswith('invsm_beta_'): 113 | beta = float(method[len('invsm_beta_'):]) 114 | bs = 128 115 | word_scores = [] 116 | for i in range(0, emb2.size(0), bs): 117 | scores = emb1.mm(emb2[i:i + bs].transpose(0, 1)) 118 | scores.mul_(beta).exp_() 119 | scores.div_(scores.sum(0, keepdim=True).expand_as(scores)) 120 | word_scores.append(scores.index_select(0, dico[:, 0])) 121 | scores = torch.cat(word_scores, 1) 122 | 123 | # contextual dissimilarity measure 124 | elif method.startswith('csls_knn_'): 125 | # average distances to k nearest neighbors 126 | knn = method[len('csls_knn_'):] 127 | assert knn.isdigit() 128 | knn = int(knn) 129 | average_dist1 = get_nn_avg_dist(emb2, emb1, knn) 130 | average_dist2 = get_nn_avg_dist(emb1, emb2, knn) 131 | average_dist1 = torch.from_numpy(average_dist1).type_as(emb1) 132 | average_dist2 = torch.from_numpy(average_dist2).type_as(emb2) 133 | # queries / scores 134 | query = emb1[dico[:, 0]] 135 | scores = query.mm(emb2.transpose(0, 1)) 136 | scores.mul_(2) 137 | scores.sub_(average_dist1[dico[:, 0]][:, None]) 138 | scores.sub_(average_dist2[None, :]) 139 | 140 | else: 141 | raise Exception('Unknown method: "%s"' % method) 142 | 143 | results = [] 144 | top_matches = scores.topk(10, 1, True)[1] 145 | for k in [1, 5, 10]: 146 | top_k_matches = top_matches[:, :k] 147 | _matching = (top_k_matches == dico[:, 1][:, None].expand_as(top_k_matches)).sum(1).cpu().numpy() 148 | # allow for multiple possible translations 149 | matching = {} 150 | for i, src_id in enumerate(dico[:, 0].cpu().numpy()): 151 | matching[src_id] = min(matching.get(src_id, 0) + _matching[i], 1) 152 | # evaluate precision@k 153 | precision_at_k = 100 * np.mean(list(matching.values())) 154 | logger.info("%i source words - %s - Precision at k = %i: %f" % 155 | (len(matching), method, k, precision_at_k)) 156 | results.append(('precision_at_%i' % k, precision_at_k)) 157 | 158 | return results 159 | -------------------------------------------------------------------------------- /src/evaluation/wordsim.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import os 9 | import io 10 | from logging import getLogger 11 | import numpy as np 12 | import torch 13 | from scipy.stats import spearmanr 14 | 15 | 16 | MONOLINGUAL_EVAL_PATH = 'data/monolingual' 17 | SEMEVAL17_EVAL_PATH = 'data/crosslingual/wordsim' 18 | 19 | 20 | logger = getLogger() 21 | 22 | 23 | def get_word_pairs(path, lower=True): 24 | """ 25 | Return a list of (word1, word2, score) tuples from a word similarity file. 
26 | """ 27 | assert os.path.isfile(path) and type(lower) is bool 28 | word_pairs = [] 29 | with io.open(path, 'r', encoding='utf-8') as f: 30 | for line in f: 31 | line = line.rstrip() 32 | line = line.lower() if lower else line 33 | line = line.split() 34 | # ignore phrases, only consider words 35 | if len(line) != 3: 36 | assert len(line) > 3 37 | assert 'SEMEVAL17' in os.path.basename(path) or 'EN-IT_MWS353' in path 38 | continue 39 | word_pairs.append((line[0], line[1], float(line[2]))) 40 | return word_pairs 41 | 42 | 43 | def get_word_id(word, word2id, lower): 44 | """ 45 | Get a word ID. 46 | If the model does not use lowercase and the evaluation file is lowercased, 47 | we might be able to find an associated word. 48 | """ 49 | assert type(lower) is bool 50 | word_id = word2id.get(word) 51 | if word_id is None and not lower: 52 | word_id = word2id.get(word.capitalize()) 53 | if word_id is None and not lower: 54 | word_id = word2id.get(word.title()) 55 | return word_id 56 | 57 | 58 | def get_spearman_rho(word2id1, embeddings1, path, lower, 59 | word2id2=None, embeddings2=None): 60 | """ 61 | Compute monolingual or cross-lingual word similarity score. 62 | """ 63 | assert not ((word2id2 is None) ^ (embeddings2 is None)) 64 | word2id2 = word2id1 if word2id2 is None else word2id2 65 | embeddings2 = embeddings1 if embeddings2 is None else embeddings2 66 | assert len(word2id1) == embeddings1.shape[0] 67 | assert len(word2id2) == embeddings2.shape[0] 68 | assert type(lower) is bool 69 | word_pairs = get_word_pairs(path) 70 | not_found = 0 71 | pred = [] 72 | gold = [] 73 | for word1, word2, similarity in word_pairs: 74 | id1 = get_word_id(word1, word2id1, lower) 75 | id2 = get_word_id(word2, word2id2, lower) 76 | if id1 is None or id2 is None: 77 | not_found += 1 78 | continue 79 | u = embeddings1[id1] 80 | v = embeddings2[id2] 81 | score = u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v)) 82 | gold.append(similarity) 83 | pred.append(score) 84 | return spearmanr(gold, pred).correlation, len(gold), not_found 85 | 86 | 87 | def get_wordsim_scores(language, word2id, embeddings, lower=True): 88 | """ 89 | Return monolingual word similarity scores. 
90 | """ 91 | dirpath = os.path.join(MONOLINGUAL_EVAL_PATH, language) 92 | if not os.path.isdir(dirpath): 93 | return None 94 | 95 | scores = {} 96 | separator = "=" * (30 + 1 + 10 + 1 + 13 + 1 + 12) 97 | pattern = "%30s %10s %13s %12s" 98 | logger.info(separator) 99 | logger.info(pattern % ("Dataset", "Found", "Not found", "Rho")) 100 | logger.info(separator) 101 | 102 | for filename in list(os.listdir(dirpath)): 103 | if filename.startswith('%s_' % (language.upper())): 104 | filepath = os.path.join(dirpath, filename) 105 | coeff, found, not_found = get_spearman_rho(word2id, embeddings, filepath, lower) 106 | logger.info(pattern % (filename[:-4], str(found), str(not_found), "%.4f" % coeff)) 107 | scores[filename[:-4]] = coeff 108 | logger.info(separator) 109 | 110 | return scores 111 | 112 | 113 | def get_wordanalogy_scores(language, word2id, embeddings, lower=True): 114 | """ 115 | Return (english) word analogy score 116 | """ 117 | dirpath = os.path.join(MONOLINGUAL_EVAL_PATH, language) 118 | if not os.path.isdir(dirpath) or language not in ["en"]: 119 | return None 120 | 121 | # normalize word embeddings 122 | embeddings = embeddings / np.sqrt((embeddings ** 2).sum(1))[:, None] 123 | 124 | # scores by category 125 | scores = {} 126 | 127 | word_ids = {} 128 | queries = {} 129 | 130 | with io.open(os.path.join(dirpath, 'questions-words.txt'), 'r', encoding='utf-8') as f: 131 | for line in f: 132 | # new line 133 | line = line.rstrip() 134 | if lower: 135 | line = line.lower() 136 | 137 | # new category 138 | if ":" in line: 139 | assert line[1] == ' ' 140 | category = line[2:] 141 | assert category not in scores 142 | scores[category] = {'n_found': 0, 'n_not_found': 0, 'n_correct': 0} 143 | word_ids[category] = [] 144 | queries[category] = [] 145 | continue 146 | 147 | # get word IDs 148 | assert len(line.split()) == 4, line 149 | word1, word2, word3, word4 = line.split() 150 | word_id1 = get_word_id(word1, word2id, lower) 151 | word_id2 = get_word_id(word2, word2id, lower) 152 | word_id3 = get_word_id(word3, word2id, lower) 153 | word_id4 = get_word_id(word4, word2id, lower) 154 | 155 | # if at least one word is not found 156 | if any(x is None for x in [word_id1, word_id2, word_id3, word_id4]): 157 | scores[category]['n_not_found'] += 1 158 | continue 159 | else: 160 | scores[category]['n_found'] += 1 161 | word_ids[category].append([word_id1, word_id2, word_id3, word_id4]) 162 | # generate query vector and get nearest neighbors 163 | query = embeddings[word_id1] - embeddings[word_id2] + embeddings[word_id4] 164 | query = query / np.linalg.norm(query) 165 | 166 | queries[category].append(query) 167 | 168 | # Compute score for each category 169 | for cat in queries: 170 | qs = torch.from_numpy(np.vstack(queries[cat])) 171 | keys = torch.from_numpy(embeddings.T) 172 | values = qs.mm(keys).cpu().numpy() 173 | 174 | # be sure we do not select input words 175 | for i, ws in enumerate(word_ids[cat]): 176 | for wid in [ws[0], ws[1], ws[3]]: 177 | values[i, wid] = -1e9 178 | scores[cat]['n_correct'] = np.sum(values.argmax(axis=1) == [ws[2] for ws in word_ids[cat]]) 179 | 180 | # pretty print 181 | separator = "=" * (30 + 1 + 10 + 1 + 13 + 1 + 12) 182 | pattern = "%30s %10s %13s %12s" 183 | logger.info(separator) 184 | logger.info(pattern % ("Category", "Found", "Not found", "Accuracy")) 185 | logger.info(separator) 186 | 187 | # compute and log accuracies 188 | accuracies = {} 189 | for k in sorted(scores.keys()): 190 | v = scores[k] 191 | accuracies[k] = float(v['n_correct']) / 
max(v['n_found'], 1) 192 | logger.info(pattern % (k, str(v['n_found']), str(v['n_not_found']), "%.4f" % accuracies[k])) 193 | logger.info(separator) 194 | 195 | return accuracies 196 | 197 | 198 | def get_crosslingual_wordsim_scores(lang1, word2id1, embeddings1, 199 | lang2, word2id2, embeddings2, lower=True): 200 | """ 201 | Return cross-lingual word similarity scores. 202 | """ 203 | f1 = os.path.join(SEMEVAL17_EVAL_PATH, '%s-%s-SEMEVAL17.txt' % (lang1, lang2)) 204 | f2 = os.path.join(SEMEVAL17_EVAL_PATH, '%s-%s-SEMEVAL17.txt' % (lang2, lang1)) 205 | if not (os.path.exists(f1) or os.path.exists(f2)): 206 | return None 207 | 208 | if os.path.exists(f1): 209 | coeff, found, not_found = get_spearman_rho( 210 | word2id1, embeddings1, f1, 211 | lower, word2id2, embeddings2 212 | ) 213 | elif os.path.exists(f2): 214 | coeff, found, not_found = get_spearman_rho( 215 | word2id2, embeddings2, f2, 216 | lower, word2id1, embeddings1 217 | ) 218 | 219 | scores = {} 220 | separator = "=" * (30 + 1 + 10 + 1 + 13 + 1 + 12) 221 | pattern = "%30s %10s %13s %12s" 222 | logger.info(separator) 223 | logger.info(pattern % ("Dataset", "Found", "Not found", "Rho")) 224 | logger.info(separator) 225 | 226 | task_name = '%s_%s_SEMEVAL17' % (lang1.upper(), lang2.upper()) 227 | logger.info(pattern % (task_name, str(found), str(not_found), "%.4f" % coeff)) 228 | scores[task_name] = coeff 229 | if not scores: 230 | return None 231 | logger.info(separator) 232 | 233 | return scores 234 | -------------------------------------------------------------------------------- /src/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import logging 9 | import time 10 | from datetime import timedelta 11 | 12 | 13 | class LogFormatter(): 14 | 15 | def __init__(self): 16 | self.start_time = time.time() 17 | 18 | def format(self, record): 19 | elapsed_seconds = round(record.created - self.start_time) 20 | 21 | prefix = "%s - %s - %s" % ( 22 | record.levelname, 23 | time.strftime('%x %X'), 24 | timedelta(seconds=elapsed_seconds) 25 | ) 26 | message = record.getMessage() 27 | message = message.replace('\n', '\n' + ' ' * (len(prefix) + 3)) 28 | return "%s - %s" % (prefix, message) 29 | 30 | 31 | def create_logger(filepath, vb=2): 32 | """ 33 | Create a logger. 
34 | """ 35 | # create log formatter 36 | log_formatter = LogFormatter() 37 | 38 | # create file handler and set level to debug 39 | file_handler = logging.FileHandler(filepath, "a") 40 | file_handler.setLevel(logging.DEBUG) 41 | file_handler.setFormatter(log_formatter) 42 | 43 | # create console handler and set level to info 44 | log_level = logging.DEBUG if vb == 2 else logging.INFO if vb == 1 else logging.WARNING 45 | console_handler = logging.StreamHandler() 46 | console_handler.setLevel(log_level) 47 | console_handler.setFormatter(log_formatter) 48 | 49 | # create logger and set level to debug 50 | logger = logging.getLogger() 51 | logger.handlers = [] 52 | logger.setLevel(logging.DEBUG) 53 | logger.propagate = False 54 | logger.addHandler(file_handler) 55 | logger.addHandler(console_handler) 56 | 57 | # reset logger elapsed time 58 | def reset_time(): 59 | log_formatter.start_time = time.time() 60 | logger.reset_time = reset_time 61 | 62 | return logger 63 | -------------------------------------------------------------------------------- /src/models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import torch 9 | from torch import nn 10 | 11 | from .utils import load_embeddings, normalize_embeddings 12 | 13 | 14 | class Discriminator(nn.Module): 15 | 16 | def __init__(self, params): 17 | super(Discriminator, self).__init__() 18 | 19 | self.emb_dim = params.emb_dim 20 | self.dis_layers = params.dis_layers 21 | self.dis_hid_dim = params.dis_hid_dim 22 | self.dis_dropout = params.dis_dropout 23 | self.dis_input_dropout = params.dis_input_dropout 24 | 25 | layers = [nn.Dropout(self.dis_input_dropout)] 26 | for i in range(self.dis_layers + 1): 27 | input_dim = self.emb_dim if i == 0 else self.dis_hid_dim 28 | output_dim = 1 if i == self.dis_layers else self.dis_hid_dim 29 | layers.append(nn.Linear(input_dim, output_dim)) 30 | if i < self.dis_layers: 31 | layers.append(nn.LeakyReLU(0.2)) 32 | layers.append(nn.Dropout(self.dis_dropout)) 33 | layers.append(nn.Sigmoid()) 34 | self.layers = nn.Sequential(*layers) 35 | 36 | def forward(self, x): 37 | assert x.dim() == 2 and x.size(1) == self.emb_dim 38 | return self.layers(x).view(-1) 39 | 40 | 41 | def build_model(params, with_dis): 42 | """ 43 | Build all components of the model. 
44 | """ 45 | # source embeddings 46 | src_dico, _src_emb = load_embeddings(params, source=True) 47 | params.src_dico = src_dico 48 | src_emb = nn.Embedding(len(src_dico), params.emb_dim, sparse=True) 49 | src_emb.weight.data.copy_(_src_emb) 50 | 51 | # target embeddings 52 | if params.tgt_lang: 53 | tgt_dico, _tgt_emb = load_embeddings(params, source=False) 54 | params.tgt_dico = tgt_dico 55 | tgt_emb = nn.Embedding(len(tgt_dico), params.emb_dim, sparse=True) 56 | tgt_emb.weight.data.copy_(_tgt_emb) 57 | else: 58 | tgt_emb = None 59 | 60 | # mapping 61 | mapping = nn.Linear(params.emb_dim, params.emb_dim, bias=False) 62 | if getattr(params, 'map_id_init', True): 63 | mapping.weight.data.copy_(torch.diag(torch.ones(params.emb_dim))) 64 | 65 | # discriminator 66 | discriminator = Discriminator(params) if with_dis else None 67 | 68 | # cuda 69 | if params.cuda: 70 | src_emb.cuda() 71 | if params.tgt_lang: 72 | tgt_emb.cuda() 73 | mapping.cuda() 74 | if with_dis: 75 | discriminator.cuda() 76 | 77 | # normalize embeddings 78 | params.src_mean = normalize_embeddings(src_emb.weight.data, params.normalize_embeddings) 79 | if params.tgt_lang: 80 | params.tgt_mean = normalize_embeddings(tgt_emb.weight.data, params.normalize_embeddings) 81 | 82 | return src_emb, tgt_emb, mapping, discriminator 83 | -------------------------------------------------------------------------------- /src/trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import os 9 | from logging import getLogger 10 | import scipy 11 | import scipy.linalg 12 | import torch 13 | from torch.autograd import Variable 14 | from torch.nn import functional as F 15 | 16 | from .utils import get_optimizer, load_embeddings, normalize_embeddings, export_embeddings 17 | from .utils import clip_parameters 18 | from .dico_builder import build_dictionary 19 | from .evaluation.word_translation import DIC_EVAL_PATH, load_identical_char_dico, load_dictionary 20 | 21 | 22 | logger = getLogger() 23 | 24 | 25 | class Trainer(object): 26 | 27 | def __init__(self, src_emb, tgt_emb, mapping, discriminator, params): 28 | """ 29 | Initialize trainer script. 30 | """ 31 | self.src_emb = src_emb 32 | self.tgt_emb = tgt_emb 33 | self.src_dico = params.src_dico 34 | self.tgt_dico = getattr(params, 'tgt_dico', None) 35 | self.mapping = mapping 36 | self.discriminator = discriminator 37 | self.params = params 38 | 39 | # optimizers 40 | if hasattr(params, 'map_optimizer'): 41 | optim_fn, optim_params = get_optimizer(params.map_optimizer) 42 | self.map_optimizer = optim_fn(mapping.parameters(), **optim_params) 43 | if hasattr(params, 'dis_optimizer'): 44 | optim_fn, optim_params = get_optimizer(params.dis_optimizer) 45 | self.dis_optimizer = optim_fn(discriminator.parameters(), **optim_params) 46 | else: 47 | assert discriminator is None 48 | 49 | # best validation score 50 | self.best_valid_metric = -1e12 51 | 52 | self.decrease_lr = False 53 | 54 | def get_dis_xy(self, volatile): 55 | """ 56 | Get discriminator input batch / output target. 
57 | """ 58 | # select random word IDs 59 | bs = self.params.batch_size 60 | mf = self.params.dis_most_frequent 61 | assert mf <= min(len(self.src_dico), len(self.tgt_dico)) 62 | src_ids = torch.LongTensor(bs).random_(len(self.src_dico) if mf == 0 else mf) 63 | tgt_ids = torch.LongTensor(bs).random_(len(self.tgt_dico) if mf == 0 else mf) 64 | if self.params.cuda: 65 | src_ids = src_ids.cuda() 66 | tgt_ids = tgt_ids.cuda() 67 | 68 | # get word embeddings 69 | src_emb = self.src_emb(Variable(src_ids, volatile=True)) 70 | tgt_emb = self.tgt_emb(Variable(tgt_ids, volatile=True)) 71 | src_emb = self.mapping(Variable(src_emb.data, volatile=volatile)) 72 | tgt_emb = Variable(tgt_emb.data, volatile=volatile) 73 | 74 | # input / target 75 | x = torch.cat([src_emb, tgt_emb], 0) 76 | y = torch.FloatTensor(2 * bs).zero_() 77 | y[:bs] = 1 - self.params.dis_smooth 78 | y[bs:] = self.params.dis_smooth 79 | y = Variable(y.cuda() if self.params.cuda else y) 80 | 81 | return x, y 82 | 83 | def dis_step(self, stats): 84 | """ 85 | Train the discriminator. 86 | """ 87 | self.discriminator.train() 88 | 89 | # loss 90 | x, y = self.get_dis_xy(volatile=True) 91 | preds = self.discriminator(Variable(x.data)) 92 | loss = F.binary_cross_entropy(preds, y) 93 | stats['DIS_COSTS'].append(loss.data.item()) 94 | 95 | # check NaN 96 | if (loss != loss).data.any(): 97 | logger.error("NaN detected (discriminator)") 98 | exit() 99 | 100 | # optim 101 | self.dis_optimizer.zero_grad() 102 | loss.backward() 103 | self.dis_optimizer.step() 104 | clip_parameters(self.discriminator, self.params.dis_clip_weights) 105 | 106 | def mapping_step(self, stats): 107 | """ 108 | Fooling discriminator training step. 109 | """ 110 | if self.params.dis_lambda == 0: 111 | return 0 112 | 113 | self.discriminator.eval() 114 | 115 | # loss 116 | x, y = self.get_dis_xy(volatile=False) 117 | preds = self.discriminator(x) 118 | loss = F.binary_cross_entropy(preds, 1 - y) 119 | loss = self.params.dis_lambda * loss 120 | 121 | # check NaN 122 | if (loss != loss).data.any(): 123 | logger.error("NaN detected (fool discriminator)") 124 | exit() 125 | 126 | # optim 127 | self.map_optimizer.zero_grad() 128 | loss.backward() 129 | self.map_optimizer.step() 130 | self.orthogonalize() 131 | 132 | return 2 * self.params.batch_size 133 | 134 | def load_training_dico(self, dico_train): 135 | """ 136 | Load training dictionary. 137 | """ 138 | word2id1 = self.src_dico.word2id 139 | word2id2 = self.tgt_dico.word2id 140 | 141 | # identical character strings 142 | if dico_train == "identical_char": 143 | self.dico = load_identical_char_dico(word2id1, word2id2) 144 | # use one of the provided dictionary 145 | elif dico_train == "default": 146 | filename = '%s-%s.0-5000.txt' % (self.params.src_lang, self.params.tgt_lang) 147 | self.dico = load_dictionary( 148 | os.path.join(DIC_EVAL_PATH, filename), 149 | word2id1, word2id2 150 | ) 151 | # dictionary provided by the user 152 | else: 153 | self.dico = load_dictionary(dico_train, word2id1, word2id2) 154 | 155 | # cuda 156 | if self.params.cuda: 157 | self.dico = self.dico.cuda() 158 | 159 | def build_dictionary(self): 160 | """ 161 | Build a dictionary from aligned embeddings. 
162 | """ 163 | src_emb = self.mapping(self.src_emb.weight).data 164 | tgt_emb = self.tgt_emb.weight.data 165 | src_emb = src_emb / src_emb.norm(2, 1, keepdim=True).expand_as(src_emb) 166 | tgt_emb = tgt_emb / tgt_emb.norm(2, 1, keepdim=True).expand_as(tgt_emb) 167 | self.dico = build_dictionary(src_emb, tgt_emb, self.params) 168 | 169 | def procrustes(self): 170 | """ 171 | Find the best orthogonal matrix mapping using the Orthogonal Procrustes problem 172 | https://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem 173 | """ 174 | A = self.src_emb.weight.data[self.dico[:, 0]] 175 | B = self.tgt_emb.weight.data[self.dico[:, 1]] 176 | W = self.mapping.weight.data 177 | M = B.transpose(0, 1).mm(A).cpu().numpy() 178 | U, S, V_t = scipy.linalg.svd(M, full_matrices=True) 179 | W.copy_(torch.from_numpy(U.dot(V_t)).type_as(W)) 180 | 181 | def orthogonalize(self): 182 | """ 183 | Orthogonalize the mapping. 184 | """ 185 | if self.params.map_beta > 0: 186 | W = self.mapping.weight.data 187 | beta = self.params.map_beta 188 | W.copy_((1 + beta) * W - beta * W.mm(W.transpose(0, 1).mm(W))) 189 | 190 | def update_lr(self, to_log, metric): 191 | """ 192 | Update learning rate when using SGD. 193 | """ 194 | if 'sgd' not in self.params.map_optimizer: 195 | return 196 | old_lr = self.map_optimizer.param_groups[0]['lr'] 197 | new_lr = max(self.params.min_lr, old_lr * self.params.lr_decay) 198 | if new_lr < old_lr: 199 | logger.info("Decreasing learning rate: %.8f -> %.8f" % (old_lr, new_lr)) 200 | self.map_optimizer.param_groups[0]['lr'] = new_lr 201 | 202 | if self.params.lr_shrink < 1 and to_log[metric] >= -1e7: 203 | if to_log[metric] < self.best_valid_metric: 204 | logger.info("Validation metric is smaller than the best: %.5f vs %.5f" 205 | % (to_log[metric], self.best_valid_metric)) 206 | # decrease the learning rate, only if this is the 207 | # second time the validation metric decreases 208 | if self.decrease_lr: 209 | old_lr = self.map_optimizer.param_groups[0]['lr'] 210 | self.map_optimizer.param_groups[0]['lr'] *= self.params.lr_shrink 211 | logger.info("Shrinking the learning rate: %.5f -> %.5f" 212 | % (old_lr, self.map_optimizer.param_groups[0]['lr'])) 213 | self.decrease_lr = True 214 | 215 | def save_best(self, to_log, metric): 216 | """ 217 | Save the best model for the given validation metric. 218 | """ 219 | # best mapping for the given validation criterion 220 | if to_log[metric] > self.best_valid_metric: 221 | # new best mapping 222 | self.best_valid_metric = to_log[metric] 223 | logger.info('* Best value for "%s": %.5f' % (metric, to_log[metric])) 224 | # save the mapping 225 | W = self.mapping.weight.data.cpu().numpy() 226 | path = os.path.join(self.params.exp_path, 'best_mapping.pth') 227 | logger.info('* Saving the mapping to %s ...' % path) 228 | torch.save(W, path) 229 | 230 | def reload_best(self): 231 | """ 232 | Reload the best mapping. 233 | """ 234 | path = os.path.join(self.params.exp_path, 'best_mapping.pth') 235 | logger.info('* Reloading the best model from %s ...' % path) 236 | # reload the model 237 | assert os.path.isfile(path) 238 | to_reload = torch.from_numpy(torch.load(path)) 239 | W = self.mapping.weight.data 240 | assert to_reload.size() == W.size() 241 | W.copy_(to_reload.type_as(W)) 242 | 243 | def export(self): 244 | """ 245 | Export embeddings. 
246 | """ 247 | params = self.params 248 | 249 | # load all embeddings 250 | logger.info("Reloading all embeddings for mapping ...") 251 | params.src_dico, src_emb = load_embeddings(params, source=True, full_vocab=True) 252 | params.tgt_dico, tgt_emb = load_embeddings(params, source=False, full_vocab=True) 253 | 254 | # apply same normalization as during training 255 | normalize_embeddings(src_emb, params.normalize_embeddings, mean=params.src_mean) 256 | normalize_embeddings(tgt_emb, params.normalize_embeddings, mean=params.tgt_mean) 257 | 258 | # map source embeddings to the target space 259 | bs = 4096 260 | logger.info("Map source embeddings to the target space ...") 261 | for i, k in enumerate(range(0, len(src_emb), bs)): 262 | x = Variable(src_emb[k:k + bs], volatile=True) 263 | src_emb[k:k + bs] = self.mapping(x.cuda() if params.cuda else x).data.cpu() 264 | 265 | # write embeddings to the disk 266 | export_embeddings(src_emb, tgt_emb, params) 267 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import os 9 | import io 10 | import re 11 | import sys 12 | import pickle 13 | import random 14 | import inspect 15 | import argparse 16 | import subprocess 17 | import numpy as np 18 | import torch 19 | from torch import optim 20 | from logging import getLogger 21 | 22 | from .logger import create_logger 23 | from .dictionary import Dictionary 24 | 25 | 26 | MAIN_DUMP_PATH = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'dumped') 27 | 28 | logger = getLogger() 29 | 30 | 31 | # load Faiss if available (dramatically accelerates the nearest neighbor search) 32 | try: 33 | import faiss 34 | FAISS_AVAILABLE = True 35 | if not hasattr(faiss, 'StandardGpuResources'): 36 | sys.stderr.write("Impossible to import Faiss-GPU. " 37 | "Switching to FAISS-CPU, " 38 | "this will be slower.\n\n") 39 | 40 | except ImportError: 41 | sys.stderr.write("Impossible to import Faiss library!! " 42 | "Switching to standard nearest neighbors search implementation, " 43 | "this will be significantly slower.\n\n") 44 | FAISS_AVAILABLE = False 45 | 46 | 47 | def initialize_exp(params): 48 | """ 49 | Initialize experiment. 50 | """ 51 | # initialization 52 | if getattr(params, 'seed', -1) >= 0: 53 | np.random.seed(params.seed) 54 | torch.manual_seed(params.seed) 55 | if params.cuda: 56 | torch.cuda.manual_seed(params.seed) 57 | 58 | # dump parameters 59 | params.exp_path = get_exp_path(params) 60 | with io.open(os.path.join(params.exp_path, 'params.pkl'), 'wb') as f: 61 | pickle.dump(params, f) 62 | 63 | # create logger 64 | logger = create_logger(os.path.join(params.exp_path, 'train.log'), vb=params.verbose) 65 | logger.info('============ Initialized logger ============') 66 | logger.info('\n'.join('%s: %s' % (k, str(v)) for k, v in sorted(dict(vars(params)).items()))) 67 | logger.info('The experiment will be stored in %s' % params.exp_path) 68 | return logger 69 | 70 | 71 | def load_fasttext_model(path): 72 | """ 73 | Load a binarized fastText model. 74 | """ 75 | try: 76 | import fastText 77 | except ImportError: 78 | raise Exception("Unable to import fastText. 
Please install fastText for Python: " 79 | "https://github.com/facebookresearch/fastText") 80 | return fastText.load_model(path) 81 | 82 | 83 | def bow(sentences, word_vec, normalize=False): 84 | """ 85 | Get sentence representations using average bag-of-words. 86 | """ 87 | embeddings = [] 88 | for sent in sentences: 89 | sentvec = [word_vec[w] for w in sent if w in word_vec] 90 | if normalize: 91 | sentvec = [v / np.linalg.norm(v) for v in sentvec] 92 | if len(sentvec) == 0: 93 | sentvec = [word_vec[list(word_vec.keys())[0]]] 94 | embeddings.append(np.mean(sentvec, axis=0)) 95 | return np.vstack(embeddings) 96 | 97 | 98 | def bow_idf(sentences, word_vec, idf_dict=None): 99 | """ 100 | Get sentence representations using weighted IDF bag-of-words. 101 | """ 102 | embeddings = [] 103 | for sent in sentences: 104 | sent = set(sent) 105 | list_words = [w for w in sent if w in word_vec and w in idf_dict] 106 | if len(list_words) > 0: 107 | sentvec = [word_vec[w] * idf_dict[w] for w in list_words] 108 | sentvec = sentvec / np.sum([idf_dict[w] for w in list_words]) 109 | else: 110 | sentvec = [word_vec[list(word_vec.keys())[0]]] 111 | embeddings.append(np.sum(sentvec, axis=0)) 112 | return np.vstack(embeddings) 113 | 114 | 115 | def get_idf(europarl, src_lg, tgt_lg, n_idf): 116 | """ 117 | Compute IDF values. 118 | """ 119 | idf = {src_lg: {}, tgt_lg: {}} 120 | k = 0 121 | for lg in idf: 122 | start_idx = 200000 + k * n_idf 123 | end_idx = 200000 + (k + 1) * n_idf 124 | for sent in europarl[lg][start_idx:end_idx]: 125 | for word in set(sent): 126 | idf[lg][word] = idf[lg].get(word, 0) + 1 127 | n_doc = len(europarl[lg][start_idx:end_idx]) 128 | for word in idf[lg]: 129 | idf[lg][word] = max(1, np.log10(n_doc / (idf[lg][word]))) 130 | k += 1 131 | return idf 132 | 133 | 134 | def get_nn_avg_dist(emb, query, knn): 135 | """ 136 | Compute the average distance of the `knn` nearest neighbors 137 | for a given set of embeddings and queries. 138 | Use Faiss if available. 139 | """ 140 | if FAISS_AVAILABLE: 141 | emb = emb.cpu().numpy() 142 | query = query.cpu().numpy() 143 | if hasattr(faiss, 'StandardGpuResources'): 144 | # gpu mode 145 | res = faiss.StandardGpuResources() 146 | config = faiss.GpuIndexFlatConfig() 147 | config.device = 0 148 | index = faiss.GpuIndexFlatIP(res, emb.shape[1], config) 149 | else: 150 | # cpu mode 151 | index = faiss.IndexFlatIP(emb.shape[1]) 152 | index.add(emb) 153 | distances, _ = index.search(query, knn) 154 | return distances.mean(1) 155 | else: 156 | bs = 1024 157 | all_distances = [] 158 | emb = emb.transpose(0, 1).contiguous() 159 | for i in range(0, query.shape[0], bs): 160 | distances = query[i:i + bs].mm(emb) 161 | best_distances, _ = distances.topk(knn, dim=1, largest=True, sorted=True) 162 | all_distances.append(best_distances.mean(1).cpu()) 163 | all_distances = torch.cat(all_distances) 164 | return all_distances.numpy() 165 | 166 | 167 | def bool_flag(s): 168 | """ 169 | Parse boolean arguments from the command line. 170 | """ 171 | if s.lower() in ['off', 'false', '0']: 172 | return False 173 | if s.lower() in ['on', 'true', '1']: 174 | return True 175 | raise argparse.ArgumentTypeError("invalid value for a boolean flag (0 or 1)") 176 | 177 | 178 | def get_optimizer(s): 179 | """ 180 | Parse optimizer parameters.
181 | Input should be of the form: 182 | - "sgd,lr=0.01" 183 | - "adagrad,lr=0.1,lr_decay=0.05" 184 | """ 185 | if "," in s: 186 | method = s[:s.find(',')] 187 | optim_params = {} 188 | for x in s[s.find(',') + 1:].split(','): 189 | split = x.split('=') 190 | assert len(split) == 2 191 | assert re.match("^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None 192 | optim_params[split[0]] = float(split[1]) 193 | else: 194 | method = s 195 | optim_params = {} 196 | 197 | if method == 'adadelta': 198 | optim_fn = optim.Adadelta 199 | elif method == 'adagrad': 200 | optim_fn = optim.Adagrad 201 | elif method == 'adam': 202 | optim_fn = optim.Adam 203 | elif method == 'adamax': 204 | optim_fn = optim.Adamax 205 | elif method == 'asgd': 206 | optim_fn = optim.ASGD 207 | elif method == 'rmsprop': 208 | optim_fn = optim.RMSprop 209 | elif method == 'rprop': 210 | optim_fn = optim.Rprop 211 | elif method == 'sgd': 212 | optim_fn = optim.SGD 213 | assert 'lr' in optim_params 214 | else: 215 | raise Exception('Unknown optimization method: "%s"' % method) 216 | 217 | # check that we give good parameters to the optimizer 218 | expected_args = inspect.getargspec(optim_fn.__init__)[0] 219 | assert expected_args[:2] == ['self', 'params'] 220 | if not all(k in expected_args[2:] for k in optim_params.keys()): 221 | raise Exception('Unexpected parameters: expected "%s", got "%s"' % ( 222 | str(expected_args[2:]), str(optim_params.keys()))) 223 | 224 | return optim_fn, optim_params 225 | 226 | 227 | def get_exp_path(params): 228 | """ 229 | Create a directory to store the experiment. 230 | """ 231 | # create the main dump path if it does not exist 232 | exp_folder = MAIN_DUMP_PATH if params.exp_path == '' else params.exp_path 233 | if not os.path.exists(exp_folder): 234 | subprocess.Popen("mkdir %s" % exp_folder, shell=True).wait() 235 | assert params.exp_name != '' 236 | exp_folder = os.path.join(exp_folder, params.exp_name) 237 | if not os.path.exists(exp_folder): 238 | subprocess.Popen("mkdir %s" % exp_folder, shell=True).wait() 239 | if params.exp_id == '': 240 | chars = 'abcdefghijklmnopqrstuvwxyz0123456789' 241 | while True: 242 | exp_id = ''.join(random.choice(chars) for _ in range(10)) 243 | exp_path = os.path.join(exp_folder, exp_id) 244 | if not os.path.isdir(exp_path): 245 | break 246 | else: 247 | exp_path = os.path.join(exp_folder, params.exp_id) 248 | assert not os.path.isdir(exp_path), exp_path 249 | # create the dump folder 250 | if not os.path.isdir(exp_path): 251 | subprocess.Popen("mkdir %s" % exp_path, shell=True).wait() 252 | return exp_path 253 | 254 | 255 | def clip_parameters(model, clip): 256 | """ 257 | Clip model weights. 258 | """ 259 | if clip > 0: 260 | for x in model.parameters(): 261 | x.data.clamp_(-clip, clip) 262 | 263 | 264 | def read_txt_embeddings(params, source, full_vocab): 265 | """ 266 | Reload pretrained embeddings from a text file.
267 | """ 268 | word2id = {} 269 | vectors = [] 270 | 271 | # load pretrained embeddings 272 | lang = params.src_lang if source else params.tgt_lang 273 | emb_path = params.src_emb if source else params.tgt_emb 274 | _emb_dim_file = params.emb_dim 275 | with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f: 276 | for i, line in enumerate(f): 277 | if i == 0: 278 | split = line.split() 279 | assert len(split) == 2 280 | assert _emb_dim_file == int(split[1]) 281 | else: 282 | word, vect = line.rstrip().split(' ', 1) 283 | if not full_vocab: 284 | word = word.lower() 285 | vect = np.fromstring(vect, sep=' ') 286 | if np.linalg.norm(vect) == 0: # avoid to have null embeddings 287 | vect[0] = 0.01 288 | if word in word2id: 289 | if full_vocab: 290 | logger.warning("Word '%s' found twice in %s embedding file" 291 | % (word, 'source' if source else 'target')) 292 | else: 293 | if not vect.shape == (_emb_dim_file,): 294 | logger.warning("Invalid dimension (%i) for %s word '%s' in line %i." 295 | % (vect.shape[0], 'source' if source else 'target', word, i)) 296 | continue 297 | assert vect.shape == (_emb_dim_file,), i 298 | word2id[word] = len(word2id) 299 | vectors.append(vect[None]) 300 | if params.max_vocab > 0 and len(word2id) >= params.max_vocab and not full_vocab: 301 | break 302 | 303 | assert len(word2id) == len(vectors) 304 | logger.info("Loaded %i pre-trained word embeddings." % len(vectors)) 305 | 306 | # compute new vocabulary / embeddings 307 | id2word = {v: k for k, v in word2id.items()} 308 | dico = Dictionary(id2word, word2id, lang) 309 | embeddings = np.concatenate(vectors, 0) 310 | embeddings = torch.from_numpy(embeddings).float() 311 | embeddings = embeddings.cuda() if (params.cuda and not full_vocab) else embeddings 312 | 313 | assert embeddings.size() == (len(dico), params.emb_dim) 314 | return dico, embeddings 315 | 316 | 317 | def select_subset(word_list, max_vocab): 318 | """ 319 | Select a subset of words to consider, to deal with words having embeddings 320 | available in different casings. In particular, we select the embeddings of 321 | the most frequent words, that are usually of better quality. 322 | """ 323 | word2id = {} 324 | indexes = [] 325 | for i, word in enumerate(word_list): 326 | word = word.lower() 327 | if word not in word2id: 328 | word2id[word] = len(word2id) 329 | indexes.append(i) 330 | if max_vocab > 0 and len(word2id) >= max_vocab: 331 | break 332 | assert len(word2id) == len(indexes) 333 | return word2id, torch.LongTensor(indexes) 334 | 335 | 336 | def load_pth_embeddings(params, source, full_vocab): 337 | """ 338 | Reload pretrained embeddings from a PyTorch binary file. 339 | """ 340 | # reload PyTorch binary file 341 | lang = params.src_lang if source else params.tgt_lang 342 | data = torch.load(params.src_emb if source else params.tgt_emb) 343 | dico = data['dico'] 344 | embeddings = data['vectors'] 345 | assert dico.lang == lang 346 | assert embeddings.size() == (len(dico), params.emb_dim) 347 | logger.info("Loaded %i pre-trained word embeddings." 
% len(dico)) 348 | 349 | # select a subset of word embeddings (to deal with casing) 350 | if not full_vocab: 351 | word2id, indexes = select_subset([dico[i] for i in range(len(dico))], params.max_vocab) 352 | id2word = {v: k for k, v in word2id.items()} 353 | dico = Dictionary(id2word, word2id, lang) 354 | embeddings = embeddings[indexes] 355 | 356 | assert embeddings.size() == (len(dico), params.emb_dim) 357 | return dico, embeddings 358 | 359 | 360 | def load_bin_embeddings(params, source, full_vocab): 361 | """ 362 | Reload pretrained embeddings from a fastText binary file. 363 | """ 364 | # reload fastText binary file 365 | lang = params.src_lang if source else params.tgt_lang 366 | model = load_fasttext_model(params.src_emb if source else params.tgt_emb) 367 | words = model.get_labels() 368 | assert model.get_dimension() == params.emb_dim 369 | logger.info("Loaded binary model. Generating embeddings ...") 370 | embeddings = torch.from_numpy(np.concatenate([model.get_word_vector(w)[None] for w in words], 0)) 371 | logger.info("Generated embeddings for %i words." % len(words)) 372 | assert embeddings.size() == (len(words), params.emb_dim) 373 | 374 | # select a subset of word embeddings (to deal with casing) 375 | if not full_vocab: 376 | word2id, indexes = select_subset(words, params.max_vocab) 377 | embeddings = embeddings[indexes] 378 | else: 379 | word2id = {w: i for i, w in enumerate(words)} 380 | id2word = {i: w for w, i in word2id.items()} 381 | dico = Dictionary(id2word, word2id, lang) 382 | 383 | assert embeddings.size() == (len(dico), params.emb_dim) 384 | return dico, embeddings 385 | 386 | 387 | def load_embeddings(params, source, full_vocab=False): 388 | """ 389 | Reload pretrained embeddings. 390 | - `full_vocab == False` means that we load the `params.max_vocab` most frequent words. 391 | It is used at the beginning of the experiment. 392 | In that setting, if two words with a different casing occur, we lowercase both, and 393 | only consider the most frequent one. For instance, if "London" and "london" are in 394 | the embeddings file, we only consider the most frequent one, (in that case, probably 395 | London). This is done to deal with the lowercased dictionaries. 396 | - `full_vocab == True` means that we load the entire embedding text file, 397 | before we export the embeddings at the end of the experiment. 398 | """ 399 | assert type(source) is bool and type(full_vocab) is bool 400 | emb_path = params.src_emb if source else params.tgt_emb 401 | if emb_path.endswith('.pth'): 402 | return load_pth_embeddings(params, source, full_vocab) 403 | if emb_path.endswith('.bin'): 404 | return load_bin_embeddings(params, source, full_vocab) 405 | else: 406 | return read_txt_embeddings(params, source, full_vocab) 407 | 408 | 409 | def normalize_embeddings(emb, types, mean=None): 410 | """ 411 | Normalize embeddings by their norms / recenter them. 412 | """ 413 | for t in types.split(','): 414 | if t == '': 415 | continue 416 | if t == 'center': 417 | if mean is None: 418 | mean = emb.mean(0, keepdim=True) 419 | emb.sub_(mean.expand_as(emb)) 420 | elif t == 'renorm': 421 | emb.div_(emb.norm(2, 1, keepdim=True).expand_as(emb)) 422 | else: 423 | raise Exception('Unknown normalization type: "%s"' % t) 424 | return mean.cpu() if mean is not None else None 425 | 426 | 427 | def export_embeddings(src_emb, tgt_emb, params): 428 | """ 429 | Export embeddings to a text or a PyTorch file. 
430 | """ 431 | assert params.export in ["txt", "pth"] 432 | 433 | # text file 434 | if params.export == "txt": 435 | src_path = os.path.join(params.exp_path, 'vectors-%s.txt' % params.src_lang) 436 | tgt_path = os.path.join(params.exp_path, 'vectors-%s.txt' % params.tgt_lang) 437 | # source embeddings 438 | logger.info('Writing source embeddings to %s ...' % src_path) 439 | with io.open(src_path, 'w', encoding='utf-8') as f: 440 | f.write(u"%i %i\n" % src_emb.size()) 441 | for i in range(len(params.src_dico)): 442 | f.write(u"%s %s\n" % (params.src_dico[i], " ".join('%.5f' % x for x in src_emb[i]))) 443 | # target embeddings 444 | logger.info('Writing target embeddings to %s ...' % tgt_path) 445 | with io.open(tgt_path, 'w', encoding='utf-8') as f: 446 | f.write(u"%i %i\n" % tgt_emb.size()) 447 | for i in range(len(params.tgt_dico)): 448 | f.write(u"%s %s\n" % (params.tgt_dico[i], " ".join('%.5f' % x for x in tgt_emb[i]))) 449 | 450 | # PyTorch file 451 | if params.export == "pth": 452 | src_path = os.path.join(params.exp_path, 'vectors-%s.pth' % params.src_lang) 453 | tgt_path = os.path.join(params.exp_path, 'vectors-%s.pth' % params.tgt_lang) 454 | logger.info('Writing source embeddings to %s ...' % src_path) 455 | torch.save({'dico': params.src_dico, 'vectors': src_emb}, src_path) 456 | logger.info('Writing target embeddings to %s ...' % tgt_path) 457 | torch.save({'dico': params.tgt_dico, 'vectors': tgt_emb}, tgt_path) 458 | -------------------------------------------------------------------------------- /supervised.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | import os 9 | import json 10 | import argparse 11 | from collections import OrderedDict 12 | import torch 13 | 14 | from src.utils import bool_flag, initialize_exp 15 | from src.models import build_model 16 | from src.trainer import Trainer 17 | from src.evaluation import Evaluator 18 | 19 | 20 | VALIDATION_METRIC_SUP = 'precision_at_1-csls_knn_10' 21 | VALIDATION_METRIC_UNSUP = 'mean_cosine-csls_knn_10-S2T-10000' 22 | 23 | 24 | # main 25 | parser = argparse.ArgumentParser(description='Supervised training') 26 | parser.add_argument("--seed", type=int, default=-1, help="Initialization seed") 27 | parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)") 28 | parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models") 29 | parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name") 30 | parser.add_argument("--exp_id", type=str, default="", help="Experiment ID") 31 | parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU") 32 | parser.add_argument("--export", type=str, default="txt", help="Export embeddings after training (txt / pth)") 33 | 34 | # data 35 | parser.add_argument("--src_lang", type=str, default='en', help="Source language") 36 | parser.add_argument("--tgt_lang", type=str, default='es', help="Target language") 37 | parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension") 38 | parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size (-1 to disable)") 39 | # training refinement 40 | parser.add_argument("--n_refinement", type=int, default=5, help="Number of refinement iterations (0 to disable the refinement procedure)") 41 | # dictionary creation parameters (for refinement) 42 | parser.add_argument("--dico_train", type=str, default="default", help="Path to training dictionary (default: use identical character strings)") 43 | parser.add_argument("--dico_eval", type=str, default="default", help="Path to evaluation dictionary") 44 | parser.add_argument("--dico_method", type=str, default='csls_knn_10', help="Method used for dictionary generation (nn/invsm_beta_30/csls_knn_10)") 45 | parser.add_argument("--dico_build", type=str, default='S2T&T2S', help="S2T,T2S,S2T|T2S,S2T&T2S") 46 | parser.add_argument("--dico_threshold", type=float, default=0, help="Threshold confidence for dictionary generation") 47 | parser.add_argument("--dico_max_rank", type=int, default=10000, help="Maximum dictionary words rank (0 to disable)") 48 | parser.add_argument("--dico_min_size", type=int, default=0, help="Minimum generated dictionary size (0 to disable)") 49 | parser.add_argument("--dico_max_size", type=int, default=0, help="Maximum generated dictionary size (0 to disable)") 50 | # reload pre-trained embeddings 51 | parser.add_argument("--src_emb", type=str, default='', help="Reload source embeddings") 52 | parser.add_argument("--tgt_emb", type=str, default='', help="Reload target embeddings") 53 | parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training") 54 | 55 | 56 | # parse parameters 57 | params = parser.parse_args() 58 | 59 | # check parameters 60 | assert not params.cuda or torch.cuda.is_available() 61 | assert params.dico_train in ["identical_char", "default"] or os.path.isfile(params.dico_train) 62 | assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"] 63 | assert params.dico_max_size == 0 or params.dico_max_size < 
params.dico_max_rank 64 | assert params.dico_max_size == 0 or params.dico_max_size > params.dico_min_size 65 | assert os.path.isfile(params.src_emb) 66 | assert os.path.isfile(params.tgt_emb) 67 | assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval) 68 | assert params.export in ["", "txt", "pth"] 69 | 70 | # build logger / model / trainer / evaluator 71 | logger = initialize_exp(params) 72 | src_emb, tgt_emb, mapping, _ = build_model(params, False) 73 | trainer = Trainer(src_emb, tgt_emb, mapping, None, params) 74 | evaluator = Evaluator(trainer) 75 | 76 | # load a training dictionary. if a dictionary path is not provided, use a default 77 | # one ("default") or create one based on identical character strings ("identical_char") 78 | trainer.load_training_dico(params.dico_train) 79 | 80 | # define the validation metric 81 | VALIDATION_METRIC = VALIDATION_METRIC_UNSUP if params.dico_train == 'identical_char' else VALIDATION_METRIC_SUP 82 | logger.info("Validation metric: %s" % VALIDATION_METRIC) 83 | 84 | """ 85 | Learning loop for Procrustes Iterative Learning 86 | """ 87 | for n_iter in range(params.n_refinement + 1): 88 | 89 | logger.info('Starting iteration %i...' % n_iter) 90 | 91 | # build a dictionary from aligned embeddings (unless 92 | # it is the first iteration and we use the init one) 93 | if n_iter > 0 or not hasattr(trainer, 'dico'): 94 | trainer.build_dictionary() 95 | 96 | # apply the Procrustes solution 97 | trainer.procrustes() 98 | 99 | # embeddings evaluation 100 | to_log = OrderedDict({'n_iter': n_iter}) 101 | evaluator.all_eval(to_log) 102 | 103 | # JSON log / save best model / end of epoch 104 | logger.info("__log__:%s" % json.dumps(to_log)) 105 | trainer.save_best(to_log, VALIDATION_METRIC) 106 | logger.info('End of iteration %i.\n\n' % n_iter) 107 | 108 | 109 | # export embeddings 110 | if params.export: 111 | trainer.reload_best() 112 | trainer.export() 113 | -------------------------------------------------------------------------------- /unsupervised.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | import os 9 | import time 10 | import json 11 | import argparse 12 | from collections import OrderedDict 13 | import numpy as np 14 | import torch 15 | 16 | from src.utils import bool_flag, initialize_exp 17 | from src.models import build_model 18 | from src.trainer import Trainer 19 | from src.evaluation import Evaluator 20 | 21 | 22 | VALIDATION_METRIC = 'mean_cosine-csls_knn_10-S2T-10000' 23 | 24 | 25 | # main 26 | parser = argparse.ArgumentParser(description='Unsupervised training') 27 | parser.add_argument("--seed", type=int, default=-1, help="Initialization seed") 28 | parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)") 29 | parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models") 30 | parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name") 31 | parser.add_argument("--exp_id", type=str, default="", help="Experiment ID") 32 | parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU") 33 | parser.add_argument("--export", type=str, default="txt", help="Export embeddings after training (txt / pth)") 34 | # data 35 | parser.add_argument("--src_lang", type=str, default='en', help="Source language") 36 | parser.add_argument("--tgt_lang", type=str, default='es', help="Target language") 37 | parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension") 38 | parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size (-1 to disable)") 39 | # mapping 40 | parser.add_argument("--map_id_init", type=bool_flag, default=True, help="Initialize the mapping as an identity matrix") 41 | parser.add_argument("--map_beta", type=float, default=0.001, help="Beta for orthogonalization") 42 | # discriminator 43 | parser.add_argument("--dis_layers", type=int, default=2, help="Discriminator layers") 44 | parser.add_argument("--dis_hid_dim", type=int, default=2048, help="Discriminator hidden layer dimensions") 45 | parser.add_argument("--dis_dropout", type=float, default=0., help="Discriminator dropout") 46 | parser.add_argument("--dis_input_dropout", type=float, default=0.1, help="Discriminator input dropout") 47 | parser.add_argument("--dis_steps", type=int, default=5, help="Discriminator steps") 48 | parser.add_argument("--dis_lambda", type=float, default=1, help="Discriminator loss feedback coefficient") 49 | parser.add_argument("--dis_most_frequent", type=int, default=75000, help="Select embeddings of the k most frequent words for discrimination (0 to disable)") 50 | parser.add_argument("--dis_smooth", type=float, default=0.1, help="Discriminator smooth predictions") 51 | parser.add_argument("--dis_clip_weights", type=float, default=0, help="Clip discriminator weights (0 to disable)") 52 | # training adversarial 53 | parser.add_argument("--adversarial", type=bool_flag, default=True, help="Use adversarial training") 54 | parser.add_argument("--n_epochs", type=int, default=5, help="Number of epochs") 55 | parser.add_argument("--epoch_size", type=int, default=1000000, help="Iterations per epoch") 56 | parser.add_argument("--batch_size", type=int, default=32, help="Batch size") 57 | parser.add_argument("--map_optimizer", type=str, default="sgd,lr=0.1", help="Mapping optimizer") 58 | parser.add_argument("--dis_optimizer", type=str, default="sgd,lr=0.1", help="Discriminator optimizer") 59 | parser.add_argument("--lr_decay", type=float, default=0.98, help="Learning rate decay (SGD only)") 60 | 
parser.add_argument("--min_lr", type=float, default=1e-6, help="Minimum learning rate (SGD only)") 61 | parser.add_argument("--lr_shrink", type=float, default=0.5, help="Shrink the learning rate if the validation metric decreases (1 to disable)") 62 | # training refinement 63 | parser.add_argument("--n_refinement", type=int, default=5, help="Number of refinement iterations (0 to disable the refinement procedure)") 64 | # dictionary creation parameters (for refinement) 65 | parser.add_argument("--dico_eval", type=str, default="default", help="Path to evaluation dictionary") 66 | parser.add_argument("--dico_method", type=str, default='csls_knn_10', help="Method used for dictionary generation (nn/invsm_beta_30/csls_knn_10)") 67 | parser.add_argument("--dico_build", type=str, default='S2T', help="S2T,T2S,S2T|T2S,S2T&T2S") 68 | parser.add_argument("--dico_threshold", type=float, default=0, help="Threshold confidence for dictionary generation") 69 | parser.add_argument("--dico_max_rank", type=int, default=15000, help="Maximum dictionary words rank (0 to disable)") 70 | parser.add_argument("--dico_min_size", type=int, default=0, help="Minimum generated dictionary size (0 to disable)") 71 | parser.add_argument("--dico_max_size", type=int, default=0, help="Maximum generated dictionary size (0 to disable)") 72 | # reload pre-trained embeddings 73 | parser.add_argument("--src_emb", type=str, default="", help="Reload source embeddings") 74 | parser.add_argument("--tgt_emb", type=str, default="", help="Reload target embeddings") 75 | parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training") 76 | 77 | 78 | # parse parameters 79 | params = parser.parse_args() 80 | 81 | # check parameters 82 | assert not params.cuda or torch.cuda.is_available() 83 | assert 0 <= params.dis_dropout < 1 84 | assert 0 <= params.dis_input_dropout < 1 85 | assert 0 <= params.dis_smooth < 0.5 86 | assert params.dis_lambda > 0 and params.dis_steps > 0 87 | assert 0 < params.lr_shrink <= 1 88 | assert os.path.isfile(params.src_emb) 89 | assert os.path.isfile(params.tgt_emb) 90 | assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval) 91 | assert params.export in ["", "txt", "pth"] 92 | 93 | # build model / trainer / evaluator 94 | logger = initialize_exp(params) 95 | src_emb, tgt_emb, mapping, discriminator = build_model(params, True) 96 | trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params) 97 | evaluator = Evaluator(trainer) 98 | 99 | 100 | """ 101 | Learning loop for Adversarial Training 102 | """ 103 | if params.adversarial: 104 | logger.info('----> ADVERSARIAL TRAINING <----\n\n') 105 | 106 | # training loop 107 | for n_epoch in range(params.n_epochs): 108 | 109 | logger.info('Starting adversarial training epoch %i...' 
% n_epoch) 110 | tic = time.time() 111 | n_words_proc = 0 112 | stats = {'DIS_COSTS': []} 113 | 114 | for n_iter in range(0, params.epoch_size, params.batch_size): 115 | 116 | # discriminator training 117 | for _ in range(params.dis_steps): 118 | trainer.dis_step(stats) 119 | 120 | # mapping training (discriminator fooling) 121 | n_words_proc += trainer.mapping_step(stats) 122 | 123 | # log stats 124 | if n_iter % 500 == 0: 125 | stats_str = [('DIS_COSTS', 'Discriminator loss')] 126 | stats_log = ['%s: %.4f' % (v, np.mean(stats[k])) 127 | for k, v in stats_str if len(stats[k]) > 0] 128 | stats_log.append('%i samples/s' % int(n_words_proc / (time.time() - tic))) 129 | logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log)) 130 | 131 | # reset 132 | tic = time.time() 133 | n_words_proc = 0 134 | for k, _ in stats_str: 135 | del stats[k][:] 136 | 137 | # embeddings / discriminator evaluation 138 | to_log = OrderedDict({'n_epoch': n_epoch}) 139 | evaluator.all_eval(to_log) 140 | evaluator.eval_dis(to_log) 141 | 142 | # JSON log / save best model / end of epoch 143 | logger.info("__log__:%s" % json.dumps(to_log)) 144 | trainer.save_best(to_log, VALIDATION_METRIC) 145 | logger.info('End of epoch %i.\n\n' % n_epoch) 146 | 147 | # update the learning rate (stop if too small) 148 | trainer.update_lr(to_log, VALIDATION_METRIC) 149 | if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr: 150 | logger.info('Learning rate < 1e-6. BREAK.') 151 | break 152 | 153 | 154 | """ 155 | Learning loop for Procrustes Iterative Refinement 156 | """ 157 | if params.n_refinement > 0: 158 | # Get the best mapping according to VALIDATION_METRIC 159 | logger.info('----> ITERATIVE PROCRUSTES REFINEMENT <----\n\n') 160 | trainer.reload_best() 161 | 162 | # training loop 163 | for n_iter in range(params.n_refinement): 164 | 165 | logger.info('Starting refinement iteration %i...' % n_iter) 166 | 167 | # build a dictionary from aligned embeddings 168 | trainer.build_dictionary() 169 | 170 | # apply the Procrustes solution 171 | trainer.procrustes() 172 | 173 | # embeddings evaluation 174 | to_log = OrderedDict({'n_iter': n_iter}) 175 | evaluator.all_eval(to_log) 176 | 177 | # JSON log / save best model / end of epoch 178 | logger.info("__log__:%s" % json.dumps(to_log)) 179 | trainer.save_best(to_log, VALIDATION_METRIC) 180 | logger.info('End of refinement iteration %i.\n\n' % n_iter) 181 | 182 | 183 | # export embeddings 184 | if params.export: 185 | trainer.reload_best() 186 | trainer.export() 187 | --------------------------------------------------------------------------------
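The `procrustes()` method in `src/trainer.py` above applies the closed-form solution of the orthogonal Procrustes problem: for the dictionary-aligned matrices A (source) and B (target), the optimal orthogonal map is W = U V^T, where U S V^T is the SVD of M = B^T A. The stand-alone NumPy sketch below (synthetic data and shapes, not part of the repository) shows the same update recovering a known orthogonal mapping:

import numpy as np
import scipy.linalg

rng = np.random.RandomState(0)
n, d = 1000, 50
A = rng.randn(n, d)                          # source embeddings of the dictionary pairs
W_true, _ = np.linalg.qr(rng.randn(d, d))    # an arbitrary orthogonal "ground-truth" mapping
B = A.dot(W_true.T)                          # target embeddings, here exact images of A

# closed-form orthogonal Procrustes solution, mirroring Trainer.procrustes()
M = B.T.dot(A)
U, S, V_t = scipy.linalg.svd(M, full_matrices=True)
W = U.dot(V_t)

print(np.allclose(W, W_true))                # True: the mapping is recovered
print(np.allclose(W.dot(W.T), np.eye(d)))    # True: W is orthogonal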
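The `orthogonalize()` method keeps the mapping close to orthogonal during adversarial training with the update W <- (1 + beta) W - beta W W^T W, where beta is `--map_beta` (0.001 by default). A small illustrative sketch (synthetic matrix; beta is deliberately larger than the default so the effect is visible within a few hundred iterations):

import numpy as np

rng = np.random.RandomState(0)
d, beta = 50, 0.01

Q, _ = np.linalg.qr(rng.randn(d, d))    # an exactly orthogonal matrix
W = Q + 0.05 * rng.randn(d, d)          # perturb it slightly

for step in range(201):
    if step % 50 == 0:
        # Frobenius distance of W W^T from the identity
        print(step, np.linalg.norm(W.dot(W.T) - np.eye(d)))
    W = (1 + beta) * W - beta * W.dot(W.T).dot(W)
# the printed deviation from orthogonality shrinks steadily toward zero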
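`get_nn_avg_dist()` in `src/utils.py` returns, for every query vector, the mean similarity to its `knn` nearest neighbors in the other embedding set. In the CSLS retrieval criterion used by MUSE this quantity acts as a hubness penalty: CSLS(x, y) = 2 cos(x, y) - r_T(x) - r_S(y). The NumPy-only sketch below (random unit vectors and a simplified brute-force helper, not the repository's Faiss/torch implementation) shows how those penalties turn into translation candidates:

import numpy as np

def nn_avg_dist(emb, query, knn):
    # mean cosine similarity between each query and its `knn` nearest rows of `emb`
    # (vectors are assumed L2-normalized, so dot products are cosines)
    sims = query.dot(emb.T)                   # (n_query, n_emb)
    top = np.sort(sims, axis=1)[:, -knn:]     # knn largest similarities per query
    return top.mean(axis=1)

rng = np.random.RandomState(0)
n_src, n_tgt, d, knn = 200, 300, 50, 10
src = rng.randn(n_src, d)
src /= np.linalg.norm(src, axis=1, keepdims=True)
tgt = rng.randn(n_tgt, d)
tgt /= np.linalg.norm(tgt, axis=1, keepdims=True)

r_src = nn_avg_dist(tgt, src, knn)    # penalty for each (mapped) source word
r_tgt = nn_avg_dist(src, tgt, knn)    # penalty for each target word

# CSLS score matrix and source -> target translation candidates
csls = 2 * src.dot(tgt.T) - r_src[:, None] - r_tgt[None, :]
translations = csls.argmax(axis=1)
print(translations[:10])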
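`get_optimizer()` expects optimizer description strings such as the `--map_optimizer sgd,lr=0.1` default used by `unsupervised.py`. A short usage sketch (assuming the repository root is on the Python path; the `nn.Linear` module is only a stand-in for the mapping):

import torch.nn as nn
from src.utils import get_optimizer

layer = nn.Linear(300, 300)

# "sgd,lr=0.1" parses into (torch.optim.SGD, {'lr': 0.1}); extra key=value pairs
# such as "sgd,lr=0.1,momentum=0.9" are validated against the optimizer's signature
optim_fn, optim_params = get_optimizer("sgd,lr=0.1")
optimizer = optim_fn(layer.parameters(), **optim_params)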