├── tests
│   ├── __init__.py
│   ├── data
│   │   └── .gitkeep
│   ├── conftest.py
│   ├── test_embedding.py
│   ├── test_preprocessing.py
│   ├── test_utils.py
│   ├── test_laser.py
│   └── report
│       └── comparison-with-LASER.md
├── laserembeddings
│   ├── data
│   │   └── .gitkeep
│   ├── __init__.py
│   ├── embedding.py
│   ├── utils.py
│   ├── __main__.py
│   ├── preprocessing.py
│   ├── laser.py
│   └── encoder.py
├── .style.yapf
├── laserembeddings.gif
├── pylintrc
├── pyproject.toml
├── LICENSE
├── .github
│   └── workflows
│       └── python-package.yml
├── CHANGELOG.md
├── .travis.yml
├── .gitignore
└── README.md
/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /laserembeddings/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style=pep8 3 | -------------------------------------------------------------------------------- /laserembeddings.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yannvgn/laserembeddings/HEAD/laserembeddings.gif -------------------------------------------------------------------------------- /laserembeddings/__init__.py: -------------------------------------------------------------------------------- 1 | from .laser import Laser 2 | 3 | __version__ = '1.1.2' 4 | 5 | __all__ = ['Laser'] 6 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | disable= 3 | line-too-long, 4 | trailing-whitespace, 5 | missing-docstring, 6 | too-many-locals, 7 | too-many-instance-attributes, 8 | invalid-name, 9 | too-few-public-methods, 10 | too-many-arguments, 11 | 12 | [TYPECHECK] 13 | ignored-modules=numpy,torch 14 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def test_data(): 8 | import numpy as np 9 | test_data_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 10 | 'data', 'laserembeddings-test-data.npz') 11 | 12 | return np.load(test_data_file) if os.path.isfile(test_data_file) else None 13 | -------------------------------------------------------------------------------- /tests/test_embedding.py: -------------------------------------------------------------------------------- 1 | from laserembeddings import Laser 2 | from laserembeddings.embedding import BPESentenceEmbedding 3 | 4 | 5 | def test_bpe_sentence_embedding(): 6 | assert BPESentenceEmbedding( 7 | Laser.DEFAULT_ENCODER_FILE).embed_bpe_sentences(['hello', 'world' 8 | ]).shape == (2, 1024) 9 | 10 | with open(Laser.DEFAULT_ENCODER_FILE, 'rb') as encoder_f: 11 | assert BPESentenceEmbedding(encoder_f).embed_bpe_sentences( 12 | ['hello', 'world']).shape == (2, 1024) 13 | -------------------------------------------------------------------------------- /pyproject.toml:
-------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "laserembeddings" 3 | version = "1.1.2" 4 | description = "Production-ready LASER multilingual embeddings" 5 | authors = ["yannvgn "] 6 | license = "BSD-3-Clause" 7 | homepage = "https://github.com/yannvgn/laserembeddings" 8 | repository = "https://github.com/yannvgn/laserembeddings" 9 | readme = "README.md" 10 | 11 | [tool.poetry.dependencies] 12 | python = "^3.6.2" 13 | torch = "^1.0.1.post2" 14 | subword-nmt = "^0.3.6" 15 | numpy = "^1.15.4" 16 | sacremoses = "0.0.35" 17 | transliterate = "1.10.2" 18 | mecab-python3 = { version = "^1.0.1", optional = true } 19 | ipadic = { version = "1.0.0", optional = true } 20 | jieba = { version = "^0.42.1", optional = true } 21 | 22 | [tool.poetry.dev-dependencies] 23 | pytest = "^4.6" 24 | yapf = "^0.27.0" 25 | pylint = "^2.3" 26 | 27 | [tool.poetry.extras] 28 | zh = ["jieba"] 29 | ja = ["mecab-python3", "ipadic"] 30 | 31 | [build-system] 32 | requires = ["poetry_core>=1.0.0"] 33 | build-backend = "poetry.core.masonry.api" 34 | -------------------------------------------------------------------------------- /tests/test_preprocessing.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from laserembeddings import Laser 4 | from laserembeddings.preprocessing import Tokenizer, BPE 5 | 6 | from laserembeddings.utils import sre_performance_patch 7 | 8 | 9 | def test_tokenizer(): 10 | with sre_performance_patch(): 11 | assert Tokenizer('en').tokenize("Let's do it!") == "let 's do it !" 12 | 13 | assert Tokenizer( 14 | 'en', descape=True).tokenize("Let's do it & pass that test!" 15 | ) == "let 's do it & pass that test !" 16 | 17 | with pytest.raises(AssertionError): 18 | Tokenizer(lower_case=False) 19 | 20 | assert not Tokenizer('en').romanize 21 | assert Tokenizer('el').romanize 22 | 23 | 24 | def test_bpe(): 25 | with open(Laser.DEFAULT_BPE_VOCAB_FILE, 'r', encoding='utf-8') as f_vocab: 26 | bpe = BPE(Laser.DEFAULT_BPE_CODES_FILE, f_vocab) 27 | assert bpe.encode_tokens( 28 | "the tests are passing") == 'the test@@ s are passing' 29 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | 3 | from laserembeddings.utils import adapt_bpe_codes, sre_performance_patch 4 | 5 | 6 | def test_bpe_codes_adapter(): 7 | test_f = StringIO( 8 | '#version:2.0\ne n 52708119\ne r 51024442\ne n 47209692') 9 | 10 | adapted = adapt_bpe_codes(test_f) 11 | 12 | assert adapted.readline() == '#version:2.0\n' 13 | assert adapted.readline() == 'e n\n' 14 | assert adapted.readline() == 'e r\n' 15 | 16 | for line in adapted: 17 | assert line == 'e n' 18 | 19 | adapted.seek(0) 20 | 21 | for line in adapted: 22 | assert line == '#version:2.0\n' 23 | break 24 | 25 | 26 | def test_sre_performance_patch(): 27 | #pylint: disable=protected-access 28 | try: 29 | import sre_parse 30 | uniq = sre_parse._uniq 31 | 32 | with sre_performance_patch(): 33 | assert sre_parse._uniq(['5', '2', '3', '2', '5', 34 | '1']) == ['5', '2', '3', '1'] 35 | 36 | # make sure the original sre_parse._uniq was restored 37 | assert sre_parse._uniq == uniq 38 | except (ImportError, AttributeError): 39 | pass 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 
3-Clause License 2 | 3 | Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. 4 | Copyright (c) 2019 - 2020 yannvgn 5 | 6 | Redistribution and use in source and binary forms, with or without modification, 7 | are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name Facebook nor the names of its contributors may be used to 17 | endorse or promote products derived from this software without specific 18 | prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 21 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 22 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 24 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 25 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 26 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 27 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | os: [ubuntu-latest, macos-latest, windows-latest] 20 | python-version: ["3.6", "3.7", "3.8", "3.9"] 21 | exclude: 22 | - os: macos-latest 23 | python-version: '3.6' 24 | - os: macos-latest 25 | python-version: '3.7' 26 | - os: macos-latest 27 | python-version: '3.8' 28 | - os: windows-latest 29 | python-version: '3.6' 30 | - os: windows-latest 31 | python-version: '3.7' 32 | - os: windows-latest 33 | python-version: '3.8' 34 | 35 | steps: 36 | - uses: actions/checkout@v2 37 | - name: Set up Python ${{ matrix.python-version }} 38 | uses: actions/setup-python@v2 39 | with: 40 | python-version: ${{ matrix.python-version }} 41 | - name: Install Poetry 42 | uses: snok/install-poetry@v1 43 | with: 44 | virtualenvs-create: true 45 | virtualenvs-in-project: false 46 | installer-parallel: true 47 | - name: Install laserembeddings 48 | run: | 49 | poetry install -E zh -E ja 50 | poetry run python -m laserembeddings download-models 51 | - name: Lint 52 | run: | 53 | poetry run pylint laserembeddings 54 | - name: Test with pytest 55 | run: | 56 | poetry run pytest 57 | -------------------------------------------------------------------------------- /laserembeddings/embedding.py: 
-------------------------------------------------------------------------------- 1 | from typing import Optional, List, Union 2 | from io import BufferedIOBase 3 | 4 | import numpy as np 5 | 6 | from .encoder import SentenceEncoder 7 | 8 | __all__ = ['BPESentenceEmbedding'] 9 | 10 | 11 | class BPESentenceEmbedding: 12 | """ 13 | LASER embeddings computation from BPE-encoded sentences. 14 | 15 | Args: 16 | encoder (str or BufferedIOBase): the path to LASER's encoder PyTorch model, 17 | or a binary-mode file object. 18 | max_sentences (int, optional): see ``.encoder.SentenceEncoder``. 19 | max_tokens (int, optional): see ``.encoder.SentenceEncoder``. 20 | stable (bool, optional): if True, the mergesort sorting algorithm will be used, 21 | otherwise quicksort will be used. Defaults to False. See ``.encoder.SentenceEncoder``. 22 | cpu (bool, optional): if True, forces the use of the CPU even if a GPU is available. Defaults to False. 23 | """ 24 | 25 | def __init__(self, 26 | encoder: Union[str, BufferedIOBase], 27 | max_sentences: Optional[int] = None, 28 | max_tokens: Optional[int] = 12000, 29 | stable: bool = False, 30 | cpu: bool = False): 31 | 32 | self.encoder = SentenceEncoder( 33 | encoder, 34 | max_sentences=max_sentences, 35 | max_tokens=max_tokens, 36 | sort_kind='mergesort' if stable else 'quicksort', 37 | cpu=cpu) 38 | 39 | def embed_bpe_sentences(self, bpe_sentences: List[str]) -> np.ndarray: 40 | """ 41 | Computes the LASER embeddings of BPE-encoded sentences. 42 | 43 | Args: 44 | bpe_sentences (List[str]): The list of BPE-encoded sentences 45 | 46 | Returns: 47 | np.ndarray: An N * 1024 NumPy array containing the embeddings, N being the number of sentences provided. 48 | """ 49 | return self.encoder.encode_sentences(bpe_sentences) 50 | -------------------------------------------------------------------------------- /laserembeddings/utils.py: -------------------------------------------------------------------------------- 1 | from io import TextIOBase, StringIO 2 | import re 3 | 4 | __all__ = ['adapt_bpe_codes', 'sre_performance_patch'] 5 | 6 | 7 | def adapt_bpe_codes(bpe_codes_f: TextIOBase) -> TextIOBase: 8 | """ 9 | Converts fastBPE codes to subword_nmt BPE codes.
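fastBPE code files carry a trailing third column on each merge line (the merge count), which subword_nmt does not expect; the conversion strips that column and keeps only the two merge tokens.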
10 | 11 | Args: 12 | bpe_codes_f (TextIOBase): the text-mode file-like object of fastBPE codes 13 | Returns: 14 | TextIOBase: subword_nmt-compatible BPE codes as a text-mode file-like object 15 | """ 16 | return StringIO( 17 | re.sub(r'^([^ ]+) ([^ ]+) ([^ ]+)$', 18 | r'\1 \2', 19 | bpe_codes_f.read(), 20 | flags=re.MULTILINE)) 21 | 22 | 23 | class sre_performance_patch: 24 | """ 25 | Patch fixing https://bugs.python.org/issue37723 for Python 3.7 (<= 3.7.4) 26 | and Python 3.8 (<= 3.8.0 beta 3) 27 | """ 28 | 29 | def __init__(self): 30 | self.sre_parse = None 31 | self.original_sre_parse_uniq = None 32 | 33 | def __enter__(self): 34 | #pylint: disable=import-outside-toplevel 35 | import sys 36 | 37 | if self.original_sre_parse_uniq is None and ( 38 | 0x03070000 <= sys.hexversion <= 0x030704f0 39 | or 0x03080000 <= sys.hexversion <= 0x030800b3): 40 | try: 41 | import sre_parse 42 | self.sre_parse = sre_parse 43 | #pylint: disable=protected-access 44 | self.original_sre_parse_uniq = sre_parse._uniq 45 | sre_parse._uniq = lambda x: list(dict.fromkeys(x)) 46 | except (ImportError, AttributeError): 47 | self.sre_parse = None 48 | self.original_sre_parse_uniq = None 49 | 50 | def __exit__(self, type_, value, traceback): 51 | if self.sre_parse and self.original_sre_parse_uniq: 52 | #pylint: disable=protected-access 53 | self.sre_parse._uniq = self.original_sre_parse_uniq 54 | self.original_sre_parse_uniq = None 55 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 2 | # [1.1.2](https://github.com/yannvgn/laserembeddings/compare/v1.1.1...v1.1.2) (2021-12-12) 3 | 4 | - A compatibility issue with subword-nmt 0.3.8 was fixed (#39) 🐛 5 | - The behavior of `Laser.embed_sentences` was unclear/misleading when the number of language codes received in the `lang` argument did not match the number of sentences to encode. It now raises an error in that case (#40) 🐛 6 | 7 | 8 | # [1.1.1](https://github.com/yannvgn/laserembeddings/compare/v1.1.0...v1.1.1) (2021-02-06) 9 | 10 | - An issue with PyTorch 1.7.0 was fixed (#32) 🐛 11 | 12 | 13 | # [1.1.0](https://github.com/yannvgn/laserembeddings/compare/v1.0.1...v1.1.0) (2020-10-04) 14 | 15 | - Japanese extra on Windows is back! 
🇯🇵 16 | 17 | 18 | # [1.0.1](https://github.com/yannvgn/laserembeddings/compare/v1.0.0...v1.0.1) (2020-03-02) 19 | 20 | - The encoder was fixed to remove an innocuous warning message that would sometimes appear when using PyTorch 1.4 🐛 21 | - Japanese extra is now disabled on Windows (sorry) to prevent installation issues and computation failures in other languages 😕 22 | 23 | 24 | # [1.0.0](https://github.com/yannvgn/laserembeddings/compare/v0.1.3...v1.0.0) (2019-12-19) 25 | 26 | - Greek, Chinese and Japanese are now supported 🇬🇷 🇨🇳 🇯🇵 27 | - Some languages that were only partially supported are now fully supported (New Norwegian, Swedish, Tatar) 🌍 28 | - It should work on Windows now 🙄 29 | - Sentences in different languages can now be processed in the same batch ⚡️ 30 | 31 | 32 | # [0.1.3](https://github.com/yannvgn/laserembeddings/compare/v0.1.2...v0.1.3) (2019-10-03) 33 | 34 | - A lot of languages that were only partially supported are now fully supported (br, bs, ceb, fr, gl, oc, ug, vi) 🌍 35 | 36 | 37 | # [0.1.2](https://github.com/yannvgn/laserembeddings/compare/v0.1.1...v0.1.2) (2019-08-24) 38 | 39 | - Korean is now fully supported ✅ 40 | - A [bug](https://bugs.python.org/issue37723) in Python 3.7 (<= 3.7.4) and 3.8 (<= 3.8.0 beta 3) affecting the tokenizer performance was patched as a temporary solution until next Python releases 🐛 41 | 42 | 43 | # 0.1.1 (2019-07-23) 44 | 45 | - Initial version 🐣 46 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | jobs: 3 | include: 4 | - name: "Python 3.8 on Xenial Linux" 5 | python: 3.8 6 | before_install: 7 | - python -m pip install --upgrade pip 8 | - pip3 install poetry==1.1.* 9 | - pip3 install torch==1.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html 10 | - name: "Python 3.6 on Xenial Linux" 11 | python: 3.6 12 | before_install: 13 | - python -m pip install --upgrade pip 14 | - pip3 install poetry==1.1.* 15 | - pip3 install torch==1.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html 16 | - name: "Python 3.7 on macOS" 17 | os: osx 18 | osx_image: xcode11.2 19 | language: shell 20 | before_install: 21 | - python3 -m pip install --upgrade pip 22 | - pip3 install poetry==1.1.* 23 | - pip3 install virtualenv 24 | - virtualenv .env 25 | - source .env/bin/activate 26 | - pip3 install torch 27 | - name: "Python 3.7 on Windows" 28 | os: windows 29 | language: shell 30 | before_install: 31 | - choco install python --version 3.7.0 32 | - python -m pip install --upgrade pip 33 | - pip3 install poetry==1.1.* 34 | - poetry config virtualenvs.create false 35 | - pip3 install torch==1.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html 36 | env: PATH=/c/Python37:/c/Python37/Scripts:$PATH 37 | - name: "Python 3.8 on Xenial Linux (wheel installation)" 38 | python: 3.8 39 | before_install: 40 | - python -m pip install --upgrade pip 41 | - pip3 install poetry==1.1.* 42 | - pip3 install torch==1.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html 43 | install: 44 | - poetry build 45 | - pip3 install dist/laserembeddings-*.whl 46 | - python -m laserembeddings download-models 47 | script: 48 | - python -c 'from laserembeddings import Laser; laser = Laser(); laser.embed_sentences(["test"], lang="en")' 49 | 50 | install: 51 | - poetry remove torch # fix: latest torch wheel (1.1.0.post2) not available for linux 52 | - poetry install -E zh -E ja 53 | - python3 -m 
laserembeddings download-models || python -m laserembeddings download-models 54 | 55 | script: 56 | - poetry run pylint laserembeddings 57 | - poetry run pytest 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | laserembeddings/data/** 2 | !laserembeddings/data/.gitkeep 3 | tests/data/** 4 | !tests/data/.gitkeep 5 | 6 | poetry.lock 7 | poetry.toml 8 | 9 | .DS_Store 10 | .vscode 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | pip-wheel-metadata/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | db.sqlite3-journal 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 
102 | #Pipfile.lock 103 | 104 | # celery beat schedule file 105 | celerybeat-schedule 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | 137 | # PyCharm files 138 | .idea/* 139 | -------------------------------------------------------------------------------- /tests/test_laser.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | import numpy as np 5 | 6 | from laserembeddings import Laser 7 | 8 | SIMILARITY_TEST = os.getenv('SIMILARITY_TEST') 9 | SKIP_ZH = os.getenv('SKIP_ZH') 10 | SKIP_JA = os.getenv('SKIP_JA') 11 | 12 | 13 | def test_laser(): 14 | with open(Laser.DEFAULT_ENCODER_FILE, 'rb') as f_encoder: 15 | laser = Laser( 16 | Laser.DEFAULT_BPE_CODES_FILE, 17 | None, 18 | f_encoder, 19 | ) 20 | assert laser.embed_sentences( 21 | ['hello world!', 'i hope the tests are passing'], 22 | lang='en').shape == (2, 1024) 23 | assert laser.embed_sentences(['hello world!', "j'aime les pâtes"], 24 | lang=['en', 'fr']).shape == (2, 1024) 25 | assert laser.embed_sentences('hello world!', 26 | lang='en').shape == (1, 1024) 27 | 28 | with pytest.raises(ValueError): 29 | laser.embed_sentences(['hello world!', "j'aime les pâtes"], 30 | lang=['en']) 31 | 32 | 33 | def test_zh(): 34 | if SKIP_ZH: 35 | pytest.skip("SKIP_ZH is set") 36 | laser = Laser() 37 | assert laser.embed_sentences(['干杯!'], lang='zh').shape == (1, 1024) 38 | 39 | 40 | def test_ja(): 41 | if SKIP_JA: 42 | pytest.skip("SKIP_JA is set") 43 | laser = Laser() 44 | assert laser.embed_sentences(['乾杯!'], lang='ja').shape == (1, 1024) 45 | 46 | 47 | def test_similarity(test_data): 48 | if not SIMILARITY_TEST: 49 | pytest.skip("SIMILARITY_TEST not set") 50 | 51 | if not test_data: 52 | raise FileNotFoundError( 53 | 'laserembeddings-test-data.npz is missing, run "python -m laserembeddings download-test-data" to fix that' 54 | ) 55 | 56 | report = os.path.join(os.path.dirname(os.path.realpath(__file__)), 57 | 'report', 'comparison-with-LASER.md') 58 | 59 | laser = Laser() 60 | 61 | with open(report, 'w', encoding='utf-8') as f_report: 62 | 63 | f_report.write( 64 | '# Comparison of the embeddings computed with original LASER with the embeddings computed with this package\n' 65 | ) 66 | f_report.write( 67 | '| |language|avg. cosine similarity|min. 
cosine similarity|\n') 68 | f_report.write( 69 | '|-|--------|----------------------|----------------------|\n') 70 | 71 | for lang in test_data['langs']: 72 | 73 | sents = test_data[f'{lang}_sentences'] 74 | orig_embeddings = test_data[f'{lang}_embeddings'] 75 | embeddings = laser.embed_sentences(sents, lang) 76 | 77 | assert embeddings.shape == orig_embeddings.shape 78 | 79 | cosine_similarities = np.sum( 80 | orig_embeddings * embeddings, 81 | axis=1) / (np.linalg.norm(orig_embeddings, axis=1) * 82 | np.linalg.norm(embeddings, axis=1)) 83 | 84 | similarity_mean = np.mean(cosine_similarities) 85 | similarity_min = np.min(cosine_similarities) 86 | 87 | f_report.write( 88 | f'|{"✅" if similarity_min > 0.99999 else "⚠️" if similarity_mean > 0.99 else "❌"}|{lang}|{similarity_mean:.5f}|{similarity_min:.5f}|\n' 89 | ) 90 | -------------------------------------------------------------------------------- /tests/report/comparison-with-LASER.md: -------------------------------------------------------------------------------- 1 | # Comparison of the embeddings computed with original LASER with the embeddings computed with this package 2 | | |language|avg. cosine similarity|min. cosine similarity| 3 | |-|--------|----------------------|----------------------| 4 | |✅|af|1.00000|1.00000| 5 | |✅|am|1.00000|1.00000| 6 | |✅|ang|1.00000|1.00000| 7 | |✅|ar|1.00000|1.00000| 8 | |✅|arq|1.00000|1.00000| 9 | |✅|arz|1.00000|1.00000| 10 | |✅|ast|1.00000|1.00000| 11 | |✅|awa|1.00000|1.00000| 12 | |✅|az|1.00000|1.00000| 13 | |✅|be|1.00000|1.00000| 14 | |✅|ber|1.00000|1.00000| 15 | |✅|bg|1.00000|1.00000| 16 | |✅|bn|1.00000|1.00000| 17 | |✅|br|1.00000|1.00000| 18 | |✅|bs|1.00000|1.00000| 19 | |✅|ca|1.00000|1.00000| 20 | |✅|cbk|1.00000|1.00000| 21 | |✅|ceb|1.00000|1.00000| 22 | |✅|ch|1.00000|1.00000| 23 | |✅|cmn|1.00000|1.00000| 24 | |✅|cs|1.00000|1.00000| 25 | |✅|csb|1.00000|1.00000| 26 | |✅|cy|1.00000|1.00000| 27 | |✅|da|1.00000|1.00000| 28 | |✅|de|1.00000|1.00000| 29 | |✅|dsb|1.00000|1.00000| 30 | |✅|dtp|1.00000|1.00000| 31 | |✅|el|1.00000|1.00000| 32 | |✅|en|1.00000|1.00000| 33 | |✅|eo|1.00000|1.00000| 34 | |✅|es|1.00000|1.00000| 35 | |✅|et|1.00000|1.00000| 36 | |✅|eu|1.00000|1.00000| 37 | |✅|fi|1.00000|1.00000| 38 | |✅|fo|1.00000|1.00000| 39 | |✅|fr|1.00000|1.00000| 40 | |⚠️|fy|0.99993|0.99319| 41 | |⚠️|ga|0.99762|0.92033| 42 | |✅|gd|1.00000|1.00000| 43 | |✅|gl|1.00000|1.00000| 44 | |✅|gsw|1.00000|1.00000| 45 | |✅|he|1.00000|1.00000| 46 | |✅|hi|1.00000|1.00000| 47 | |✅|hr|1.00000|1.00000| 48 | |✅|hsb|1.00000|1.00000| 49 | |✅|hu|1.00000|1.00000| 50 | |✅|hy|1.00000|1.00000| 51 | |✅|ia|1.00000|1.00000| 52 | |✅|id|1.00000|1.00000| 53 | |✅|ie|1.00000|1.00000| 54 | |✅|io|1.00000|1.00000| 55 | |✅|is|1.00000|1.00000| 56 | |✅|it|1.00000|1.00000| 57 | |✅|ja|1.00000|1.00000| 58 | |⚠️|jv|0.99987|0.98719| 59 | |⚠️|ka|0.99739|0.73893| 60 | |✅|kab|1.00000|1.00000| 61 | |✅|kk|1.00000|1.00000| 62 | |❌|km|0.96787|0.76779| 63 | |✅|ko|1.00000|1.00000| 64 | |✅|ku|1.00000|1.00000| 65 | |✅|kw|1.00000|1.00000| 66 | |✅|kzj|1.00000|1.00000| 67 | |⚠️|la|0.99547|0.86945| 68 | |✅|lfn|1.00000|1.00000| 69 | |✅|lt|1.00000|1.00000| 70 | |✅|lvs|1.00000|1.00000| 71 | |✅|max|1.00000|1.00000| 72 | |✅|mhr|1.00000|1.00000| 73 | |✅|mk|1.00000|1.00000| 74 | |⚠️|ml|0.99297|0.87608| 75 | |✅|mn|1.00000|1.00000| 76 | |⚠️|mr|0.99952|0.95155| 77 | |✅|nb|1.00000|1.00000| 78 | |✅|nds|1.00000|1.00000| 79 | |✅|nl|1.00000|1.00000| 80 | |✅|nn|1.00000|1.00000| 81 | |✅|nov|1.00000|1.00000| 82 | |✅|oc|1.00000|1.00000| 83 | |✅|orv|1.00000|1.00000| 84 | 
|✅|pam|1.00000|1.00000| 85 | |⚠️|pes|0.99901|0.93162| 86 | |✅|pl|1.00000|1.00000| 87 | |✅|pms|1.00000|1.00000| 88 | |✅|pt|1.00000|1.00000| 89 | |✅|ro|1.00000|1.00000| 90 | |✅|ru|1.00000|1.00000| 91 | |✅|sk|1.00000|1.00000| 92 | |✅|sl|1.00000|1.00000| 93 | |✅|sq|1.00000|1.00000| 94 | |✅|sr|1.00000|1.00000| 95 | |✅|sv|1.00000|1.00000| 96 | |✅|swg|1.00000|1.00000| 97 | |✅|swh|1.00000|1.00000| 98 | |✅|ta|1.00000|1.00000| 99 | |⚠️|te|0.99838|0.88623| 100 | |✅|th|1.00000|1.00000| 101 | |✅|tk|1.00000|1.00000| 102 | |✅|tl|1.00000|1.00000| 103 | |✅|tr|1.00000|1.00000| 104 | |✅|tt|1.00000|1.00000| 105 | |✅|tzl|1.00000|1.00000| 106 | |✅|ug|1.00000|1.00000| 107 | |✅|uk|1.00000|1.00000| 108 | |✅|ur|1.00000|1.00000| 109 | |✅|uz|1.00000|1.00000| 110 | |✅|vi|1.00000|1.00000| 111 | |✅|war|1.00000|1.00000| 112 | |✅|wuu|1.00000|1.00000| 113 | |✅|xh|1.00000|1.00000| 114 | |⚠️|yi|0.99958|0.96916| 115 | |✅|yue|1.00000|1.00000| 116 | |✅|zsm|1.00000|1.00000| 117 | -------------------------------------------------------------------------------- /laserembeddings/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import urllib.request 4 | import tarfile 5 | 6 | IS_WIN = os.name == 'nt' 7 | 8 | 9 | def non_win_string(s): 10 | return s if not IS_WIN else '' 11 | 12 | 13 | CONSOLE_CLEAR = non_win_string('\033[0;0m') 14 | CONSOLE_BOLD = non_win_string('\033[0;1m') 15 | CONSOLE_WAIT = non_win_string('⏳') 16 | CONSOLE_DONE = non_win_string('✅') 17 | CONSOLE_STARS = non_win_string('✨') 18 | CONSOLE_ERROR = non_win_string('❌') 19 | 20 | 21 | def print_usage(): 22 | print('Usage:') 23 | print('') 24 | print( 25 | f'{CONSOLE_BOLD}python -m laserembeddings download-models [OUTPUT_DIRECTORY]{CONSOLE_CLEAR}' 26 | ) 27 | print( 28 | ' Downloads LASER model files. 
If OUTPUT_DIRECTORY is omitted,' 29 | '\n' 30 | f' the models will be placed into the {CONSOLE_BOLD}data{CONSOLE_CLEAR} directory of the module' 31 | ) 32 | print('') 33 | print( 34 | f'{CONSOLE_BOLD}python -m laserembeddings download-test-data{CONSOLE_CLEAR}' 35 | ) 36 | print(' downloads data needed to run the tests') 37 | print('') 38 | 39 | 40 | def download_file(url, dest): 41 | print(f'{CONSOLE_WAIT} Downloading {url}...', end='') 42 | sys.stdout.flush() 43 | urllib.request.urlretrieve(url, dest) 44 | print(f'\r{CONSOLE_DONE} Downloaded {url} ') 45 | 46 | 47 | def extract_tar(tar, output_dir): 48 | print(f'{CONSOLE_WAIT} Extracting archive...', end='') 49 | sys.stdout.flush() 50 | with tarfile.open(tar) as t: 51 | t.extractall(output_dir) 52 | print(f'\r{CONSOLE_DONE} Extracted archive ') 53 | 54 | 55 | def download_models(output_dir): 56 | print(f'Downloading models into {output_dir}') 57 | print('') 58 | 59 | download_file('https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes', 60 | os.path.join(output_dir, '93langs.fcodes')) 61 | download_file('https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab', 62 | os.path.join(output_dir, '93langs.fvocab')) 63 | download_file( 64 | 'https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt', 65 | os.path.join(output_dir, 'bilstm.93langs.2018-12-26.pt')) 66 | 67 | print('') 68 | print(f'{CONSOLE_STARS} You\'re all set!') 69 | 70 | 71 | def download_and_extract_test_data(output_dir): 72 | print(f'Downloading test data into {output_dir}') 73 | print('') 74 | 75 | download_file( 76 | 'https://github.com/yannvgn/laserembeddings-test-data/releases/download/v1.0.2/laserembeddings-test-data.tar.gz', 77 | os.path.join(output_dir, 'laserembeddings-test-data.tar.gz')) 78 | 79 | extract_tar(os.path.join(output_dir, 'laserembeddings-test-data.tar.gz'), 80 | output_dir) 81 | 82 | print('') 83 | print(f'{CONSOLE_STARS} Ready to test all that!') 84 | 85 | 86 | def main(): 87 | if len(sys.argv) == 1: 88 | print_usage() 89 | return 90 | 91 | if any(arg == '--help' for arg in sys.argv): 92 | print_usage() 93 | return 94 | 95 | if sys.argv[1] == 'download-models': 96 | output_dir = sys.argv[2] if len(sys.argv) > 2 else os.path.join( 97 | os.path.dirname(os.path.realpath(__file__)), 'data') 98 | 99 | download_models(output_dir) 100 | 101 | elif sys.argv[1] == 'download-test-data': 102 | if len(sys.argv) > 2: 103 | print_usage() 104 | return 105 | 106 | repository_root = os.path.dirname( 107 | os.path.dirname(os.path.realpath(__file__))) 108 | 109 | if not os.path.isfile(os.path.join(repository_root, 'pyproject.toml')): 110 | print( 111 | f"{CONSOLE_ERROR} Looks like you're not running laserembeddings from its source code" 112 | ) 113 | print( 114 | " → please checkout https://github.com/yannvgn/laserembeddings.git" 115 | ) 116 | print( 117 | ' then run "python -m laserembeddings download-test-data" from the root of the repository' 118 | ) 119 | return 120 | 121 | download_and_extract_test_data( 122 | os.path.join(repository_root, 'tests', 'data')) 123 | 124 | 125 | if __name__ == "__main__": 126 | main() 127 | -------------------------------------------------------------------------------- /laserembeddings/preprocessing.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Optional 2 | from io import TextIOBase 3 | 4 | from sacremoses import MosesPunctNormalizer, MosesTokenizer 5 | from sacremoses.util import xml_unescape 6 | from subword_nmt.apply_bpe import BPE as 
subword_nmt_bpe, read_vocabulary 7 | from transliterate import translit 8 | 9 | from .utils import adapt_bpe_codes 10 | 11 | # Extras 12 | try: 13 | import jieba 14 | jieba.setLogLevel(60) 15 | except ImportError: 16 | jieba = None 17 | 18 | try: 19 | import MeCab 20 | import ipadic 21 | except ImportError: 22 | MeCab = None 23 | 24 | __all__ = ['Tokenizer', 'BPE'] 25 | 26 | ############################################################################### 27 | # 28 | # Tokenizer 29 | # 30 | ############################################################################### 31 | 32 | 33 | class Tokenizer: 34 | """ 35 | Tokenizer. 36 | 37 | Args: 38 | lang (str): the language code (ISO 639-1) of the texts to tokenize 39 | lower_case (bool, optional): if True, the texts are lower-cased before being tokenized. 40 | Defaults to True. 41 | romanize (bool or None, optional): if True, the texts are romanized. 42 | Defaults to None (romanization enabled based on input language). 43 | descape (bool, optional): if True, the XML-escaped symbols get de-escaped. 44 | Defaults to False. 45 | """ 46 | 47 | def __init__(self, 48 | lang: str = 'en', 49 | lower_case: bool = True, 50 | romanize: Optional[bool] = None, 51 | descape: bool = False): 52 | assert lower_case, 'lower case is needed by all the models' 53 | 54 | if lang in ('cmn', 'wuu', 'yue'): 55 | lang = 'zh' 56 | if lang == 'jpn': 57 | lang = 'ja' 58 | 59 | if lang == 'zh' and jieba is None: 60 | raise ModuleNotFoundError( 61 | '''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"''' 62 | ) 63 | if lang == 'ja' and MeCab is None: 64 | raise ModuleNotFoundError( 65 | '''No module named 'MeCab'. Install laserembeddings with 'ja' extra to fix that: "pip install laserembeddings[ja]"''' 66 | ) 67 | 68 | self.lang = lang 69 | self.lower_case = lower_case 70 | self.romanize = romanize if romanize is not None else lang == 'el' 71 | self.descape = descape 72 | 73 | self.normalizer = MosesPunctNormalizer(lang=lang) 74 | self.tokenizer = MosesTokenizer(lang=lang) 75 | self.mecab_tokenizer = MeCab.Tagger( 76 | f"{ipadic.MECAB_ARGS} -Owakati -b 50000") if lang == 'ja' else None 77 | 78 | def tokenize(self, text: str) -> str: 79 | """Tokenizes a text and returns the tokens as a string""" 80 | 81 | # REM_NON_PRINT_CHAR 82 | # not implemented 83 | 84 | # NORM_PUNC 85 | text = self.normalizer.normalize(text) 86 | 87 | # DESCAPE 88 | if self.descape: 89 | text = xml_unescape(text) 90 | 91 | # MOSES_TOKENIZER 92 | # see: https://github.com/facebookresearch/LASER/issues/55#issuecomment-480881573 93 | text = self.tokenizer.tokenize(text, 94 | return_str=True, 95 | escape=False, 96 | aggressive_dash_splits=False) 97 | 98 | # jieba 99 | if self.lang == 'zh': 100 | text = ' '.join(jieba.cut(text.rstrip('\r\n'))) 101 | 102 | # MECAB 103 | if self.lang == 'ja': 104 | text = self.mecab_tokenizer.parse(text).rstrip('\r\n') 105 | 106 | # ROMAN_LC 107 | if self.romanize: 108 | text = translit(text, self.lang, reversed=True) 109 | 110 | if self.lower_case: 111 | text = text.lower() 112 | 113 | return text 114 | 115 | 116 | ############################################################################### 117 | # 118 | # Apply BPE 119 | # 120 | ############################################################################### 121 | 122 | 123 | class BPE: 124 | """ 125 | BPE encoder. 126 | 127 | Args: 128 | bpe_codes (str or TextIOBase): the path to LASER's BPE codes (``93langs.fcodes``), 129 | or a text-mode file object.
130 | bpe_vocab (str or TextIOBase): the path to LASER's BPE vocabulary (``93langs.fvocab``), 131 | or a text-mode file object. 132 | """ 133 | 134 | def __init__(self, bpe_codes: Union[str, TextIOBase], 135 | bpe_vocab: Union[str, TextIOBase]): 136 | 137 | f_bpe_codes = None 138 | f_bpe_vocab = None 139 | 140 | try: 141 | if isinstance(bpe_codes, str): 142 | f_bpe_codes = open(bpe_codes, 'r', encoding='utf-8') # pylint: disable=consider-using-with 143 | if isinstance(bpe_vocab, str): 144 | f_bpe_vocab = open(bpe_vocab, 'r', encoding='utf-8') # pylint: disable=consider-using-with 145 | 146 | self.bpe = subword_nmt_bpe(codes=adapt_bpe_codes(f_bpe_codes 147 | or bpe_codes), 148 | vocab=read_vocabulary(f_bpe_vocab 149 | or bpe_vocab, 150 | threshold=None)) 151 | self.bpe.version = (0, 2) 152 | 153 | finally: 154 | if f_bpe_codes: 155 | f_bpe_codes.close() 156 | if f_bpe_vocab: 157 | f_bpe_vocab.close() 158 | 159 | def encode_tokens(self, sentence_tokens: str) -> str: 160 | """Returns the BPE-encoded sentence from a tokenized sentence""" 161 | return self.bpe.process_line(sentence_tokens) 162 | -------------------------------------------------------------------------------- /laserembeddings/laser.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, Union, List, Optional 2 | from io import TextIOBase, BufferedIOBase 3 | import os 4 | 5 | import numpy as np 6 | 7 | from .preprocessing import Tokenizer, BPE 8 | from .embedding import BPESentenceEmbedding 9 | from .utils import sre_performance_patch 10 | 11 | __all__ = ['Laser'] 12 | 13 | 14 | class Laser: 15 | """ 16 | End-to-end LASER embedding. 17 | 18 | The pipeline is: ``Tokenizer.tokenize`` -> ``BPE.encode_tokens`` -> ``BPESentenceEmbedding.embed_bpe_sentences`` 19 | 20 | Args: 21 | bpe_codes (str or TextIOBase, optional): the path to LASER's BPE codes (``93langs.fcodes``), 22 | or a text-mode file object. If omitted, ``Laser.DEFAULT_BPE_CODES_FILE`` is used. 23 | bpe_vocab (str or TextIOBase, optional): the path to LASER's BPE vocabulary (``93langs.fvocab``), 24 | or a text-mode file object. If omitted, ``Laser.DEFAULT_BPE_VOCAB_FILE`` is used. 25 | encoder (str or BufferedIOBase, optional): the path to LASER's encoder PyTorch model (``bilstm.93langs.2018-12-26.pt``), 26 | or a binary-mode file object. If omitted, ``Laser.DEFAULT_ENCODER_FILE`` is used. 27 | tokenizer_options (Dict[str, Any], optional): additional arguments to pass to the tokenizer. 28 | See ``.preprocessing.Tokenizer``. 29 | embedding_options (Dict[str, Any], optional): additional arguments to pass to the embedding layer. 30 | See ``.embedding.BPESentenceEmbedding``. 31 | 32 | Class attributes: 33 | DATA_DIR (str): the path to the directory of default LASER files. 34 | DEFAULT_BPE_CODES_FILE: the path to default BPE codes file. 35 | DEFAULT_BPE_VOCAB_FILE: the path to default BPE vocabulary file. 36 | DEFAULT_ENCODER_FILE: the path to default LASER encoder PyTorch model file.
37 | """ 38 | 39 | DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), 40 | 'data') 41 | DEFAULT_BPE_CODES_FILE = os.path.join(DATA_DIR, '93langs.fcodes') 42 | DEFAULT_BPE_VOCAB_FILE = os.path.join(DATA_DIR, '93langs.fvocab') 43 | DEFAULT_ENCODER_FILE = os.path.join(DATA_DIR, 44 | 'bilstm.93langs.2018-12-26.pt') 45 | 46 | def __init__(self, 47 | bpe_codes: Optional[Union[str, TextIOBase]] = None, 48 | bpe_vocab: Optional[Union[str, TextIOBase]] = None, 49 | encoder: Optional[Union[str, BufferedIOBase]] = None, 50 | tokenizer_options: Optional[Dict[str, Any]] = None, 51 | embedding_options: Optional[Dict[str, Any]] = None): 52 | 53 | if tokenizer_options is None: 54 | tokenizer_options = {} 55 | if embedding_options is None: 56 | embedding_options = {} 57 | 58 | if bpe_codes is None: 59 | if not os.path.isfile(self.DEFAULT_BPE_CODES_FILE): 60 | raise FileNotFoundError( 61 | '93langs.fcodes is missing, run "python -m laserembeddings download-models" to fix that' 62 | ) 63 | bpe_codes = self.DEFAULT_BPE_CODES_FILE 64 | if bpe_vocab is None: 65 | if not os.path.isfile(self.DEFAULT_BPE_VOCAB_FILE): 66 | raise FileNotFoundError( 67 | '93langs.fvocab is missing, run "python -m laserembeddings download-models" to fix that' 68 | ) 69 | bpe_vocab = self.DEFAULT_BPE_VOCAB_FILE 70 | if encoder is None: 71 | if not os.path.isfile(self.DEFAULT_ENCODER_FILE): 72 | raise FileNotFoundError( 73 | 'bilstm.93langs.2018-12-26.pt is missing, run "python -m laserembeddings download-models" to fix that' 74 | ) 75 | encoder = self.DEFAULT_ENCODER_FILE 76 | 77 | self.tokenizer_options = tokenizer_options 78 | self.tokenizers: Dict[str, Tokenizer] = {} 79 | 80 | self.bpe = BPE(bpe_codes, bpe_vocab) 81 | self.bpeSentenceEmbedding = BPESentenceEmbedding( 82 | encoder, **embedding_options) 83 | 84 | def _get_tokenizer(self, lang: str) -> Tokenizer: 85 | """Returns the Tokenizer instance for the specified language. The returned tokenizers are cached.""" 86 | 87 | if lang not in self.tokenizers: 88 | self.tokenizers[lang] = Tokenizer(lang, **self.tokenizer_options) 89 | 90 | return self.tokenizers[lang] 91 | 92 | def embed_sentences(self, sentences: Union[List[str], str], 93 | lang: Union[str, List[str]]) -> np.ndarray: 94 | """ 95 | Computes the LASER embeddings of provided sentences using the tokenizer for the specified language. 96 | 97 | Args: 98 | sentences (str or List[str]): the sentences to compute the embeddings from. 99 | lang (str or List[str]): the language code(s) (ISO 639-1) used to tokenize the sentences 100 | (either as a string - same code for every sentence - or as a list of strings - one code per sentence). 101 | 102 | Returns: 103 | np.ndarray: An N * 1024 NumPy array containing the embeddings, N being the number of sentences provided.
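Example (illustrative usage, assuming the default models have been downloaded):
    >>> laser = Laser()
    >>> laser.embed_sentences(['hello world!', "j'aime les pâtes"], lang=['en', 'fr']).shape
    (2, 1024)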
104 | """ 105 | sentences = [sentences] if isinstance(sentences, str) else sentences 106 | lang = [lang] * len(sentences) if isinstance(lang, str) else lang 107 | 108 | if len(sentences) != len(lang): 109 | raise ValueError( 110 | 'lang: invalid length: the number of language codes does not match the number of sentences' 111 | ) 112 | 113 | with sre_performance_patch(): # see https://bugs.python.org/issue37723 114 | sentence_tokens = [ 115 | self._get_tokenizer(sentence_lang).tokenize(sentence) 116 | for sentence, sentence_lang in zip(sentences, lang) 117 | ] 118 | bpe_encoded = [ 119 | self.bpe.encode_tokens(tokens) for tokens in sentence_tokens 120 | ] 121 | 122 | return self.bpeSentenceEmbedding.embed_bpe_sentences(bpe_encoded) 123 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LASER embeddings 2 | 3 | [![GitHub Workflow Status](https://img.shields.io/github/workflow/status/yannvgn/laserembeddings/Python%20package?style=flat-square)](https://github.com/yannvgn/laserembeddings/actions) 4 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/laserembeddings?style=flat-square) 5 | [![PyPI](https://img.shields.io/pypi/v/laserembeddings.svg?style=flat-square)](https://pypi.org/project/laserembeddings/) 6 | [![PyPI - License](https://img.shields.io/pypi/l/laserembeddings.svg?style=flat-square)](https://github.com/yannvgn/laserembeddings/blob/master/LICENSE) 7 | 8 | **Out-of-the-box multilingual sentence embeddings.** 9 | 10 | ![LASER embeddings map similar sentences in any language to similar language-agnostic embeddings](laserembeddings.gif) 11 | 12 | _laserembeddings_ is a pip-packaged, production-ready port of Facebook Research's [LASER](https://github.com/facebookresearch/LASER) (Language-Agnostic SEntence Representations) to compute multilingual sentence embeddings. 13 | 14 | ✨ **Version 1.1.2 is here! What's new?** 15 | - A compatibility issue with subword-nmt 0.3.8 was fixed (#39) 🐛 16 | - The behavior of `Laser.embed_sentences` was unclear/misleading when the number of language codes received in the `lang` argument did not match the number of sentences to encode. It now raises an error in that case 🐛 17 | 18 | ## Context 19 | 20 | [LASER](https://github.com/facebookresearch/LASER) is a collection of scripts and models created by Facebook Research to compute **multilingual sentence embeddings** for zero-shot cross-lingual transfer. 21 | 22 | What does it mean? LASER is able to transform sentences into **language-independent vectors**. Similar sentences get mapped to close vectors (in terms of cosine distance), regardless of the input language. 23 | 24 | That is great, especially if you don't have training sets for the language(s) you want to process: you can build a classifier on top of LASER embeddings, train it on whatever language(s) you have in your training data, and let it classify texts in any language. 25 | 26 | **The aim of the package is to make LASER as easy-to-use and easy-to-deploy as possible: zero-config, production-ready, etc., just a two-liner to install.** 27 | 28 | 👉 👉 👉 For detailed information, have a look at the amazing [LASER repository](https://github.com/facebookresearch/LASER), read its [presentation article](https://code.fb.com/ai-research/laser-multilingual-sentence-embeddings/) and its [research paper](https://arxiv.org/abs/1812.10464).
👈 👈 👈 29 | 30 | ## Getting started 31 | 32 | ### Prerequisites 33 | 34 | You'll need Python 3.6+ and PyTorch. Please refer to [PyTorch installation instructions](https://pytorch.org/get-started/locally/). 35 | 36 | ### Installation 37 | 38 | ``` 39 | pip install laserembeddings 40 | ``` 41 | 42 | #### Chinese language 43 | 44 | Chinese is not supported by default. If you need to embed Chinese sentences, please install laserembeddings with the "zh" extra. This extra includes [jieba](https://github.com/fxsjy/jieba). 45 | 46 | ``` 47 | pip install laserembeddings[zh] 48 | ``` 49 | 50 | #### Japanese language 51 | 52 | Japanese is not supported by default. If you need to embed Japanese sentences, please install laserembeddings with the "ja" extra. This extra includes [mecab-python3](https://github.com/SamuraiT/mecab-python3) and the [ipadic](https://github.com/polm/ipadic-py) dictionary, which is used in the original LASER project. 53 | 54 | If you have issues running laserembeddings on Japanese sentences, please refer to [mecab-python3 documentation](https://github.com/SamuraiT/mecab-python3) for troubleshooting. 55 | 56 | ``` 57 | pip install laserembeddings[ja] 58 | ``` 59 | 60 | 61 | ### Downloading the pre-trained models 62 | 63 | ``` 64 | python -m laserembeddings download-models 65 | ``` 66 | 67 | This will download the models to the default `data` directory next to the source code of the package. Use `python -m laserembeddings download-models path/to/model/directory` to download the models to a specific location. 68 | 69 | ### Usage 70 | 71 | ```python 72 | from laserembeddings import Laser 73 | 74 | laser = Laser() 75 | 76 | # if all sentences are in the same language: 77 | 78 | embeddings = laser.embed_sentences( 79 | ['let your neural network be polyglot', 80 | 'use multilingual embeddings!'], 81 | lang='en') # lang is only used for tokenization 82 | 83 | # embeddings is a N*1024 (N = number of sentences) NumPy array 84 | ``` 85 | 86 | If the sentences are not in the same language, you can pass a list of language codes: 87 | ```python 88 | embeddings = laser.embed_sentences( 89 | ['I love pasta.', 90 | "J'adore les pâtes.", 91 | 'Ich liebe Pasta.'], 92 | lang=['en', 'fr', 'de']) 93 | ``` 94 | 95 | If you downloaded the models into a specific directory: 96 | 97 | ```python 98 | from laserembeddings import Laser 99 | 100 | path_to_bpe_codes = ... 101 | path_to_bpe_vocab = ... 102 | path_to_encoder = ... 103 | 104 | laser = Laser(path_to_bpe_codes, path_to_bpe_vocab, path_to_encoder) 105 | 106 | # you can also supply file objects instead of file paths 107 | ``` 108 | 109 | If you want to pull the models from S3: 110 | 111 | ```python 112 | from io import BytesIO, StringIO 113 | from laserembeddings import Laser 114 | import boto3 115 | 116 | s3 = boto3.resource('s3') 117 | MODELS_BUCKET = ... 118 | 119 | f_bpe_codes = StringIO(s3.Object(MODELS_BUCKET, 'path_to_bpe_codes.fcodes').get()['Body'].read().decode('utf-8')) 120 | f_bpe_vocab = StringIO(s3.Object(MODELS_BUCKET, 'path_to_bpe_vocabulary.fvocab').get()['Body'].read().decode('utf-8')) 121 | f_encoder = BytesIO(s3.Object(MODELS_BUCKET, 'path_to_encoder.pt').get()['Body'].read()) 122 | 123 | laser = Laser(f_bpe_codes, f_bpe_vocab, f_encoder) 124 | ``` 125 | 126 | ## What are the differences with the original implementation? 127 | 128 | Some dependencies of the original project have been replaced with pure-python dependencies, to make this package easy to install and deploy. 
129 | 130 | Here's a summary of the differences: 131 | 132 | | Part of the pipeline | LASER dependency (original project) | laserembeddings dependency (this package) | Reason | 133 | |----------------------|-------------------------------------|----------------------------------------|--------| 134 | | Normalization / tokenization | [Moses](https://github.com/moses-smt/mosesdecoder) | [Sacremoses](https://github.com/alvations/sacremoses) 0.0.35, which seems to be the closest version to the Moses version used to train the model | Moses is implemented in Perl | 135 | | BPE encoding | [fastBPE](https://github.com/glample/fastBPE) | [subword-nmt](https://github.com/rsennrich/subword-nmt) | fastBPE cannot be installed via pip and requires compiling C++ code | 136 | | Japanese segmentation (optional) | [MeCab](https://github.com/taku910/mecab) / [JapaneseTokenizer](https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers) | [mecab-python3](https://github.com/SamuraiT/mecab-python3) and [ipadic](https://github.com/polm/ipadic-py) dictionary | mecab-python3 comes with wheels for major platforms (no compilation needed) | 137 | 138 | ## Will I get the exact same embeddings? 139 | 140 | **For most languages, in most of the cases, yes.** 141 | 142 | Some slight (and not so slight 🙄) differences exist for some languages due to differences in the implementation of the Tokenizer. 143 | 144 | **[An exhaustive comparison of the embeddings generated with LASER and laserembeddings](tests/report/comparison-with-LASER.md) is automatically generated and will be updated for each new release.** 145 | 146 | ## FAQ 147 | 148 | **How can I train the encoder?** 149 | 150 | You can't. LASER models are pre-trained and do not need to be fine-tuned. The embeddings are generic and perform well without fine-tuning. See https://github.com/facebookresearch/LASER/issues/3#issuecomment-404175463. 151 | 152 | ## Credits 153 | 154 | Thanks a lot to the creators of [LASER](https://github.com/facebookresearch/LASER) for open-sourcing the code of LASER and releasing the pre-trained models. All the kudos should go to them 👏. 155 | 156 | A big thanks to the creators of [Sacremoses](https://github.com/alvations/sacremoses) and [Subword Neural Machine Translation](https://github.com/rsennrich/subword-nmt/) for their great packages. 157 | 158 | ## Testing 159 | 160 | The first thing you'll need is [Poetry](https://github.com/sdispater/poetry). Please refer to the [installation guidelines](https://poetry.eustace.io/docs/#installation). 161 | 162 | Clone this repository and install the project: 163 | ``` 164 | poetry install -E zh -E ja 165 | ``` 166 | 167 | To run the tests: 168 | ``` 169 | poetry run pytest 170 | ``` 171 | 172 | ### Testing the similarity between the embeddings computed with LASER and laserembeddings 173 | 174 | First, install the project with the extra dependencies (Chinese and Japanese support): 175 | ``` 176 | poetry install -E zh -E ja 177 | ``` 178 | 179 | Then, download the test data: 180 | ``` 181 | poetry run python -m laserembeddings download-test-data 182 | ``` 183 | 184 | 👉 If you want to know more about the contents and the generation of the test data, check out the [laserembeddings-test-data](https://github.com/yannvgn/laserembeddings-test-data) repository. 185 | 186 | Then, run the test with `SIMILARITY_TEST` env. variable set to `1`. 187 | 188 | ``` 189 | SIMILARITY_TEST=1 poetry run pytest tests/test_laser.py 190 | ``` 191 | 192 | Now, have a coffee ☕️ and wait for the test to finish. 
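Each row of the report is a plain cosine similarity between the embeddings computed with the original LASER and those computed with this package. Here is a minimal sketch of that computation, mirroring what `tests/test_laser.py` does (the helper name is ours, for illustration):

```python
import numpy as np

def cosine_similarities(orig_embeddings: np.ndarray, embeddings: np.ndarray) -> np.ndarray:
    # row-wise cosine similarity between two N x 1024 embedding matrices
    return np.sum(orig_embeddings * embeddings, axis=1) / (
        np.linalg.norm(orig_embeddings, axis=1) *
        np.linalg.norm(embeddings, axis=1))
```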
193 | 194 | The similarity report will be generated here: [tests/report/comparison-with-LASER.md](tests/report/comparison-with-LASER.md). 195 | -------------------------------------------------------------------------------- /laserembeddings/encoder.py: -------------------------------------------------------------------------------- 1 | # The code contained in this file was copied/pasted from LASER's source code (source/embed.py) 2 | # and nearly kept untouched besides: 3 | # - code formatting 4 | # - buffered_arange: fix to avoid unnecessary warning on PyTorch >= 1.4.0 5 | 6 | # pylint: disable=redefined-builtin, consider-using-enumerate, arguments-differ, fixme, abstract-method, consider-using-from-import 7 | 8 | from collections import namedtuple 9 | 10 | import re 11 | import numpy as np 12 | 13 | import torch 14 | import torch.nn as nn 15 | 16 | __all__ = ['SentenceEncoder', 'Encoder'] 17 | 18 | SPACE_NORMALIZER = re.compile(r'\s+') 19 | Batch = namedtuple('Batch', 'srcs tokens lengths') 20 | 21 | 22 | def buffered_arange(max): 23 | if not hasattr(buffered_arange, 24 | 'buf') or max > buffered_arange.buf.numel(): 25 | buffered_arange.buf = torch.LongTensor() 26 | torch.arange(max, out=buffered_arange.buf) 27 | return buffered_arange.buf[:max] 28 | 29 | 30 | # TODO Do proper padding from the beginning 31 | def convert_padding_direction(src_tokens, 32 | padding_idx, 33 | right_to_left=False, 34 | left_to_right=False): 35 | assert right_to_left ^ left_to_right 36 | pad_mask = src_tokens.eq(padding_idx) 37 | if not pad_mask.any(): 38 | # no padding, return early 39 | return src_tokens 40 | if left_to_right and not pad_mask[:, 0].any(): 41 | # already right padded 42 | return src_tokens 43 | if right_to_left and not pad_mask[:, -1].any(): 44 | # already left padded 45 | return src_tokens 46 | max_len = src_tokens.size(1) 47 | range = buffered_arange(max_len).type_as(src_tokens).expand_as(src_tokens) 48 | num_pads = pad_mask.long().sum(dim=1, keepdim=True) 49 | if right_to_left: 50 | index = torch.remainder(range - num_pads, max_len) 51 | else: 52 | index = torch.remainder(range + num_pads, max_len) 53 | return src_tokens.gather(1, index) 54 | 55 | 56 | class SentenceEncoder: 57 | def __init__(self, 58 | model_path, 59 | max_sentences=None, 60 | max_tokens=None, 61 | cpu=False, 62 | fp16=False, 63 | sort_kind='quicksort'): 64 | self.use_cuda = torch.cuda.is_available() and not cpu 65 | self.max_sentences = max_sentences 66 | self.max_tokens = max_tokens 67 | if self.max_tokens is None and self.max_sentences is None: 68 | self.max_sentences = 1 69 | 70 | state_dict = torch.load(model_path) 71 | self.encoder = Encoder(**state_dict['params']) 72 | self.encoder.load_state_dict(state_dict['model']) 73 | self.dictionary = state_dict['dictionary'] 74 | self.pad_index = self.dictionary[''] 75 | self.eos_index = self.dictionary[''] 76 | self.unk_index = self.dictionary[''] 77 | if fp16: 78 | self.encoder.half() 79 | if self.use_cuda: 80 | self.encoder.cuda() 81 | self.sort_kind = sort_kind 82 | 83 | def _process_batch(self, batch): 84 | tokens = batch.tokens 85 | lengths = batch.lengths 86 | if self.use_cuda: 87 | tokens = tokens.cuda() 88 | lengths = lengths.cuda() 89 | self.encoder.eval() 90 | embeddings = self.encoder(tokens, lengths)['sentemb'] 91 | return embeddings.detach().cpu().numpy() 92 | 93 | def _tokenize(self, line): 94 | tokens = SPACE_NORMALIZER.sub(" ", line).strip().split() 95 | ntokens = len(tokens) 96 | ids = torch.LongTensor(ntokens + 1) 97 | for i, token in enumerate(tokens): 98 
| ids[i] = self.dictionary.get(token, self.unk_index) 99 | ids[ntokens] = self.eos_index 100 | return ids 101 | 102 | def _make_batches(self, lines): 103 | tokens = [self._tokenize(line) for line in lines] 104 | lengths = np.array([t.numel() for t in tokens]) 105 | indices = np.argsort(-lengths, kind=self.sort_kind) # pylint: disable=invalid-unary-operand-type 106 | 107 | def batch(tokens, lengths, indices): 108 | toks = tokens[0].new_full((len(tokens), tokens[0].shape[0]), 109 | self.pad_index) 110 | for i in range(len(tokens)): 111 | toks[i, -tokens[i].shape[0]:] = tokens[i] 112 | return Batch(srcs=None, 113 | tokens=toks, 114 | lengths=torch.LongTensor(lengths)), indices 115 | 116 | batch_tokens, batch_lengths, batch_indices = [], [], [] 117 | ntokens = nsentences = 0 118 | for i in indices: 119 | if nsentences > 0 and ((self.max_tokens is not None 120 | and ntokens + lengths[i] > self.max_tokens) 121 | or (self.max_sentences is not None 122 | and nsentences == self.max_sentences)): 123 | yield batch(batch_tokens, batch_lengths, batch_indices) 124 | ntokens = nsentences = 0 125 | batch_tokens, batch_lengths, batch_indices = [], [], [] 126 | batch_tokens.append(tokens[i]) 127 | batch_lengths.append(lengths[i]) 128 | batch_indices.append(i) 129 | ntokens += tokens[i].shape[0] 130 | nsentences += 1 131 | if nsentences > 0: 132 | yield batch(batch_tokens, batch_lengths, batch_indices) 133 | 134 | def encode_sentences(self, sentences): 135 | indices = [] 136 | results = [] 137 | for batch, batch_indices in self._make_batches(sentences): 138 | indices.extend(batch_indices) 139 | results.append(self._process_batch(batch)) 140 | return np.vstack(results)[np.argsort(indices, kind=self.sort_kind)] 141 | 142 | 143 | class Encoder(nn.Module): 144 | def __init__(self, 145 | num_embeddings, 146 | padding_idx, 147 | embed_dim=320, 148 | hidden_size=512, 149 | num_layers=1, 150 | bidirectional=False, 151 | left_pad=True, 152 | padding_value=0.): 153 | super().__init__() 154 | 155 | self.num_layers = num_layers 156 | self.bidirectional = bidirectional 157 | self.hidden_size = hidden_size 158 | 159 | self.padding_idx = padding_idx 160 | self.embed_tokens = nn.Embedding(num_embeddings, 161 | embed_dim, 162 | padding_idx=self.padding_idx) 163 | 164 | self.lstm = nn.LSTM( 165 | input_size=embed_dim, 166 | hidden_size=hidden_size, 167 | num_layers=num_layers, 168 | bidirectional=bidirectional, 169 | ) 170 | self.left_pad = left_pad 171 | self.padding_value = padding_value 172 | 173 | self.output_units = hidden_size 174 | if bidirectional: 175 | self.output_units *= 2 176 | 177 | def forward(self, src_tokens, src_lengths): 178 | if self.left_pad: 179 | # convert left-padding to right-padding 180 | src_tokens = convert_padding_direction( 181 | src_tokens, 182 | self.padding_idx, 183 | left_to_right=True, 184 | ) 185 | 186 | bsz, seqlen = src_tokens.size() 187 | 188 | # embed tokens 189 | x = self.embed_tokens(src_tokens) 190 | 191 | # B x T x C -> T x B x C 192 | x = x.transpose(0, 1) 193 | 194 | # pack embedded source tokens into a PackedSequence 195 | packed_x = nn.utils.rnn.pack_padded_sequence(x, src_lengths.data.cpu()) 196 | 197 | # apply LSTM 198 | if self.bidirectional: 199 | state_size = 2 * self.num_layers, bsz, self.hidden_size 200 | else: 201 | state_size = self.num_layers, bsz, self.hidden_size 202 | h0 = x.data.new(*state_size).zero_() 203 | c0 = x.data.new(*state_size).zero_() 204 | packed_outs, (final_hiddens, 205 | final_cells) = self.lstm(packed_x, (h0, c0)) 206 | 207 | # unpack outputs and 
apply dropout 208 | x, _ = nn.utils.rnn.pad_packed_sequence( 209 | packed_outs, padding_value=self.padding_value) 210 | assert list(x.size()) == [seqlen, bsz, self.output_units] 211 | 212 | if self.bidirectional: 213 | 214 | def combine_bidir(outs): 215 | return torch.cat([ 216 | torch.cat([outs[2 * i], outs[2 * i + 1]], dim=0).view( 217 | 1, bsz, self.output_units) 218 | for i in range(self.num_layers) 219 | ], 220 | dim=0) 221 | 222 | final_hiddens = combine_bidir(final_hiddens) 223 | final_cells = combine_bidir(final_cells) 224 | 225 | encoder_padding_mask = src_tokens.eq(self.padding_idx).t() 226 | 227 | # Set padded outputs to -inf so they are not selected by max-pooling 228 | padding_mask = src_tokens.eq(self.padding_idx).t().unsqueeze(-1) 229 | if padding_mask.any(): 230 | x = x.float().masked_fill_(padding_mask, float('-inf')).type_as(x) 231 | 232 | # Build the sentence embedding by max-pooling over the encoder outputs 233 | sentemb = x.max(dim=0)[0] 234 | 235 | return { 236 | 'sentemb': 237 | sentemb, 238 | 'encoder_out': (x, final_hiddens, final_cells), 239 | 'encoder_padding_mask': 240 | encoder_padding_mask if encoder_padding_mask.any() else None 241 | } 242 | --------------------------------------------------------------------------------