├── tests
│   ├── __init__.py
│   ├── data
│   │   └── .gitkeep
│   ├── conftest.py
│   ├── test_embedding.py
│   ├── test_preprocessing.py
│   ├── test_utils.py
│   ├── test_laser.py
│   └── report
│       └── comparison-with-LASER.md
├── laserembeddings
│   ├── data
│   │   └── .gitkeep
│   ├── __init__.py
│   ├── embedding.py
│   ├── utils.py
│   ├── __main__.py
│   ├── preprocessing.py
│   ├── laser.py
│   └── encoder.py
├── .style.yapf
├── laserembeddings.gif
├── pylintrc
├── pyproject.toml
├── LICENSE
├── .github
│   └── workflows
│       └── python-package.yml
├── CHANGELOG.md
├── .travis.yml
├── .gitignore
└── README.md
/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /laserembeddings/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style=pep8 3 | -------------------------------------------------------------------------------- /laserembeddings.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yannvgn/laserembeddings/HEAD/laserembeddings.gif -------------------------------------------------------------------------------- /laserembeddings/__init__.py: -------------------------------------------------------------------------------- 1 | from .laser import Laser 2 | 3 | __version__ = '1.1.2' 4 | 5 | __all__ = ['Laser'] 6 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | disable= 3 | line-too-long, 4 | trailing-whitespace, 5 | missing-docstring, 6 | too-many-locals, 7 | too-many-instance-attributes, 8 | invalid-name, 9 | too-few-public-methods, 10 | too-many-arguments, 11 | 12 | [TYPECHECK] 13 | ignored-modules=numpy,torch 14 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def test_data(): 8 | import numpy as np 9 | test_data_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 10 | 'data', 'laserembeddings-test-data.npz') 11 | 12 | return np.load(test_data_file) if os.path.isfile(test_data_file) else None 13 | -------------------------------------------------------------------------------- /tests/test_embedding.py: -------------------------------------------------------------------------------- 1 | from laserembeddings import Laser 2 | from laserembeddings.embedding import BPESentenceEmbedding 3 | 4 | 5 | def test_bpe_sentence_embedding(): 6 | assert BPESentenceEmbedding( 7 | Laser.DEFAULT_ENCODER_FILE).embed_bpe_sentences(['hello', 'world' 8 | ]).shape == (2, 1024) 9 | 10 | with open(Laser.DEFAULT_ENCODER_FILE, 'rb') as encoder_f: 11 | assert BPESentenceEmbedding(encoder_f).embed_bpe_sentences( 12 | ['hello', 'world']).shape == (2, 1024) 13 | -------------------------------------------------------------------------------- /pyproject.toml:
-------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "laserembeddings" 3 | version = "1.1.2" 4 | description = "Production-ready LASER multilingual embeddings" 5 | authors = ["yannvgn "] 6 | license = "BSD-3-Clause" 7 | homepage = "https://github.com/yannvgn/laserembeddings" 8 | repository = "https://github.com/yannvgn/laserembeddings" 9 | readme = "README.md" 10 | 11 | [tool.poetry.dependencies] 12 | python = "^3.6.2" 13 | torch = "^1.0.1.post2" 14 | subword-nmt = "^0.3.6" 15 | numpy = "^1.15.4" 16 | sacremoses = "0.0.35" 17 | transliterate = "1.10.2" 18 | mecab-python3 = { version = "^1.0.1", optional = true } 19 | ipadic = { version = "1.0.0", optional = true } 20 | jieba = { version = "^0.42.1", optional = true } 21 | 22 | [tool.poetry.dev-dependencies] 23 | pytest = "^4.6" 24 | yapf = "^0.27.0" 25 | pylint = "^2.3" 26 | 27 | [tool.poetry.extras] 28 | zh = ["jieba"] 29 | ja = ["mecab-python3", "ipadic"] 30 | 31 | [build-system] 32 | requires = ["poetry_core>=1.0.0"] 33 | build-backend = "poetry.core.masonry.api" 34 | -------------------------------------------------------------------------------- /tests/test_preprocessing.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from laserembeddings import Laser 4 | from laserembeddings.preprocessing import Tokenizer, BPE 5 | 6 | from laserembeddings.utils import sre_performance_patch 7 | 8 | 9 | def test_tokenizer(): 10 | with sre_performance_patch(): 11 | assert Tokenizer('en').tokenize("Let's do it!") == "let 's do it !" 12 | 13 | assert Tokenizer( 14 | 'en', descape=True).tokenize("Let's do it & pass that test!" 15 | ) == "let 's do it & pass that test !" 16 | 17 | with pytest.raises(AssertionError): 18 | Tokenizer(lower_case=False) 19 | 20 | assert not Tokenizer('en').romanize 21 | assert Tokenizer('el').romanize 22 | 23 | 24 | def test_bpe(): 25 | with open(Laser.DEFAULT_BPE_VOCAB_FILE, 'r', encoding='utf-8') as f_vocab: 26 | bpe = BPE(Laser.DEFAULT_BPE_CODES_FILE, f_vocab) 27 | assert bpe.encode_tokens( 28 | "the tests are passing") == 'the test@@ s are passing' 29 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | 3 | from laserembeddings.utils import adapt_bpe_codes, sre_performance_patch 4 | 5 | 6 | def test_bpe_codes_adapter(): 7 | test_f = StringIO( 8 | '#version:2.0\ne n 52708119\ne r 51024442\ne n 47209692') 9 | 10 | adapted = adapt_bpe_codes(test_f) 11 | 12 | assert adapted.readline() == '#version:2.0\n' 13 | assert adapted.readline() == 'e n\n' 14 | assert adapted.readline() == 'e r\n' 15 | 16 | for line in adapted: 17 | assert line == 'e n' 18 | 19 | adapted.seek(0) 20 | 21 | for line in adapted: 22 | assert line == '#version:2.0\n' 23 | break 24 | 25 | 26 | def test_sre_performance_patch(): 27 | #pylint: disable=protected-access 28 | try: 29 | import sre_parse 30 | uniq = sre_parse._uniq 31 | 32 | with sre_performance_patch(): 33 | assert sre_parse._uniq(['5', '2', '3', '2', '5', 34 | '1']) == ['5', '2', '3', '1'] 35 | 36 | # make sure the original sre_parse._uniq was restored 37 | assert sre_parse._uniq == uniq 38 | except (ImportError, AttributeError): 39 | pass 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 
3-Clause License 2 | 3 | Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. 4 | Copyright (c) 2019 - 2020 yannvgn 5 | 6 | Redistribution and use in source and binary forms, with or without modification, 7 | are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name Facebook nor the names of its contributors may be used to 17 | endorse or promote products derived from this software without specific 18 | prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 21 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 22 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 24 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 25 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 26 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 27 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | os: [ubuntu-latest, macos-latest, windows-latest] 20 | python-version: ["3.6", "3.7", "3.8", "3.9"] 21 | exclude: 22 | - os: macos-latest 23 | python-version: '3.6' 24 | - os: macos-latest 25 | python-version: '3.7' 26 | - os: macos-latest 27 | python-version: '3.8' 28 | - os: windows-latest 29 | python-version: '3.6' 30 | - os: windows-latest 31 | python-version: '3.7' 32 | - os: windows-latest 33 | python-version: '3.8' 34 | 35 | steps: 36 | - uses: actions/checkout@v2 37 | - name: Set up Python ${{ matrix.python-version }} 38 | uses: actions/setup-python@v2 39 | with: 40 | python-version: ${{ matrix.python-version }} 41 | - name: Install Poetry 42 | uses: snok/install-poetry@v1 43 | with: 44 | virtualenvs-create: true 45 | virtualenvs-in-project: false 46 | installer-parallel: true 47 | - name: Install laserembeddings 48 | run: | 49 | poetry install -E zh -E ja 50 | poetry run python -m laserembeddings download-models 51 | - name: Lint 52 | run: | 53 | poetry run pylint laserembeddings 54 | - name: Test with pytest 55 | run: | 56 | poetry run pytest 57 | -------------------------------------------------------------------------------- /laserembeddings/embedding.py: 
-------------------------------------------------------------------------------- 1 | from typing import Optional, List, Union 2 | from io import BufferedIOBase 3 | 4 | import numpy as np 5 | 6 | from .encoder import SentenceEncoder 7 | 8 | __all__ = ['BPESentenceEmbedding'] 9 | 10 | 11 | class BPESentenceEmbedding: 12 | """ 13 | LASER embeddings computation from BPE-encoded sentences. 14 | 15 | Args: 16 | encoder (str or BufferedIOBase): the path to LASER's encoder PyTorch model, 17 | or a binary-mode file object. 18 | max_sentences (int, optional): see ``.encoder.SentenceEncoder``. 19 | max_tokens (int, optional): see ``.encoder.SentenceEncoder``. 20 | stable (bool, optional): if True, the mergesort sorting algorithm will be used, 21 | otherwise quicksort will be used. Defaults to False. See ``.encoder.SentenceEncoder``. 22 | cpu (bool, optional): if True, forces the use of the CPU even if a GPU is available. Defaults to False. 23 | """ 24 | 25 | def __init__(self, 26 | encoder: Union[str, BufferedIOBase], 27 | max_sentences: Optional[int] = None, 28 | max_tokens: Optional[int] = 12000, 29 | stable: bool = False, 30 | cpu: bool = False): 31 | 32 | self.encoder = SentenceEncoder( 33 | encoder, 34 | max_sentences=max_sentences, 35 | max_tokens=max_tokens, 36 | sort_kind='mergesort' if stable else 'quicksort', 37 | cpu=cpu) 38 | 39 | def embed_bpe_sentences(self, bpe_sentences: List[str]) -> np.ndarray: 40 | """ 41 | Computes the LASER embeddings of BPE-encoded sentences. 42 | 43 | Args: 44 | bpe_sentences (List[str]): The list of BPE-encoded sentences 45 | 46 | Returns: 47 | np.ndarray: An N * 1024 NumPy array containing the embeddings, N being the number of sentences provided. 48 | """ 49 | return self.encoder.encode_sentences(bpe_sentences) 50 | -------------------------------------------------------------------------------- /laserembeddings/utils.py: -------------------------------------------------------------------------------- 1 | from io import TextIOBase, StringIO 2 | import re 3 | 4 | __all__ = ['adapt_bpe_codes', 'sre_performance_patch'] 5 | 6 | 7 | def adapt_bpe_codes(bpe_codes_f: TextIOBase) -> TextIOBase: 8 | """ 9 | Converts fastBPE codes to subword_nmt BPE codes.
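fastBPE code files carry a trailing third column on each merge line (the merge count), which subword_nmt does not expect; the conversion strips that column and keeps only the two merge tokens.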
10 | 11 | Args: 12 | bpe_codes_f (TextIOBase): the text-mode file-like object of fastBPE codes 13 | Returns: 14 | TextIOBase: subword_nmt-compatible BPE codes as a text-mode file-like object 15 | """ 16 | return StringIO( 17 | re.sub(r'^([^ ]+) ([^ ]+) ([^ ]+)$', 18 | r'\1 \2', 19 | bpe_codes_f.read(), 20 | flags=re.MULTILINE)) 21 | 22 | 23 | class sre_performance_patch: 24 | """ 25 | Patch fixing https://bugs.python.org/issue37723 for Python 3.7 (<= 3.7.4) 26 | and Python 3.8 (<= 3.8.0 beta 3) 27 | """ 28 | 29 | def __init__(self): 30 | self.sre_parse = None 31 | self.original_sre_parse_uniq = None 32 | 33 | def __enter__(self): 34 | #pylint: disable=import-outside-toplevel 35 | import sys 36 | 37 | if self.original_sre_parse_uniq is None and ( 38 | 0x03070000 <= sys.hexversion <= 0x030704f0 39 | or 0x03080000 <= sys.hexversion <= 0x030800b3): 40 | try: 41 | import sre_parse 42 | self.sre_parse = sre_parse 43 | #pylint: disable=protected-access 44 | self.original_sre_parse_uniq = sre_parse._uniq 45 | sre_parse._uniq = lambda x: list(dict.fromkeys(x)) 46 | except (ImportError, AttributeError): 47 | self.sre_parse = None 48 | self.original_sre_parse_uniq = None 49 | 50 | def __exit__(self, type_, value, traceback): 51 | if self.sre_parse and self.original_sre_parse_uniq: 52 | #pylint: disable=protected-access 53 | self.sre_parse._uniq = self.original_sre_parse_uniq 54 | self.original_sre_parse_uniq = None 55 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 2 | # [1.1.2](https://github.com/yannvgn/laserembeddings/compare/v1.1.1...v1.1.2) (2021-12-12) 3 | 4 | - A compatibility issue with subword-nmt 0.3.8 was fixed (#39) 🐛 5 | - The behavior of `Laser.embed_sentences` was unclear/misleading when the number of language codes received in the `lang` argument did not match the number of sentences to encode. It now raises an error in that case (#40) 🐛 6 | 7 | 8 | # [1.1.1](https://github.com/yannvgn/laserembeddings/compare/v1.1.0...v1.1.1) (2021-02-06) 9 | 10 | - An issue with PyTorch 1.7.0 was fixed (#32) 🐛 11 | 12 | 13 | # [1.1.0](https://github.com/yannvgn/laserembeddings/compare/v1.0.1...v1.1.0) (2020-10-04) 14 | 15 | - Japanese extra on Windows is back! 
🇯🇵 16 | 17 | 18 | # [1.0.1](https://github.com/yannvgn/laserembeddings/compare/v1.0.0...v1.0.1) (2020-03-02) 19 | 20 | - The encoder was fixed to remove an innocuous warning message that would sometimes appear when using PyTorch 1.4 🐛 21 | - Japanese extra is now disabled on Windows (sorry) to prevent installation issues and computation failures in other languages 😕 22 | 23 | 24 | # [1.0.0](https://github.com/yannvgn/laserembeddings/compare/v0.1.3...v1.0.0) (2019-12-19) 25 | 26 | - Greek, Chinese and Japanese are now supported 🇬🇷 🇨🇳 🇯🇵 27 | - Some languages that were only partially supported are now fully supported (New Norwegian, Swedish, Tatar) 🌍 28 | - It should work on Windows now 🙄 29 | - Sentences in different languages can now be processed in the same batch ⚡️ 30 | 31 | 32 | # [0.1.3](https://github.com/yannvgn/laserembeddings/compare/v0.1.2...v0.1.3) (2019-10-03) 33 | 34 | - A lot of languages that were only partially supported are now fully supported (br, bs, ceb, fr, gl, oc, ug, vi) 🌍 35 | 36 | 37 | # [0.1.2](https://github.com/yannvgn/laserembeddings/compare/v0.1.1...v0.1.2) (2019-08-24) 38 | 39 | - Korean is now fully supported ✅ 40 | - A [bug](https://bugs.python.org/issue37723) in Python 3.7 (<= 3.7.4) and 3.8 (<= 3.8.0 beta 3) affecting the tokenizer performance was patched as a temporary solution until next Python releases 🐛 41 | 42 | 43 | # 0.1.1 (2019-07-23) 44 | 45 | - Initial version 🐣 46 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | jobs: 3 | include: 4 | - name: "Python 3.8 on Xenial Linux" 5 | python: 3.8 6 | before_install: 7 | - python -m pip install --upgrade pip 8 | - pip3 install poetry==1.1.* 9 | - pip3 install torch==1.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html 10 | - name: "Python 3.6 on Xenial Linux" 11 | python: 3.6 12 | before_install: 13 | - python -m pip install --upgrade pip 14 | - pip3 install poetry==1.1.* 15 | - pip3 install torch==1.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html 16 | - name: "Python 3.7 on macOS" 17 | os: osx 18 | osx_image: xcode11.2 19 | language: shell 20 | before_install: 21 | - python3 -m pip install --upgrade pip 22 | - pip3 install poetry==1.1.* 23 | - pip3 install virtualenv 24 | - virtualenv .env 25 | - source .env/bin/activate 26 | - pip3 install torch 27 | - name: "Python 3.7 on Windows" 28 | os: windows 29 | language: shell 30 | before_install: 31 | - choco install python --version 3.7.0 32 | - python -m pip install --upgrade pip 33 | - pip3 install poetry==1.1.* 34 | - poetry config virtualenvs.create false 35 | - pip3 install torch==1.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html 36 | env: PATH=/c/Python37:/c/Python37/Scripts:$PATH 37 | - name: "Python 3.8 on Xenial Linux (wheel installation)" 38 | python: 3.8 39 | before_install: 40 | - python -m pip install --upgrade pip 41 | - pip3 install poetry==1.1.* 42 | - pip3 install torch==1.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html 43 | install: 44 | - poetry build 45 | - pip3 install dist/laserembeddings-*.whl 46 | - python -m laserembeddings download-models 47 | script: 48 | - python -c 'from laserembeddings import Laser; laser = Laser(); laser.embed_sentences(["test"], lang="en")' 49 | 50 | install: 51 | - poetry remove torch # fix: latest torch wheel (1.1.0.post2) not available for linux 52 | - poetry install -E zh -E ja 53 | - python3 -m 
laserembeddings download-models || python -m laserembeddings download-models 54 | 55 | script: 56 | - poetry run pylint laserembeddings 57 | - poetry run pytest 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | laserembeddings/data/** 2 | !laserembeddings/data/.gitkeep 3 | tests/data/** 4 | !tests/data/.gitkeep 5 | 6 | poetry.lock 7 | poetry.toml 8 | 9 | .DS_Store 10 | .vscode 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | pip-wheel-metadata/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | db.sqlite3-journal 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 
102 | #Pipfile.lock 103 | 104 | # celery beat schedule file 105 | celerybeat-schedule 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | 137 | # PyCharm files 138 | .idea/* 139 | -------------------------------------------------------------------------------- /tests/test_laser.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | import numpy as np 5 | 6 | from laserembeddings import Laser 7 | 8 | SIMILARITY_TEST = os.getenv('SIMILARITY_TEST') 9 | SKIP_ZH = os.getenv('SKIP_ZH') 10 | SKIP_JA = os.getenv('SKIP_JA') 11 | 12 | 13 | def test_laser(): 14 | with open(Laser.DEFAULT_ENCODER_FILE, 'rb') as f_encoder: 15 | laser = Laser( 16 | Laser.DEFAULT_BPE_CODES_FILE, 17 | None, 18 | f_encoder, 19 | ) 20 | assert laser.embed_sentences( 21 | ['hello world!', 'i hope the tests are passing'], 22 | lang='en').shape == (2, 1024) 23 | assert laser.embed_sentences(['hello world!', "j'aime les pâtes"], 24 | lang=['en', 'fr']).shape == (2, 1024) 25 | assert laser.embed_sentences('hello world!', 26 | lang='en').shape == (1, 1024) 27 | 28 | with pytest.raises(ValueError): 29 | laser.embed_sentences(['hello world!', "j'aime les pâtes"], 30 | lang=['en']) 31 | 32 | 33 | def test_zh(): 34 | if SKIP_ZH: 35 | pytest.skip("SKIP_ZH is set") 36 | laser = Laser() 37 | assert laser.embed_sentences(['干杯!'], lang='zh').shape == (1, 1024) 38 | 39 | 40 | def test_ja(): 41 | if SKIP_JA: 42 | pytest.skip("SKIP_JA is set") 43 | laser = Laser() 44 | assert laser.embed_sentences(['乾杯!'], lang='ja').shape == (1, 1024) 45 | 46 | 47 | def test_similarity(test_data): 48 | if not SIMILARITY_TEST: 49 | pytest.skip("SIMILARITY_TEST not set") 50 | 51 | if not test_data: 52 | raise FileNotFoundError( 53 | 'laserembeddings-test-data.npz is missing, run "python -m laserembeddings download-test-data" to fix that' 54 | ) 55 | 56 | report = os.path.join(os.path.dirname(os.path.realpath(__file__)), 57 | 'report', 'comparison-with-LASER.md') 58 | 59 | laser = Laser() 60 | 61 | with open(report, 'w', encoding='utf-8') as f_report: 62 | 63 | f_report.write( 64 | '# Comparison of the embeddings computed with original LASER with the embeddings computed with this package\n' 65 | ) 66 | f_report.write( 67 | '| |language|avg. cosine similarity|min. 
cosine similarity|\n') 68 | f_report.write( 69 | '|-|--------|----------------------|----------------------|\n') 70 | 71 | for lang in test_data['langs']: 72 | 73 | sents = test_data[f'{lang}_sentences'] 74 | orig_embeddings = test_data[f'{lang}_embeddings'] 75 | embeddings = laser.embed_sentences(sents, lang) 76 | 77 | assert embeddings.shape == orig_embeddings.shape 78 | 79 | cosine_similarities = np.sum( 80 | orig_embeddings * embeddings, 81 | axis=1) / (np.linalg.norm(orig_embeddings, axis=1) * 82 | np.linalg.norm(embeddings, axis=1)) 83 | 84 | similarity_mean = np.mean(cosine_similarities) 85 | similarity_min = np.min(cosine_similarities) 86 | 87 | f_report.write( 88 | f'|{"✅" if similarity_min > 0.99999 else "⚠️" if similarity_mean > 0.99 else "❌"}|{lang}|{similarity_mean:.5f}|{similarity_min:.5f}|\n' 89 | ) 90 | -------------------------------------------------------------------------------- /tests/report/comparison-with-LASER.md: -------------------------------------------------------------------------------- 1 | # Comparison of the embeddings computed with original LASER with the embeddings computed with this package 2 | | |language|avg. cosine similarity|min. cosine similarity| 3 | |-|--------|----------------------|----------------------| 4 | |✅|af|1.00000|1.00000| 5 | |✅|am|1.00000|1.00000| 6 | |✅|ang|1.00000|1.00000| 7 | |✅|ar|1.00000|1.00000| 8 | |✅|arq|1.00000|1.00000| 9 | |✅|arz|1.00000|1.00000| 10 | |✅|ast|1.00000|1.00000| 11 | |✅|awa|1.00000|1.00000| 12 | |✅|az|1.00000|1.00000| 13 | |✅|be|1.00000|1.00000| 14 | |✅|ber|1.00000|1.00000| 15 | |✅|bg|1.00000|1.00000| 16 | |✅|bn|1.00000|1.00000| 17 | |✅|br|1.00000|1.00000| 18 | |✅|bs|1.00000|1.00000| 19 | |✅|ca|1.00000|1.00000| 20 | |✅|cbk|1.00000|1.00000| 21 | |✅|ceb|1.00000|1.00000| 22 | |✅|ch|1.00000|1.00000| 23 | |✅|cmn|1.00000|1.00000| 24 | |✅|cs|1.00000|1.00000| 25 | |✅|csb|1.00000|1.00000| 26 | |✅|cy|1.00000|1.00000| 27 | |✅|da|1.00000|1.00000| 28 | |✅|de|1.00000|1.00000| 29 | |✅|dsb|1.00000|1.00000| 30 | |✅|dtp|1.00000|1.00000| 31 | |✅|el|1.00000|1.00000| 32 | |✅|en|1.00000|1.00000| 33 | |✅|eo|1.00000|1.00000| 34 | |✅|es|1.00000|1.00000| 35 | |✅|et|1.00000|1.00000| 36 | |✅|eu|1.00000|1.00000| 37 | |✅|fi|1.00000|1.00000| 38 | |✅|fo|1.00000|1.00000| 39 | |✅|fr|1.00000|1.00000| 40 | |⚠️|fy|0.99993|0.99319| 41 | |⚠️|ga|0.99762|0.92033| 42 | |✅|gd|1.00000|1.00000| 43 | |✅|gl|1.00000|1.00000| 44 | |✅|gsw|1.00000|1.00000| 45 | |✅|he|1.00000|1.00000| 46 | |✅|hi|1.00000|1.00000| 47 | |✅|hr|1.00000|1.00000| 48 | |✅|hsb|1.00000|1.00000| 49 | |✅|hu|1.00000|1.00000| 50 | |✅|hy|1.00000|1.00000| 51 | |✅|ia|1.00000|1.00000| 52 | |✅|id|1.00000|1.00000| 53 | |✅|ie|1.00000|1.00000| 54 | |✅|io|1.00000|1.00000| 55 | |✅|is|1.00000|1.00000| 56 | |✅|it|1.00000|1.00000| 57 | |✅|ja|1.00000|1.00000| 58 | |⚠️|jv|0.99987|0.98719| 59 | |⚠️|ka|0.99739|0.73893| 60 | |✅|kab|1.00000|1.00000| 61 | |✅|kk|1.00000|1.00000| 62 | |❌|km|0.96787|0.76779| 63 | |✅|ko|1.00000|1.00000| 64 | |✅|ku|1.00000|1.00000| 65 | |✅|kw|1.00000|1.00000| 66 | |✅|kzj|1.00000|1.00000| 67 | |⚠️|la|0.99547|0.86945| 68 | |✅|lfn|1.00000|1.00000| 69 | |✅|lt|1.00000|1.00000| 70 | |✅|lvs|1.00000|1.00000| 71 | |✅|max|1.00000|1.00000| 72 | |✅|mhr|1.00000|1.00000| 73 | |✅|mk|1.00000|1.00000| 74 | |⚠️|ml|0.99297|0.87608| 75 | |✅|mn|1.00000|1.00000| 76 | |⚠️|mr|0.99952|0.95155| 77 | |✅|nb|1.00000|1.00000| 78 | |✅|nds|1.00000|1.00000| 79 | |✅|nl|1.00000|1.00000| 80 | |✅|nn|1.00000|1.00000| 81 | |✅|nov|1.00000|1.00000| 82 | |✅|oc|1.00000|1.00000| 83 | |✅|orv|1.00000|1.00000| 84 | 
|✅|pam|1.00000|1.00000| 85 | |⚠️|pes|0.99901|0.93162| 86 | |✅|pl|1.00000|1.00000| 87 | |✅|pms|1.00000|1.00000| 88 | |✅|pt|1.00000|1.00000| 89 | |✅|ro|1.00000|1.00000| 90 | |✅|ru|1.00000|1.00000| 91 | |✅|sk|1.00000|1.00000| 92 | |✅|sl|1.00000|1.00000| 93 | |✅|sq|1.00000|1.00000| 94 | |✅|sr|1.00000|1.00000| 95 | |✅|sv|1.00000|1.00000| 96 | |✅|swg|1.00000|1.00000| 97 | |✅|swh|1.00000|1.00000| 98 | |✅|ta|1.00000|1.00000| 99 | |⚠️|te|0.99838|0.88623| 100 | |✅|th|1.00000|1.00000| 101 | |✅|tk|1.00000|1.00000| 102 | |✅|tl|1.00000|1.00000| 103 | |✅|tr|1.00000|1.00000| 104 | |✅|tt|1.00000|1.00000| 105 | |✅|tzl|1.00000|1.00000| 106 | |✅|ug|1.00000|1.00000| 107 | |✅|uk|1.00000|1.00000| 108 | |✅|ur|1.00000|1.00000| 109 | |✅|uz|1.00000|1.00000| 110 | |✅|vi|1.00000|1.00000| 111 | |✅|war|1.00000|1.00000| 112 | |✅|wuu|1.00000|1.00000| 113 | |✅|xh|1.00000|1.00000| 114 | |⚠️|yi|0.99958|0.96916| 115 | |✅|yue|1.00000|1.00000| 116 | |✅|zsm|1.00000|1.00000| 117 | -------------------------------------------------------------------------------- /laserembeddings/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import urllib.request 4 | import tarfile 5 | 6 | IS_WIN = os.name == 'nt' 7 | 8 | 9 | def non_win_string(s): 10 | return s if not IS_WIN else '' 11 | 12 | 13 | CONSOLE_CLEAR = non_win_string('\033[0;0m') 14 | CONSOLE_BOLD = non_win_string('\033[0;1m') 15 | CONSOLE_WAIT = non_win_string('⏳') 16 | CONSOLE_DONE = non_win_string('✅') 17 | CONSOLE_STARS = non_win_string('✨') 18 | CONSOLE_ERROR = non_win_string('❌') 19 | 20 | 21 | def print_usage(): 22 | print('Usage:') 23 | print('') 24 | print( 25 | f'{CONSOLE_BOLD}python -m laserembeddings download-models [OUTPUT_DIRECTORY]{CONSOLE_CLEAR}' 26 | ) 27 | print( 28 | ' Downloads LASER model files. 
If OUTPUT_DIRECTORY is omitted,' 29 | '\n' 30 | f' the models will be placed into the {CONSOLE_BOLD}data{CONSOLE_CLEAR} directory of the module' 31 | ) 32 | print('') 33 | print( 34 | f'{CONSOLE_BOLD}python -m laserembeddings download-test-data{CONSOLE_CLEAR}' 35 | ) 36 | print(' downloads data needed to run the tests') 37 | print('') 38 | 39 | 40 | def download_file(url, dest): 41 | print(f'{CONSOLE_WAIT} Downloading {url}...', end='') 42 | sys.stdout.flush() 43 | urllib.request.urlretrieve(url, dest) 44 | print(f'\r{CONSOLE_DONE} Downloaded {url} ') 45 | 46 | 47 | def extract_tar(tar, output_dir): 48 | print(f'{CONSOLE_WAIT} Extracting archive...', end='') 49 | sys.stdout.flush() 50 | with tarfile.open(tar) as t: 51 | t.extractall(output_dir) 52 | print(f'\r{CONSOLE_DONE} Extracted archive ') 53 | 54 | 55 | def download_models(output_dir): 56 | print(f'Downloading models into {output_dir}') 57 | print('') 58 | 59 | download_file('https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes', 60 | os.path.join(output_dir, '93langs.fcodes')) 61 | download_file('https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab', 62 | os.path.join(output_dir, '93langs.fvocab')) 63 | download_file( 64 | 'https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt', 65 | os.path.join(output_dir, 'bilstm.93langs.2018-12-26.pt')) 66 | 67 | print('') 68 | print(f'{CONSOLE_STARS} You\'re all set!') 69 | 70 | 71 | def download_and_extract_test_data(output_dir): 72 | print(f'Downloading test data into {output_dir}') 73 | print('') 74 | 75 | download_file( 76 | 'https://github.com/yannvgn/laserembeddings-test-data/releases/download/v1.0.2/laserembeddings-test-data.tar.gz', 77 | os.path.join(output_dir, 'laserembeddings-test-data.tar.gz')) 78 | 79 | extract_tar(os.path.join(output_dir, 'laserembeddings-test-data.tar.gz'), 80 | output_dir) 81 | 82 | print('') 83 | print(f'{CONSOLE_STARS} Ready to test all that!') 84 | 85 | 86 | def main(): 87 | if len(sys.argv) == 1: 88 | print_usage() 89 | return 90 | 91 | if any(arg == '--help' for arg in sys.argv): 92 | print_usage() 93 | return 94 | 95 | if sys.argv[1] == 'download-models': 96 | output_dir = sys.argv[2] if len(sys.argv) > 2 else os.path.join( 97 | os.path.dirname(os.path.realpath(__file__)), 'data') 98 | 99 | download_models(output_dir) 100 | 101 | elif sys.argv[1] == 'download-test-data': 102 | if len(sys.argv) > 2: 103 | print_usage() 104 | return 105 | 106 | repository_root = os.path.dirname( 107 | os.path.dirname(os.path.realpath(__file__))) 108 | 109 | if not os.path.isfile(os.path.join(repository_root, 'pyproject.toml')): 110 | print( 111 | f"{CONSOLE_ERROR} Looks like you're not running laserembeddings from its source code" 112 | ) 113 | print( 114 | " → please checkout https://github.com/yannvgn/laserembeddings.git" 115 | ) 116 | print( 117 | ' then run "python -m laserembeddings download-test-data" from the root of the repository' 118 | ) 119 | return 120 | 121 | download_and_extract_test_data( 122 | os.path.join(repository_root, 'tests', 'data')) 123 | 124 | 125 | if __name__ == "__main__": 126 | main() 127 | -------------------------------------------------------------------------------- /laserembeddings/preprocessing.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Optional 2 | from io import TextIOBase 3 | 4 | from sacremoses import MosesPunctNormalizer, MosesTokenizer 5 | from sacremoses.util import xml_unescape 6 | from subword_nmt.apply_bpe import BPE as 
subword_nmt_bpe, read_vocabulary 7 | from transliterate import translit 8 | 9 | from .utils import adapt_bpe_codes 10 | 11 | # Extras 12 | try: 13 | import jieba 14 | jieba.setLogLevel(60) 15 | except ImportError: 16 | jieba = None 17 | 18 | try: 19 | import MeCab 20 | import ipadic 21 | except ImportError: 22 | MeCab = None 23 | 24 | __all__ = ['Tokenizer', 'BPE'] 25 | 26 | ############################################################################### 27 | # 28 | # Tokenizer 29 | # 30 | ############################################################################### 31 | 32 | 33 | class Tokenizer: 34 | """ 35 | Tokenizer. 36 | 37 | Args: 38 | lang (str): the language code (ISO 639-1) of the texts to tokenize 39 | lower_case (bool, optional): if True, the texts are lower-cased before being tokenized. 40 | Defaults to True. 41 | romanize (bool or None, optional): if True, the texts are romanized. 42 | Defaults to None (romanization enabled based on input language). 43 | descape (bool, optional): if True, the XML-escaped symbols get de-escaped. 44 | Defaults to False. 45 | """ 46 | 47 | def __init__(self, 48 | lang: str = 'en', 49 | lower_case: bool = True, 50 | romanize: Optional[bool] = None, 51 | descape: bool = False): 52 | assert lower_case, 'lower case is needed by all the models' 53 | 54 | if lang in ('cmn', 'wuu', 'yue'): 55 | lang = 'zh' 56 | if lang == 'jpn': 57 | lang = 'ja' 58 | 59 | if lang == 'zh' and jieba is None: 60 | raise ModuleNotFoundError( 61 | '''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"''' 62 | ) 63 | if lang == 'ja' and MeCab is None: 64 | raise ModuleNotFoundError( 65 | '''No module named 'MeCab'. Install laserembeddings with 'ja' extra to fix that: "pip install laserembeddings[ja]"''' 66 | ) 67 | 68 | self.lang = lang 69 | self.lower_case = lower_case 70 | self.romanize = romanize if romanize is not None else lang == 'el' 71 | self.descape = descape 72 | 73 | self.normalizer = MosesPunctNormalizer(lang=lang) 74 | self.tokenizer = MosesTokenizer(lang=lang) 75 | self.mecab_tokenizer = MeCab.Tagger( 76 | f"{ipadic.MECAB_ARGS} -Owakati -b 50000") if lang == 'ja' else None 77 | 78 | def tokenize(self, text: str) -> str: 79 | """Tokenizes a text and returns the tokens as a string""" 80 | 81 | # REM_NON_PRINT_CHAR 82 | # not implemented 83 | 84 | # NORM_PUNC 85 | text = self.normalizer.normalize(text) 86 | 87 | # DESCAPE 88 | if self.descape: 89 | text = xml_unescape(text) 90 | 91 | # MOSES_TOKENIZER 92 | # see: https://github.com/facebookresearch/LASER/issues/55#issuecomment-480881573 93 | text = self.tokenizer.tokenize(text, 94 | return_str=True, 95 | escape=False, 96 | aggressive_dash_splits=False) 97 | 98 | # jieba 99 | if self.lang == 'zh': 100 | text = ' '.join(jieba.cut(text.rstrip('\r\n'))) 101 | 102 | # MECAB 103 | if self.lang == 'ja': 104 | text = self.mecab_tokenizer.parse(text).rstrip('\r\n') 105 | 106 | # ROMAN_LC 107 | if self.romanize: 108 | text = translit(text, self.lang, reversed=True) 109 | 110 | if self.lower_case: 111 | text = text.lower() 112 | 113 | return text 114 | 115 | 116 | ############################################################################### 117 | # 118 | # Apply BPE 119 | # 120 | ############################################################################### 121 | 122 | 123 | class BPE: 124 | """ 125 | BPE encoder. 126 | 127 | Args: 128 | bpe_codes (str or TextIOBase): the path to LASER's BPE codes (``93langs.fcodes``), 129 | or a text-mode file object.
130 | bpe_vocab (str or TextIOBase): the path to LASER's BPE vocabulary (``93langs.fvocab``), 131 | or a text-mode file object. 132 | """ 133 | 134 | def __init__(self, bpe_codes: Union[str, TextIOBase], 135 | bpe_vocab: Union[str, TextIOBase]): 136 | 137 | f_bpe_codes = None 138 | f_bpe_vocab = None 139 | 140 | try: 141 | if isinstance(bpe_codes, str): 142 | f_bpe_codes = open(bpe_codes, 'r', encoding='utf-8') # pylint: disable=consider-using-with 143 | if isinstance(bpe_vocab, str): 144 | f_bpe_vocab = open(bpe_vocab, 'r', encoding='utf-8') # pylint: disable=consider-using-with 145 | 146 | self.bpe = subword_nmt_bpe(codes=adapt_bpe_codes(f_bpe_codes 147 | or bpe_codes), 148 | vocab=read_vocabulary(f_bpe_vocab 149 | or bpe_vocab, 150 | threshold=None)) 151 | self.bpe.version = (0, 2) 152 | 153 | finally: 154 | if f_bpe_codes: 155 | f_bpe_codes.close() 156 | if f_bpe_vocab: 157 | f_bpe_vocab.close() 158 | 159 | def encode_tokens(self, sentence_tokens: str) -> str: 160 | """Returns the BPE-encoded sentence from a tokenized sentence""" 161 | return self.bpe.process_line(sentence_tokens) 162 | -------------------------------------------------------------------------------- /laserembeddings/laser.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, Union, List, Optional 2 | from io import TextIOBase, BufferedIOBase 3 | import os 4 | 5 | import numpy as np 6 | 7 | from .preprocessing import Tokenizer, BPE 8 | from .embedding import BPESentenceEmbedding 9 | from .utils import sre_performance_patch 10 | 11 | __all__ = ['Laser'] 12 | 13 | 14 | class Laser: 15 | """ 16 | End-to-end LASER embedding. 17 | 18 | The pipeline is: ``Tokenizer.tokenize`` -> ``BPE.encode_tokens`` -> ``BPESentenceEmbedding.embed_bpe_sentences`` 19 | 20 | Args: 21 | bpe_codes (str or TextIOBase, optional): the path to LASER's BPE codes (``93langs.fcodes``), 22 | or a text-mode file object. If omitted, ``Laser.DEFAULT_BPE_CODES_FILE`` is used. 23 | bpe_vocab (str or TextIOBase, optional): the path to LASER's BPE vocabulary (``93langs.fvocab``), 24 | or a text-mode file object. If omitted, ``Laser.DEFAULT_BPE_VOCAB_FILE`` is used. 25 | encoder (str or BufferedIOBase, optional): the path to LASER's encoder PyTorch model (``bilstm.93langs.2018-12-26.pt``), 26 | or a binary-mode file object. If omitted, ``Laser.DEFAULT_ENCODER_FILE`` is used. 27 | tokenizer_options (Dict[str, Any], optional): additional arguments to pass to the tokenizer. 28 | See ``.preprocessing.Tokenizer``. 29 | embedding_options (Dict[str, Any], optional): additional arguments to pass to the embedding layer. 30 | See ``.embedding.BPESentenceEmbedding``. 31 | 32 | Class attributes: 33 | DATA_DIR (str): the path to the directory of default LASER files. 34 | DEFAULT_BPE_CODES_FILE: the path to default BPE codes file. 35 | DEFAULT_BPE_VOCAB_FILE: the path to default BPE vocabulary file. 36 | DEFAULT_ENCODER_FILE: the path to default LASER encoder PyTorch model file.
37 | """ 38 | 39 | DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), 40 | 'data') 41 | DEFAULT_BPE_CODES_FILE = os.path.join(DATA_DIR, '93langs.fcodes') 42 | DEFAULT_BPE_VOCAB_FILE = os.path.join(DATA_DIR, '93langs.fvocab') 43 | DEFAULT_ENCODER_FILE = os.path.join(DATA_DIR, 44 | 'bilstm.93langs.2018-12-26.pt') 45 | 46 | def __init__(self, 47 | bpe_codes: Optional[Union[str, TextIOBase]] = None, 48 | bpe_vocab: Optional[Union[str, TextIOBase]] = None, 49 | encoder: Optional[Union[str, BufferedIOBase]] = None, 50 | tokenizer_options: Optional[Dict[str, Any]] = None, 51 | embedding_options: Optional[Dict[str, Any]] = None): 52 | 53 | if tokenizer_options is None: 54 | tokenizer_options = {} 55 | if embedding_options is None: 56 | embedding_options = {} 57 | 58 | if bpe_codes is None: 59 | if not os.path.isfile(self.DEFAULT_BPE_CODES_FILE): 60 | raise FileNotFoundError( 61 | '93langs.fcodes is missing, run "python -m laserembeddings download-models" to fix that' 62 | ) 63 | bpe_codes = self.DEFAULT_BPE_CODES_FILE 64 | if bpe_vocab is None: 65 | if not os.path.isfile(self.DEFAULT_BPE_VOCAB_FILE): 66 | raise FileNotFoundError( 67 | '93langs.fvocab is missing, run "python -m laserembeddings download-models" to fix that' 68 | ) 69 | bpe_vocab = self.DEFAULT_BPE_VOCAB_FILE 70 | if encoder is None: 71 | if not os.path.isfile(self.DEFAULT_ENCODER_FILE): 72 | raise FileNotFoundError( 73 | 'bilstm.93langs.2018-12-26.pt is missing, run "python -m laserembeddings download-models" to fix that' 74 | ) 75 | encoder = self.DEFAULT_ENCODER_FILE 76 | 77 | self.tokenizer_options = tokenizer_options 78 | self.tokenizers: Dict[str, Tokenizer] = {} 79 | 80 | self.bpe = BPE(bpe_codes, bpe_vocab) 81 | self.bpeSentenceEmbedding = BPESentenceEmbedding( 82 | encoder, **embedding_options) 83 | 84 | def _get_tokenizer(self, lang: str) -> Tokenizer: 85 | """Returns the Tokenizer instance for the specified language. The returned tokenizers are cached.""" 86 | 87 | if lang not in self.tokenizers: 88 | self.tokenizers[lang] = Tokenizer(lang, **self.tokenizer_options) 89 | 90 | return self.tokenizers[lang] 91 | 92 | def embed_sentences(self, sentences: Union[List[str], str], 93 | lang: Union[str, List[str]]) -> np.ndarray: 94 | """ 95 | Computes the LASER embeddings of provided sentences using the tokenizer for the specified language. 96 | 97 | Args: 98 | sentences (str or List[str]): the sentences to compute the embeddings from. 99 | lang (str or List[str]): the language code(s) (ISO 639-1) used to tokenize the sentences 100 | (either as a string - same code for every sentence - or as a list of strings - one code per sentence). 101 | 102 | Returns: 103 | np.ndarray: An N * 1024 NumPy array containing the embeddings, N being the number of sentences provided.
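Example (illustrative usage, assuming the default models have been downloaded):
    >>> laser = Laser()
    >>> laser.embed_sentences(['hello world!', "j'aime les pâtes"], lang=['en', 'fr']).shape
    (2, 1024)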
104 | """ 105 | sentences = [sentences] if isinstance(sentences, str) else sentences 106 | lang = [lang] * len(sentences) if isinstance(lang, str) else lang 107 | 108 | if len(sentences) != len(lang): 109 | raise ValueError( 110 | 'lang: invalid length: the number of language codes does not match the number of sentences' 111 | ) 112 | 113 | with sre_performance_patch(): # see https://bugs.python.org/issue37723 114 | sentence_tokens = [ 115 | self._get_tokenizer(sentence_lang).tokenize(sentence) 116 | for sentence, sentence_lang in zip(sentences, lang) 117 | ] 118 | bpe_encoded = [ 119 | self.bpe.encode_tokens(tokens) for tokens in sentence_tokens 120 | ] 121 | 122 | return self.bpeSentenceEmbedding.embed_bpe_sentences(bpe_encoded) 123 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LASER embeddings 2 | 3 | [![GitHub Workflow Status](https://img.shields.io/github/workflow/status/yannvgn/laserembeddings/Python%20package?style=flat-square)](https://github.com/yannvgn/laserembeddings/actions) 4 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/laserembeddings?style=flat-square) 5 | [![PyPI](https://img.shields.io/pypi/v/laserembeddings.svg?style=flat-square)](https://pypi.org/project/laserembeddings/) 6 | [![PyPI - License](https://img.shields.io/pypi/l/laserembeddings.svg?style=flat-square)](https://github.com/yannvgn/laserembeddings/blob/master/LICENSE) 7 | 8 | **Out-of-the-box multilingual sentence embeddings.** 9 | 10 | ![LASER embeddings map similar sentences in any language to similar language-agnostic embeddings](laserembeddings.gif) 11 | 12 | _laserembeddings_ is a pip-packaged, production-ready port of Facebook Research's [LASER](https://github.com/facebookresearch/LASER) (Language-Agnostic SEntence Representations) to compute multilingual sentence embeddings. 13 | 14 | ✨ **Version 1.1.2 is here! What's new?** 15 | - A compatibility issue with subword-nmt 0.3.8 was fixed (#39) 🐛 16 | - The behavior of `Laser.embed_sentences` was unclear/misleading when the number of language codes received in the `lang` argument did not match the number of sentences to encode. It now raises an error in that case 🐛 17 | 18 | ## Context 19 | 20 | [LASER](https://github.com/facebookresearch/LASER) is a collection of scripts and models created by Facebook Research to compute **multilingual sentence embeddings** for zero-shot cross-lingual transfer. 21 | 22 | What does it mean? LASER is able to transform sentences into **language-independent vectors**. Similar sentences get mapped to close vectors (in terms of cosine distance), regardless of the input language. 23 | 24 | That is great, especially if you don't have training sets for the language(s) you want to process: you can build a classifier on top of LASER embeddings, train it on whatever language(s) you have in your training data, and let it classify texts in any language. 25 | 26 | **The aim of the package is to make LASER as easy-to-use and easy-to-deploy as possible: zero-config, production-ready, etc., just a two-liner to install.** 27 | 28 | 👉 👉 👉 For detailed information, have a look at the amazing [LASER repository](https://github.com/facebookresearch/LASER), read its [presentation article](https://code.fb.com/ai-research/laser-multilingual-sentence-embeddings/) and its [research paper](https://arxiv.org/abs/1812.10464).
👈 👈 👈 29 | 30 | ## Getting started 31 | 32 | ### Prerequisites 33 | 34 | You'll need Python 3.6+ and PyTorch. Please refer to [PyTorch installation instructions](https://pytorch.org/get-started/locally/). 35 | 36 | ### Installation 37 | 38 | ``` 39 | pip install laserembeddings 40 | ``` 41 | 42 | #### Chinese language 43 | 44 | Chinese is not supported by default. If you need to embed Chinese sentences, please install laserembeddings with the "zh" extra. This extra includes [jieba](https://github.com/fxsjy/jieba). 45 | 46 | ``` 47 | pip install laserembeddings[zh] 48 | ``` 49 | 50 | #### Japanese language 51 | 52 | Japanese is not supported by default. If you need to embed Japanese sentences, please install laserembeddings with the "ja" extra. This extra includes [mecab-python3](https://github.com/SamuraiT/mecab-python3) and the [ipadic](https://github.com/polm/ipadic-py) dictionary, which is used in the original LASER project. 53 | 54 | If you have issues running laserembeddings on Japanese sentences, please refer to [mecab-python3 documentation](https://github.com/SamuraiT/mecab-python3) for troubleshooting. 55 | 56 | ``` 57 | pip install laserembeddings[ja] 58 | ``` 59 | 60 | 61 | ### Downloading the pre-trained models 62 | 63 | ``` 64 | python -m laserembeddings download-models 65 | ``` 66 | 67 | This will download the models to the default `data` directory next to the source code of the package. Use `python -m laserembeddings download-models path/to/model/directory` to download the models to a specific location. 68 | 69 | ### Usage 70 | 71 | ```python 72 | from laserembeddings import Laser 73 | 74 | laser = Laser() 75 | 76 | # if all sentences are in the same language: 77 | 78 | embeddings = laser.embed_sentences( 79 | ['let your neural network be polyglot', 80 | 'use multilingual embeddings!'], 81 | lang='en') # lang is only used for tokenization 82 | 83 | # embeddings is a N*1024 (N = number of sentences) NumPy array 84 | ``` 85 | 86 | If the sentences are not in the same language, you can pass a list of language codes: 87 | ```python 88 | embeddings = laser.embed_sentences( 89 | ['I love pasta.', 90 | "J'adore les pâtes.", 91 | 'Ich liebe Pasta.'], 92 | lang=['en', 'fr', 'de']) 93 | ``` 94 | 95 | If you downloaded the models into a specific directory: 96 | 97 | ```python 98 | from laserembeddings import Laser 99 | 100 | path_to_bpe_codes = ... 101 | path_to_bpe_vocab = ... 102 | path_to_encoder = ... 103 | 104 | laser = Laser(path_to_bpe_codes, path_to_bpe_vocab, path_to_encoder) 105 | 106 | # you can also supply file objects instead of file paths 107 | ``` 108 | 109 | If you want to pull the models from S3: 110 | 111 | ```python 112 | from io import BytesIO, StringIO 113 | from laserembeddings import Laser 114 | import boto3 115 | 116 | s3 = boto3.resource('s3') 117 | MODELS_BUCKET = ... 118 | 119 | f_bpe_codes = StringIO(s3.Object(MODELS_BUCKET, 'path_to_bpe_codes.fcodes').get()['Body'].read().decode('utf-8')) 120 | f_bpe_vocab = StringIO(s3.Object(MODELS_BUCKET, 'path_to_bpe_vocabulary.fvocab').get()['Body'].read().decode('utf-8')) 121 | f_encoder = BytesIO(s3.Object(MODELS_BUCKET, 'path_to_encoder.pt').get()['Body'].read()) 122 | 123 | laser = Laser(f_bpe_codes, f_bpe_vocab, f_encoder) 124 | ``` 125 | 126 | ## What are the differences with the original implementation? 127 | 128 | Some dependencies of the original project have been replaced with pure-python dependencies, to make this package easy to install and deploy. 
129 | 130 | Here's a summary of the differences: 131 | 132 | | Part of the pipeline | LASER dependency (original project) | laserembeddings dependency (this package) | Reason | 133 | |----------------------|-------------------------------------|----------------------------------------|--------| 134 | | Normalization / tokenization | [Moses](https://github.com/moses-smt/mosesdecoder) | [Sacremoses](https://github.com/alvations/sacremoses) 0.0.35, which seems to be the closest version to the Moses version used to train the model | Moses is implemented in Perl | 135 | | BPE encoding | [fastBPE](https://github.com/glample/fastBPE) | [subword-nmt](https://github.com/rsennrich/subword-nmt) | fastBPE cannot be installed via pip and requires compiling C++ code | 136 | | Japanese segmentation (optional) | [MeCab](https://github.com/taku910/mecab) / [JapaneseTokenizer](https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers) | [mecab-python3](https://github.com/SamuraiT/mecab-python3) and [ipadic](https://github.com/polm/ipadic-py) dictionary | mecab-python3 comes with wheels for major platforms (no compilation needed) | 137 | 138 | ## Will I get the exact same embeddings? 139 | 140 | **For most languages, in most of the cases, yes.** 141 | 142 | Some slight (and not so slight 🙄) differences exist for some languages due to differences in the implementation of the Tokenizer. 143 | 144 | **[An exhaustive comparison of the embeddings generated with LASER and laserembeddings](tests/report/comparison-with-LASER.md) is automatically generated and will be updated for each new release.** 145 | 146 | ## FAQ 147 | 148 | **How can I train the encoder?** 149 | 150 | You can't. LASER models are pre-trained and do not need to be fine-tuned. The embeddings are generic and perform well without fine-tuning. See https://github.com/facebookresearch/LASER/issues/3#issuecomment-404175463. 151 | 152 | ## Credits 153 | 154 | Thanks a lot to the creators of [LASER](https://github.com/facebookresearch/LASER) for open-sourcing the code of LASER and releasing the pre-trained models. All the kudos should go to them 👏. 155 | 156 | A big thanks to the creators of [Sacremoses](https://github.com/alvations/sacremoses) and [Subword Neural Machine Translation](https://github.com/rsennrich/subword-nmt/) for their great packages. 157 | 158 | ## Testing 159 | 160 | The first thing you'll need is [Poetry](https://github.com/sdispater/poetry). Please refer to the [installation guidelines](https://poetry.eustace.io/docs/#installation). 161 | 162 | Clone this repository and install the project: 163 | ``` 164 | poetry install -E zh -E ja 165 | ``` 166 | 167 | To run the tests: 168 | ``` 169 | poetry run pytest 170 | ``` 171 | 172 | ### Testing the similarity between the embeddings computed with LASER and laserembeddings 173 | 174 | First, install the project with the extra dependencies (Chinese and Japanese support): 175 | ``` 176 | poetry install -E zh -E ja 177 | ``` 178 | 179 | Then, download the test data: 180 | ``` 181 | poetry run python -m laserembeddings download-test-data 182 | ``` 183 | 184 | 👉 If you want to know more about the contents and the generation of the test data, check out the [laserembeddings-test-data](https://github.com/yannvgn/laserembeddings-test-data) repository. 185 | 186 | Then, run the test with `SIMILARITY_TEST` env. variable set to `1`. 187 | 188 | ``` 189 | SIMILARITY_TEST=1 poetry run pytest tests/test_laser.py 190 | ``` 191 | 192 | Now, have a coffee ☕️ and wait for the test to finish. 
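Each row of the report is a plain cosine similarity between the embeddings computed with the original LASER and those computed with this package. Here is a minimal sketch of that computation, mirroring what `tests/test_laser.py` does (the helper name is ours, for illustration):

```python
import numpy as np

def cosine_similarities(orig_embeddings: np.ndarray, embeddings: np.ndarray) -> np.ndarray:
    # row-wise cosine similarity between two N x 1024 embedding matrices
    return np.sum(orig_embeddings * embeddings, axis=1) / (
        np.linalg.norm(orig_embeddings, axis=1) *
        np.linalg.norm(embeddings, axis=1))
```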
193 | 194 | The similarity report will be generated here: [tests/report/comparison-with-LASER.md](tests/report/comparison-with-LASER.md). 195 | -------------------------------------------------------------------------------- /laserembeddings/encoder.py: -------------------------------------------------------------------------------- 1 | # The code contained in this file was copied/pasted from LASER's source code (source/embed.py) 2 | # and nearly kept untouched besides: 3 | # - code formatting 4 | # - buffered_arange: fix to avoid unnecessary warning on PyTorch >= 1.4.0 5 | 6 | # pylint: disable=redefined-builtin, consider-using-enumerate, arguments-differ, fixme, abstract-method, consider-using-from-import 7 | 8 | from collections import namedtuple 9 | 10 | import re 11 | import numpy as np 12 | 13 | import torch 14 | import torch.nn as nn 15 | 16 | __all__ = ['SentenceEncoder', 'Encoder'] 17 | 18 | SPACE_NORMALIZER = re.compile(r'\s+') 19 | Batch = namedtuple('Batch', 'srcs tokens lengths') 20 | 21 | 22 | def buffered_arange(max): 23 | if not hasattr(buffered_arange, 24 | 'buf') or max > buffered_arange.buf.numel(): 25 | buffered_arange.buf = torch.LongTensor() 26 | torch.arange(max, out=buffered_arange.buf) 27 | return buffered_arange.buf[:max] 28 | 29 | 30 | # TODO Do proper padding from the beginning 31 | def convert_padding_direction(src_tokens, 32 | padding_idx, 33 | right_to_left=False, 34 | left_to_right=False): 35 | assert right_to_left ^ left_to_right 36 | pad_mask = src_tokens.eq(padding_idx) 37 | if not pad_mask.any(): 38 | # no padding, return early 39 | return src_tokens 40 | if left_to_right and not pad_mask[:, 0].any(): 41 | # already right padded 42 | return src_tokens 43 | if right_to_left and not pad_mask[:, -1].any(): 44 | # already left padded 45 | return src_tokens 46 | max_len = src_tokens.size(1) 47 | range = buffered_arange(max_len).type_as(src_tokens).expand_as(src_tokens) 48 | num_pads = pad_mask.long().sum(dim=1, keepdim=True) 49 | if right_to_left: 50 | index = torch.remainder(range - num_pads, max_len) 51 | else: 52 | index = torch.remainder(range + num_pads, max_len) 53 | return src_tokens.gather(1, index) 54 | 55 | 56 | class SentenceEncoder: 57 | def __init__(self, 58 | model_path, 59 | max_sentences=None, 60 | max_tokens=None, 61 | cpu=False, 62 | fp16=False, 63 | sort_kind='quicksort'): 64 | self.use_cuda = torch.cuda.is_available() and not cpu 65 | self.max_sentences = max_sentences 66 | self.max_tokens = max_tokens 67 | if self.max_tokens is None and self.max_sentences is None: 68 | self.max_sentences = 1 69 | 70 | state_dict = torch.load(model_path) 71 | self.encoder = Encoder(**state_dict['params']) 72 | self.encoder.load_state_dict(state_dict['model']) 73 | self.dictionary = state_dict['dictionary'] 74 | self.pad_index = self.dictionary[''] 75 | self.eos_index = self.dictionary[''] 76 | self.unk_index = self.dictionary[''] 77 | if fp16: 78 | self.encoder.half() 79 | if self.use_cuda: 80 | self.encoder.cuda() 81 | self.sort_kind = sort_kind 82 | 83 | def _process_batch(self, batch): 84 | tokens = batch.tokens 85 | lengths = batch.lengths 86 | if self.use_cuda: 87 | tokens = tokens.cuda() 88 | lengths = lengths.cuda() 89 | self.encoder.eval() 90 | embeddings = self.encoder(tokens, lengths)['sentemb'] 91 | return embeddings.detach().cpu().numpy() 92 | 93 | def _tokenize(self, line): 94 | tokens = SPACE_NORMALIZER.sub(" ", line).strip().split() 95 | ntokens = len(tokens) 96 | ids = torch.LongTensor(ntokens + 1) 97 | for i, token in enumerate(tokens): 98 
| ids[i] = self.dictionary.get(token, self.unk_index) 99 | ids[ntokens] = self.eos_index 100 | return ids 101 | 102 | def _make_batches(self, lines): 103 | tokens = [self._tokenize(line) for line in lines] 104 | lengths = np.array([t.numel() for t in tokens]) 105 | indices = np.argsort(-lengths, kind=self.sort_kind) # pylint: disable=invalid-unary-operand-type 106 | 107 | def batch(tokens, lengths, indices): 108 | toks = tokens[0].new_full((len(tokens), tokens[0].shape[0]), 109 | self.pad_index) 110 | for i in range(len(tokens)): 111 | toks[i, -tokens[i].shape[0]:] = tokens[i] 112 | return Batch(srcs=None, 113 | tokens=toks, 114 | lengths=torch.LongTensor(lengths)), indices 115 | 116 | batch_tokens, batch_lengths, batch_indices = [], [], [] 117 | ntokens = nsentences = 0 118 | for i in indices: 119 | if nsentences > 0 and ((self.max_tokens is not None 120 | and ntokens + lengths[i] > self.max_tokens) 121 | or (self.max_sentences is not None 122 | and nsentences == self.max_sentences)): 123 | yield batch(batch_tokens, batch_lengths, batch_indices) 124 | ntokens = nsentences = 0 125 | batch_tokens, batch_lengths, batch_indices = [], [], [] 126 | batch_tokens.append(tokens[i]) 127 | batch_lengths.append(lengths[i]) 128 | batch_indices.append(i) 129 | ntokens += tokens[i].shape[0] 130 | nsentences += 1 131 | if nsentences > 0: 132 | yield batch(batch_tokens, batch_lengths, batch_indices) 133 | 134 | def encode_sentences(self, sentences): 135 | indices = [] 136 | results = [] 137 | for batch, batch_indices in self._make_batches(sentences): 138 | indices.extend(batch_indices) 139 | results.append(self._process_batch(batch)) 140 | return np.vstack(results)[np.argsort(indices, kind=self.sort_kind)] 141 | 142 | 143 | class Encoder(nn.Module): 144 | def __init__(self, 145 | num_embeddings, 146 | padding_idx, 147 | embed_dim=320, 148 | hidden_size=512, 149 | num_layers=1, 150 | bidirectional=False, 151 | left_pad=True, 152 | padding_value=0.): 153 | super().__init__() 154 | 155 | self.num_layers = num_layers 156 | self.bidirectional = bidirectional 157 | self.hidden_size = hidden_size 158 | 159 | self.padding_idx = padding_idx 160 | self.embed_tokens = nn.Embedding(num_embeddings, 161 | embed_dim, 162 | padding_idx=self.padding_idx) 163 | 164 | self.lstm = nn.LSTM( 165 | input_size=embed_dim, 166 | hidden_size=hidden_size, 167 | num_layers=num_layers, 168 | bidirectional=bidirectional, 169 | ) 170 | self.left_pad = left_pad 171 | self.padding_value = padding_value 172 | 173 | self.output_units = hidden_size 174 | if bidirectional: 175 | self.output_units *= 2 176 | 177 | def forward(self, src_tokens, src_lengths): 178 | if self.left_pad: 179 | # convert left-padding to right-padding 180 | src_tokens = convert_padding_direction( 181 | src_tokens, 182 | self.padding_idx, 183 | left_to_right=True, 184 | ) 185 | 186 | bsz, seqlen = src_tokens.size() 187 | 188 | # embed tokens 189 | x = self.embed_tokens(src_tokens) 190 | 191 | # B x T x C -> T x B x C 192 | x = x.transpose(0, 1) 193 | 194 | # pack embedded source tokens into a PackedSequence 195 | packed_x = nn.utils.rnn.pack_padded_sequence(x, src_lengths.data.cpu()) 196 | 197 | # apply LSTM 198 | if self.bidirectional: 199 | state_size = 2 * self.num_layers, bsz, self.hidden_size 200 | else: 201 | state_size = self.num_layers, bsz, self.hidden_size 202 | h0 = x.data.new(*state_size).zero_() 203 | c0 = x.data.new(*state_size).zero_() 204 | packed_outs, (final_hiddens, 205 | final_cells) = self.lstm(packed_x, (h0, c0)) 206 | 207 | # unpack outputs and 
apply dropout 208 | x, _ = nn.utils.rnn.pad_packed_sequence( 209 | packed_outs, padding_value=self.padding_value) 210 | assert list(x.size()) == [seqlen, bsz, self.output_units] 211 | 212 | if self.bidirectional: 213 | 214 | def combine_bidir(outs): 215 | return torch.cat([ 216 | torch.cat([outs[2 * i], outs[2 * i + 1]], dim=0).view( 217 | 1, bsz, self.output_units) 218 | for i in range(self.num_layers) 219 | ], 220 | dim=0) 221 | 222 | final_hiddens = combine_bidir(final_hiddens) 223 | final_cells = combine_bidir(final_cells) 224 | 225 | encoder_padding_mask = src_tokens.eq(self.padding_idx).t() 226 | 227 | # Set padded outputs to -inf so they are not selected by max-pooling 228 | padding_mask = src_tokens.eq(self.padding_idx).t().unsqueeze(-1) 229 | if padding_mask.any(): 230 | x = x.float().masked_fill_(padding_mask, float('-inf')).type_as(x) 231 | 232 | # Build the sentence embedding by max-pooling over the encoder outputs 233 | sentemb = x.max(dim=0)[0] 234 | 235 | return { 236 | 'sentemb': 237 | sentemb, 238 | 'encoder_out': (x, final_hiddens, final_cells), 239 | 'encoder_padding_mask': 240 | encoder_padding_mask if encoder_padding_mask.any() else None 241 | } 242 | --------------------------------------------------------------------------------