├── MANIFEST.in ├── .vscode └── settings.json ├── requirements.txt ├── .github ├── scripts │ └── run-tests.sh └── workflows │ └── tests.yml ├── test.py ├── create.py ├── setup.py ├── setup.cfg ├── tests └── test_load_nlp.py ├── spacy_sentence_bert ├── meta │ ├── xx_LaBSE.json │ ├── en_nli_bert_base.json │ ├── en_allenai_specter.json │ ├── en_nli_bert_large.json │ ├── en_nli_roberta_base.json │ ├── en_stsb_bert_base.json │ ├── en_stsb_bert_large.json │ ├── en_nli_roberta_large.json │ ├── en_stsb_roberta_base.json │ ├── en_stsb_roberta_large.json │ ├── en_nli_distilbert_base.json │ ├── en_nq_distilbert_base_v1.json │ ├── en_quora_distilbert_base.json │ ├── en_stsb_distilbert_base.json │ ├── en_msmarco_roberta_base_v2.json │ ├── xx_stsb_xlm_r_multilingual.json │ ├── en_msmarco_distilbert_base_v2.json │ ├── en_nli_bert_base_cls_pooling.json │ ├── en_nli_bert_base_max_pooling.json │ ├── en_nli_bert_large_cls_pooling.json │ ├── en_nli_bert_large_max_pooling.json │ ├── en_msmarco_distilroberta_base_v2.json │ ├── en_nli_distilbert_base_max_pooling.json │ ├── xx_quora_distilbert_multilingual.json │ ├── en_average_word_embeddings_komninos.json │ ├── en_paraphrase_distilroberta_base_v1.json │ ├── xx_paraphrase_xlm_r_multilingual_v1.json │ ├── xx_distiluse_base_multilingual_cased_v2.json │ ├── en_average_word_embeddings_glove_6B_300d.json │ ├── en_average_word_embeddings_glove_840B_300d.json │ ├── en_average_word_embeddings_levy_dependency.json │ └── xx_cross_en_de_roberta_sentence_transformer.json ├── __init__.py ├── language.py └── util.py ├── LICENSE ├── main.py ├── .gitignore └── README.md /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include spacy_sentence_bert/meta/*.json -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "venv/bin/python" 3 | } 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -e . 2 | 3 | twine 4 | typer 5 | pylint 6 | autopep8 7 | pytest -------------------------------------------------------------------------------- /.github/scripts/run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | echo "Running tests..." 5 | python -m pytest 6 | echo "Tests passed!" -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import typer 2 | import spacy 3 | from spacy_sentence_bert import util 4 | 5 | def main(model_name): 6 | nlp = spacy.load(model_name) 7 | 8 | assert nlp.meta['lang'] == model_name[:2] 9 | 10 | assert nlp.meta['name'] == model_name[3:] 11 | 12 | doc = nlp('hi') 13 | cfg = util.configs[model_name] 14 | 15 | assert doc.vector.shape[0] == cfg['dimensions'] 16 | 17 | 18 | if __name__ == "__main__": 19 | typer.run(main) -------------------------------------------------------------------------------- /create.py: -------------------------------------------------------------------------------- 1 | import typer 2 | from spacy_sentence_bert import language, util 3 | 4 | def main(model_name): 5 | if model_name not in util.configs: 6 | raise ValueError(f'Model "{model_name}" not available') 7 | nlp = util.create_lang(model_name) 8 | print(nlp.pipe_names) 9 | doc = nlp('Hello my friend') 10 | print(doc.vector.shape) 11 | nlp.to_disk(f'models/{model_name}') 12 | 13 | if __name__ == "__main__": 14 | typer.run(main) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('README.md', 
encoding='utf-8') as f: 4 | long_description = f.read() 5 | 6 | def setup_package(): 7 | setup(name="spacy_sentence_bert", 8 | packages=find_packages(), 9 | long_description=long_description, 10 | long_description_content_type='text/markdown', 11 | package_data={'spacy_sentence_bert': ['meta/*.json']}, 12 | include_package_data=True 13 | ) 14 | 15 | if __name__ == "__main__": 16 | setup_package() 17 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | version = 0.1.2 3 | description = SpaCy models for using sentence-BERT 4 | description-file = README.md 5 | url = https://github.com/MartinoMensio/spacy-sentence-bert 6 | author = Martino Mensio 7 | author_email = martino.mensio@open.ac.uk 8 | 9 | [options] 10 | include_package_data = true 11 | install_requires = 12 | sentence-transformers 13 | spacy>=3.0.0,<4.0.0 14 | protobuf # XLMRobertaConverter requires it 15 | [options.entry_points] 16 | spacy_factories = 17 | sentence_bert = spacy_sentence_bert:SentenceBert 18 | -------------------------------------------------------------------------------- /tests/test_load_nlp.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | 3 | def test_basic_load(): 4 | nlp = spacy.blank("en") 5 | p = nlp.add_pipe('sentence_bert', config={'model_name': 'stsb-roberta-base'}) 6 | assert p != None 7 | assert 'sentence_bert' in nlp.pipe_names 8 | 9 | def test_load_vector(): 10 | nlp = spacy.blank("en") 11 | p = nlp.add_pipe('sentence_bert', config={'model_name': 'stsb-roberta-base'}) 12 | assert p != None 13 | assert 'sentence_bert' in nlp.pipe_names 14 | doc = nlp("This is a test") 15 | vector = doc.vector 16 | assert vector is not None 17 | shape = vector.shape 18 | assert shape != None -------------------------------------------------------------------------------- /.github/workflows/tests.yml: 
-------------------------------------------------------------------------------- 1 | name: Tests 2 | on: 3 | push: 4 | branches: [ master ] 5 | pull_request: 6 | branches: [ master ] 7 | jobs: 8 | test: 9 | name: Run tests 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | python-version: [ "3.7", "3.8", "3.9", "3.10" ] 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | cache: 'pip' # caching pip dependencies 21 | - name: Install Python dependencies 22 | uses: py-actions/py-dependency-install@v4 23 | - name: Run pytest 24 | run: ./.github/scripts/run-tests.sh 25 | shell: bash -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/xx_LaBSE.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "xx", 3 | "name": "LaBSE", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_nli_bert_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": 
"en", 3 | "name": "nli_bert_base", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_allenai_specter.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "allenai_specter", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- 
/spacy_sentence_bert/meta/en_nli_bert_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "nli_bert_large", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 1024, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_nli_roberta_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "nli_roberta_base", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | 
"labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_stsb_bert_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "stsb_bert_base", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_stsb_bert_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "stsb_bert_large", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 1024, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | 
"sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_nli_roberta_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "nli_roberta_large", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 1024, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_stsb_roberta_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "stsb_roberta_base", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | 
"vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_stsb_roberta_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "stsb_roberta_large", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 1024, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_nli_distilbert_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "nli_distilbert_base", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": 
"sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_nq_distilbert_base_v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "nq_distilbert_base_v1", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_quora_distilbert_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "quora_distilbert_base", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": 
"https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_stsb_distilbert_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "stsb_distilbert_base", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_msmarco_roberta_base_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "msmarco_roberta_base_v2", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": 
"Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/xx_stsb_xlm_r_multilingual.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "xx", 3 | "name": "stsb_xlm_r_multilingual", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_msmarco_distilbert_base_v2.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "msmarco_distilbert_base_v2", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_nli_bert_base_cls_pooling.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "nli_bert_base_cls_pooling", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 
35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_nli_bert_base_max_pooling.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "nli_bert_base_max_pooling", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_nli_bert_large_cls_pooling.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "nli_bert_large_cls_pooling", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 1024, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | 
"pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_nli_bert_large_max_pooling.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "nli_bert_large_max_pooling", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 1024, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_msmarco_distilroberta_base_v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "msmarco_distilroberta_base_v2", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": 
"https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_nli_distilbert_base_max_pooling.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "nli_distilbert_base_max_pooling", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/xx_quora_distilbert_multilingual.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "xx", 3 | "name": "quora_distilbert_multilingual", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": 
"https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_average_word_embeddings_komninos.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "average_word_embeddings_komninos", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 300, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_paraphrase_distilroberta_base_v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "paraphrase_distilroberta_base_v1", 4 | "version": "0.1.2", 5 | 
"spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/xx_paraphrase_xlm_r_multilingual_v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "xx", 3 | "name": "paraphrase_xlm_r_multilingual_v1", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/xx_distiluse_base_multilingual_cased_v2.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "lang": "xx", 3 | "name": "distiluse_base_multilingual_cased_v2", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 512, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_average_word_embeddings_glove_6B_300d.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "average_word_embeddings_glove_6B_300d", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 300, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 
| }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_average_word_embeddings_glove_840B_300d.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "average_word_embeddings_glove_840B_300d", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 300, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/en_average_word_embeddings_levy_dependency.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "average_word_embeddings_levy_dependency", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | 
"width": 300, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/meta/xx_cross_en_de_roberta_sentence_transformer.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "xx", 3 | "name": "cross_en_de_roberta_sentence_transformer", 4 | "version": "0.1.2", 5 | "spacy_version": ">=3.0,<4.0", 6 | "description": "Wrapper of sentence-transformers models for spaCy", 7 | "author": "Martino Mensio", 8 | "email": "martino.mensio@open.ac.uk", 9 | "url": "https://github.com/MartinoMensio/spacy-sentence-bert", 10 | "license": "MIT", 11 | "requirements": [ 12 | "spacy-sentence-bert==0.1.2" 13 | ], 14 | "sources": [{ 15 | "name": "sentence-transformers", 16 | "url": "https://github.com/UKPLab/sentence-transformers", 17 | "license": "Apache-2.0" 18 | }], 19 | "vectors": { 20 | "width": 768, 21 | "vectors": 0, 22 | "keys": 0, 23 | "name": null 24 | }, 25 | "pipeline": [ 26 | "sentence_bert", 27 | "sentencizer" 28 | ], 29 | "factories": { 30 | "sentence_bert": "sentence_bert" 31 | }, 32 | "labels": { 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /spacy_sentence_bert/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals 3 | 4 | import warnings 5 | from spacy.tokens import Doc 6 | from spacy.util import load_model_from_init_py 7 | 8 | from sentence_transformers import SentenceTransformer 9 | 10 | from . 
def create_from(nlp, model_name):
    """Add the sentence-BERT vectors of `model_name` to an existing `nlp` object.

    The vectors are provided by adding the `sentence_bert` pipeline stage
    as the first stage of `nlp`.

    Args:
        nlp: an existing spaCy `Language` object to extend.
        model_name: a key of `util.configs` or a sentence-transformers
            model name.

    Returns:
        The `nlp` object that was passed in, with the new stage added.
    """
    # `create_nlp` is a module-level function of `language`, not an attribute
    # of the `SentenceBert` class: looking it up on the class raises
    # AttributeError.
    return language.create_nlp(model_name, nlp)
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer
from spacy.language import Language

# https://github.com/UKPLab/sentence-transformers
# Only one model is instantiated. 'bert-base-nli-mean-tokens' (768 dimensions)
# is a smaller alternative; loading it first and then rebinding `model`
# only wasted a download and memory.
model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')  # 1024


def vectorise(sent):
    """Return the embedding computed by `model` for the text of `sent`."""
    return model.encode([sent.text])[0]


# spaCy v3 only accepts the name of a registered component in `add_pipe`,
# so the function is registered under the name 'overwrite_vectors'.
@Language.component('overwrite_vectors')
def overwrite_vectors(doc):
    """Pipeline component installing `vectorise` as the vector hook.

    The hook is set at doc, span and token level, then the doc is returned.
    """
    doc.user_hooks['vector'] = vectorise
    doc.user_span_hooks['vector'] = vectorise
    doc.user_token_hooks['vector'] = vectorise
    return doc


nlp = spacy.blank('en')
nlp.add_pipe('overwrite_vectors')

sentences = ['This framework generates embeddings for each input sentence',
             'Sentences are passed as a list of string.',
             'The quick brown fox jumps over the lazy dog.',
             'Sentences are given as a list of strings']
docs = [nlp(s) for s in sentences]

print(docs[0].vector.shape)

# pairwise similarity matrix between all the documents
m = np.zeros((len(docs), len(docs)))
for i, d_i in enumerate(docs):
    for j, d_j in enumerate(docs):
        m[i, j] = d_i.similarity(d_j)

print(m)
.Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | cover/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | .pybuilder/ 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | # For a library or package, you might want to ignore these files since the code is 93 | # intended to run in multiple environments; otherwise, check them in: 94 | # .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. 
def get_vector(sent):
    """Compute the sentence-BERT vector for the text of `sent`.

    `sent` only needs to expose `.doc` and `.text`. The model to use is
    read from the `sentence_bert_model_name` extension of the parent doc
    and obtained through `SentenceBert.get_model`.
    """
    name = sent.doc._.sentence_bert_model_name
    encoder = SentenceBert.get_model(name)
    # the text is encoded as a one-element batch; keep its single result
    batch = encoder.encode([sent.text])
    return batch[0]
# the pipeline stage factory
@Language.factory('sentence_bert', default_config={
    'model_name': None,
    'debug': True
})
def sentence_bert_factory(nlp, name, model_name, debug):
    """Create the `sentence_bert` pipeline stage.

    Args:
        nlp: the `Language` object the stage is created for. Its `meta`
            ("lang" and "name") selects the model when `model_name` is
            not set.
        name: the name given to the stage instance (not used here).
        model_name: a key of `util.configs`, a sentence-transformers model
            name, or a falsy value to derive the model from `nlp.meta`.
        debug: forwarded to `SentenceBert`.

    Returns:
        A `SentenceBert` instance wrapping the selected model.

    Raises:
        ValueError: if the model cannot be resolved or loaded.
    """
    if model_name:
        # explicitly chosen
        if model_name not in util.configs:
            # may be a SentenceBert model name directly
            try:
                return SentenceBert(model_name, debug=debug)
            except Exception as err:
                # keep the original failure attached as the cause
                raise ValueError(f'Model "{model_name}" not available. Please choose one of {list(util.configs.keys())} or use one of the allowed values by SentenceBert.') from err
    else:
        # try to map from existing nlp
        # the language code needs to match
        meta_lang = nlp.meta['lang']
        # try to map from the model name
        meta_name = nlp.meta["name"]
        model_name = f'{meta_lang}_{meta_name}'
        if model_name not in util.configs:
            raise ValueError(f'Could not map nlp.meta["lang"]={meta_lang} and nlp.meta["name"]={meta_name} to an existing model.\n'
                f'Please set the parameter "model_name" to one of {list(util.configs.keys())} or to a SentenceBert model name.')
    # Here `model_name` is always a key of `util.configs`, whichever branch
    # produced it: translate it to the sentence-transformers model name.
    config = util.configs[model_name]
    return SentenceBert(config['name'], debug=debug)
def create_nlp(model_name, nlp=None):
    '''Returns a pipeline with the `sentence_bert` stage added as first stage.

    When `nlp` is not provided, a blank pipeline with a `sentencizer` is
    created from the `spacy_base_model` of the `util.configs` entry of
    `model_name`; a ValueError is raised if there is no such entry.
    '''
    stage_settings = {'model_name': model_name}
    if nlp:
        pipeline = nlp
    elif model_name in util.configs:
        base_lang = util.configs[model_name]['spacy_base_model']
        pipeline = spacy.blank(base_lang)
        pipeline.add_pipe('sentencizer')
    else:
        raise ValueError(f'Model "{model_name}" not available')
    pipeline.add_pipe('sentence_bert', config=stage_settings, first=True)
    return pipeline
'en', 29 | 'dimensions': 1024, 30 | 'name': 'stsb-roberta-large' 31 | }, 32 | 'en_stsb_roberta_base': { # previously named en_roberta_base_nli_stsb_mean_tokens 33 | 'spacy_base_model': 'en', 34 | 'dimensions': 768, 35 | 'name': 'stsb-roberta-base' 36 | }, 37 | 'en_stsb_bert_large': { # previously named en_bert_large_nli_stsb_mean_tokens 38 | 'spacy_base_model': 'en', 39 | 'dimensions': 1024, 40 | 'name': 'stsb-bert-large' 41 | }, 42 | 'en_stsb_distilbert_base': { # previously named en_distilbert_base_nli_stsb_mean_tokens 43 | 'spacy_base_model': 'en', 44 | 'dimensions': 768, 45 | 'name': 'stsb-distilbert-base' 46 | }, 47 | 'en_stsb_bert_base': { # previously named en_bert_base_nli_stsb_mean_tokens 48 | 'spacy_base_model': 'en', 49 | 'dimensions': 768, 50 | 'name': 'stsb-bert-base' 51 | }, 52 | 'en_nli_bert_large': { # previously named en_bert_large_nli_mean_tokens 53 | 'spacy_base_model': 'en', 54 | 'dimensions': 1024, 55 | 'name': 'nli-bert-large' 56 | }, 57 | 'en_nli_distilbert_base': { # previously named en_distilbert_base_nli_mean_tokens 58 | 'spacy_base_model': 'en', 59 | 'dimensions': 768, 60 | 'name': 'nli-distilbert-base' 61 | }, 62 | 'en_nli_roberta_large': { # previously named en_roberta_large_nli_mean_tokens 63 | 'spacy_base_model': 'en', 64 | 'dimensions': 1024, 65 | 'name': 'nli-roberta-large' 66 | }, 67 | 'en_nli_bert_large_max_pooling': { # previously named en_bert_large_nli_max_tokens 68 | 'spacy_base_model': 'en', 69 | 'dimensions': 1024, 70 | 'name': 'nli-bert-large-max-pooling' 71 | }, 72 | 'en_nli_bert_large_cls_pooling': { # previously named en_bert_large_nli_cls_token 73 | 'spacy_base_model': 'en', 74 | 'dimensions': 1024, 75 | 'name': 'nli-bert-large-cls-pooling' 76 | }, 77 | 'en_nli_distilbert_base_max_pooling': { # new 78 | 'spacy_base_model': 'en', 79 | 'dimensions': 768, 80 | 'name': 'nli-distilbert-base-max-pooling' 81 | }, 82 | 'en_nli_roberta_base': { # previously named en_roberta_base_nli_mean_tokens 83 | 'spacy_base_model': 'en', 84 
| 'dimensions': 768, 85 | 'name': 'nli-roberta-base' 86 | }, 87 | 'en_nli_bert_base_max_pooling': { # previously named en_bert_base_nli_max_tokens 88 | 'spacy_base_model': 'en', 89 | 'dimensions': 768, 90 | 'name': 'nli-bert-base-max-pooling' 91 | }, 92 | 'en_nli_bert_base': { # previously named en_bert_base_nli_mean_tokens 93 | 'spacy_base_model': 'en', 94 | 'dimensions': 768, 95 | 'name': 'nli-bert-base' 96 | }, 97 | 'en_nli_bert_base_cls_pooling': { # previously named en_bert_base_nli_cls_token 98 | 'spacy_base_model': 'en', 99 | 'dimensions': 768, 100 | 'name': 'nli-bert-base-cls-pooling' 101 | }, 102 | # Average Word Embeddings Models 103 | 'en_average_word_embeddings_glove_6B_300d': { 104 | 'spacy_base_model': 'en', 105 | 'dimensions': 300, 106 | 'name': 'average_word_embeddings_glove.6B.300d' 107 | }, 108 | 'en_average_word_embeddings_komninos': { 109 | 'spacy_base_model': 'en', 110 | 'dimensions': 300, 111 | 'name': 'average_word_embeddings_komninos' 112 | }, 113 | 'en_average_word_embeddings_levy_dependency': { 114 | 'spacy_base_model': 'en', 115 | 'dimensions': 300, 116 | 'name': 'average_word_embeddings_levy_dependency' 117 | }, 118 | 'en_average_word_embeddings_glove_840B_300d': { 119 | 'spacy_base_model': 'en', 120 | 'dimensions': 300, 121 | 'name': 'average_word_embeddings_glove.840B.300d' 122 | }, 123 | # Duplicate Questions Detection 124 | 'en_quora_distilbert_base': { 125 | 'spacy_base_model': 'en', 126 | 'dimensions': 768, 127 | 'name': 'quora-distilbert-base' 128 | }, 129 | 'xx_quora_distilbert_multilingual': { 130 | 'spacy_base_model': 'xx', 131 | 'dimensions': 768, 132 | 'name': 'quora-distilbert-multilingual' 133 | }, 134 | # Question-Answer Retrieval - MSMARCO 135 | 'en_msmarco_distilroberta_base_v2': { 136 | 'spacy_base_model': 'en', 137 | 'dimensions': 768, 138 | 'name': 'msmarco-distilroberta-base-v2' 139 | }, 140 | 'en_msmarco_roberta_base_v2': { 141 | 'spacy_base_model': 'en', 142 | 'dimensions': 768, 143 | 'name': 
def create_lang(model_name):
    '''Creates a Language object from the `model_name`

    The pipeline is built by `language.create_nlp`, then its `meta` is
    replaced with the content of `meta/<model_name>.json`, located next to
    this module.
    '''
    # imported here and not at module level
    # NOTE(review): presumably to avoid a circular import, since `language`
    # imports this module - confirm
    from . import language
    nlp = language.create_nlp(model_name)
    if model_name in configs:
        selected_config = configs[model_name]
        nlp.vocab.reset_vectors(width=selected_config['dimensions']) # does not do anything!
    meta_path = Path(__file__).parent.absolute() / 'meta' / f'{model_name}.json'
    # read the JSON explicitly as UTF-8 instead of relying on the
    # platform-dependent default encoding of `open`
    with open(meta_path, encoding='utf-8') as f:
        nlp.meta = json.load(f)
    return nlp
11 | Keep in mind that `sentence-transformers` are configured with a maximum sequence length of 128. Therefore for longer texts it may be more suitable to work with other models (e.g. [Universal Sentence Encoder](https://github.com/MartinoMensio/spacy-universal-sentence-encoder-tfhub)). 12 | 13 | ## Install 14 | 15 | Compatibility: 16 | - python 3.7/3.8/3.9/3.10 17 | - spaCy>=3.0.0,<4.0.0, last tested on version 3.5 18 | - sentence-transformers: tested on version 2.2.2 19 | 20 | To install this package, you can run one of the following: 21 | 22 | - `pip install spacy-sentence-bert` 23 | - `pip install git+https://github.com/MartinoMensio/spacy-sentence-bert.git` 24 | 25 | You can install standalone spaCy packages from GitHub with pip. If you install standalone packages, you will be able to load a language model directly by using the `spacy.load` API, without need to add a pipeline stage. 26 | This table takes the models listed on the [Sentence Transformers documentation](https://www.sbert.net/docs/pretrained_models.html) and shows some statistics along with the instruction to install the standalone models. 27 | If you don't want to install the standalone models, you can still use them by adding a pipeline stage (see below). 
28 | 29 | 30 | | sentence-BERT name | spacy model name | dimensions | language | STS benchmark | standalone install | 31 | |----------------------------------------|--------------------|----------------------|------------|---------------|---------| 32 | | `paraphrase-distilroberta-base-v1` | `en_paraphrase_distilroberta_base_v1` | 768 | en | 81.81 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_paraphrase_distilroberta_base_v1-0.1.2.tar.gz#en_paraphrase_distilroberta_base_v1-0.1.2` | 33 | | `paraphrase-xlm-r-multilingual-v1` | `xx_paraphrase_xlm_r_multilingual_v1` | 768 | 50+ | 83.50 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/xx_paraphrase_xlm_r_multilingual_v1-0.1.2.tar.gz#xx_paraphrase_xlm_r_multilingual_v1-0.1.2` | 34 | | `stsb-roberta-large` | `en_stsb_roberta_large` | 1024 | en | 86.39 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_stsb_roberta_large-0.1.2.tar.gz#en_stsb_roberta_large-0.1.2` | 35 | | `stsb-roberta-base` | `en_stsb_roberta_base` | 768 | en | 85.44 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_stsb_roberta_base-0.1.2.tar.gz#en_stsb_roberta_base-0.1.2` | 36 | | `stsb-bert-large` | `en_stsb_bert_large` | 1024 | en | 85.29 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_stsb_bert_large-0.1.2.tar.gz#en_stsb_bert_large-0.1.2` | 37 | | `stsb-distilbert-base` | `en_stsb_distilbert_base` | 768 | en | 85.16 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_stsb_distilbert_base-0.1.2.tar.gz#en_stsb_distilbert_base-0.1.2` | 38 | | `stsb-bert-base` | `en_stsb_bert_base` | 768 | en | 85.14 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_stsb_bert_base-0.1.2.tar.gz#en_stsb_bert_base-0.1.2` | 39 | | `nli-bert-large` | 
`en_nli_bert_large` | 1024 | en | 79.19 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_nli_bert_large-0.1.2.tar.gz#en_nli_bert_large-0.1.2` | 40 | | `nli-distilbert-base` | `en_nli_distilbert_base` | 768 | en | 78.69 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_nli_distilbert_base-0.1.2.tar.gz#en_nli_distilbert_base-0.1.2` | 41 | | `nli-roberta-large` | `en_nli_roberta_large` | 1024 | en | 78.69 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_nli_roberta_large-0.1.2.tar.gz#en_nli_roberta_large-0.1.2` | 42 | | `nli-bert-large-max-pooling` | `en_nli_bert_large_max_pooling` | 1024 | en | 78.41 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_nli_bert_large_max_pooling-0.1.2.tar.gz#en_nli_bert_large_max_pooling-0.1.2` | 43 | | `nli-bert-large-cls-pooling` | `en_nli_bert_large_cls_pooling` | 1024 | en | 78.29 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_nli_bert_large_cls_pooling-0.1.2.tar.gz#en_nli_bert_large_cls_pooling-0.1.2` | 44 | | `nli-distilbert-base-max-pooling` | `en_nli_distilbert_base_max_pooling` | 768 | en | 77.61 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_nli_distilbert_base_max_pooling-0.1.2.tar.gz#en_nli_distilbert_base_max_pooling-0.1.2` | 45 | | `nli-roberta-base` | `en_nli_roberta_base` | 768 | en | 77.49 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_nli_roberta_base-0.1.2.tar.gz#en_nli_roberta_base-0.1.2` | 46 | | `nli-bert-base-max-pooling` | `en_nli_bert_base_max_pooling` | 768 | en | 77.21 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_nli_bert_base_max_pooling-0.1.2.tar.gz#en_nli_bert_base_max_pooling-0.1.2` | 47 | | `nli-bert-base` | `en_nli_bert_base` | 
768 | en | 77.12 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_nli_bert_base-0.1.2.tar.gz#en_nli_bert_base-0.1.2` | 48 | | `nli-bert-base-cls-pooling` | `en_nli_bert_base_cls_pooling` | 768 | en | 76.30 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_nli_bert_base_cls_pooling-0.1.2.tar.gz#en_nli_bert_base_cls_pooling-0.1.2` | 49 | | `average_word_embeddings_glove.6B.300d` | `en_average_word_embeddings_glove.6B.300d` | 768 | en | 61.77 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_average_word_embeddings_glove.6B.300d-0.1.2.tar.gz#en_average_word_embeddings_glove.6B.300d-0.1.2` | 50 | | `average_word_embeddings_komninos` | `en_average_word_embeddings_komninos` | 768 | en | 61.56 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_average_word_embeddings_komninos-0.1.2.tar.gz#en_average_word_embeddings_komninos-0.1.2` | 51 | | `average_word_embeddings_levy_dependency` | `en_average_word_embeddings_levy_dependency` | 768 | en | 59.22 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_average_word_embeddings_levy_dependency-0.1.2.tar.gz#en_average_word_embeddings_levy_dependency-0.1.2` | 52 | | `average_word_embeddings_glove.840B.300d` | `en_average_word_embeddings_glove.840B.300d` | 768 | en | 52.54 | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_average_word_embeddings_glove.840B.300d-0.1.2.tar.gz#en_average_word_embeddings_glove.840B.300d-0.1.2` | 53 | | `quora-distilbert-base` | `en_quora_distilbert_base` | 768 | en | N/A | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_quora_distilbert_base-0.1.2.tar.gz#en_quora_distilbert_base-0.1.2` | 54 | | `quora-distilbert-multilingual` | `xx_quora_distilbert_multilingual` | 768 | 50+ | N/A | `pip 
install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/xx_quora_distilbert_multilingual-0.1.2.tar.gz#xx_quora_distilbert_multilingual-0.1.2` | 55 | | `msmarco-distilroberta-base-v2` | `en_msmarco_distilroberta_base_v2` | 768 | en | N/A | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_msmarco_distilroberta_base_v2-0.1.2.tar.gz#en_msmarco_distilroberta_base_v2-0.1.2` | 56 | | `msmarco-roberta-base-v2` | `en_msmarco_roberta_base_v2` | 768 | en | N/A | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_msmarco_roberta_base_v2-0.1.2.tar.gz#en_msmarco_roberta_base_v2-0.1.2` | 57 | | `msmarco-distilbert-base-v2` | `en_msmarco_distilbert_base_v2` | 768 | en | N/A | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_msmarco_distilbert_base_v2-0.1.2.tar.gz#en_msmarco_distilbert_base_v2-0.1.2` | 58 | | `nq-distilbert-base-v1` | `en_nq_distilbert_base_v1` | 768 | en | N/A | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_nq_distilbert_base_v1-0.1.2.tar.gz#en_nq_distilbert_base_v1-0.1.2` | 59 | | `distiluse-base-multilingual-cased-v2` | `xx_distiluse_base_multilingual_cased_v2` | 512 | 50+ | N/A | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/xx_distiluse_base_multilingual_cased_v2-0.1.2.tar.gz#xx_distiluse_base_multilingual_cased_v2-0.1.2` | 60 | | `stsb-xlm-r-multilingual` | `xx_stsb_xlm_r_multilingual` | 768 | 50+ | N/A | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/xx_stsb_xlm_r_multilingual-0.1.2.tar.gz#xx_stsb_xlm_r_multilingual-0.1.2` | 61 | | `T-Systems-onsite/cross-en-de-roberta-sentence-transformer` | `xx_cross_en_de_roberta_sentence_transformer` | 768 | en,de | N/A | `pip install 
https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/xx_cross_en_de_roberta_sentence_transformer-0.1.2.tar.gz#xx_cross_en_de_roberta_sentence_transformer-0.1.2` | 62 | | `LaBSE` | `xx_LaBSE` | 768 | 109 | N/A | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/xx_LaBSE-0.1.2.tar.gz#xx_LaBSE-0.1.2` | 63 | | `allenai-specter` | `en_allenai_specter` | 768 | en | N/A | `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_allenai_specter-0.1.2.tar.gz#en_allenai_specter-0.1.2` | 64 | 65 | If your model is not in this list (e.g., `xlm-r-base-en-ko-nli-ststb`), you can still use it with this library, but not as a standalone language. You will need to add a properly configured pipeline stage (see the `nlp.add_pipe` API below). 66 | 67 | 68 | 69 | ## Usage 70 | 71 | There are different ways to load the models of `sentence-bert`. 72 | 73 | - `spacy.load` API: you need to have installed one of the models from the table above 74 | - `spacy_sentence_bert.load_model`: you can load one of the models from the table above without having installed the standalone packages 75 | - `nlp.add_pipe` API: you can load any of the `sentence-bert` models on top of your `nlp` object 76 | 77 | 78 | ### `spacy.load` API 79 | 80 | Once you have installed a standalone model from GitHub (e.g., from the table above, `pip install https://github.com/MartinoMensio/spacy-sentence-bert/releases/download/v0.1.2/en_stsb_roberta_large-0.1.2.tar.gz#en_stsb_roberta_large-0.1.2`), you can load the model directly with the spaCy API: 81 | 82 | ```python 83 | import spacy 84 | nlp = spacy.load('en_stsb_roberta_large') 85 | ``` 86 | 87 | ### `spacy_sentence_bert.load_model` API 88 | 89 | You can obtain the same result without having to install the standalone model, by using this method: 90 | 91 | ```python 92 | import spacy_sentence_bert 93 | nlp = spacy_sentence_bert.load_model('en_stsb_roberta_large') 94 | ``` 95 | 96 | ### 
`nlp.add_pipe` API 97 | 98 | If you want to use one of the sentence embeddings on top of an existing `Language` object, you can use the `nlp.add_pipe` method. 99 | This also works if you want to use a language model that is not listed in the table above. Just make sure that [sentence-transformers](https://github.com/UKPLab/sentence-transformers) supports it. 100 | 101 | ```python 102 | import spacy 103 | nlp = spacy.blank('en') 104 | nlp.add_pipe('sentence_bert', config={'model_name': 'allenai-specter'}) 105 | nlp.pipe_names 106 | ``` 107 | 108 | The models, when first used, download sentence-BERT to the folder defined by the `TORCH_HOME` environment variable (default `~/.cache/torch`). 109 | 110 | Once you have loaded the model, use it through the `vector` property and the `similarity` method of spaCy: 111 | 112 | ```python 113 | # get two documents 114 | doc_1 = nlp('Hi there, how are you?') 115 | doc_2 = nlp('Hello there, how are you doing today?') 116 | # get the vector of the Doc, Span or Token 117 | print(doc_1.vector.shape) 118 | print(doc_1[3].vector.shape) 119 | print(doc_1[2:4].vector.shape) 120 | # or use the similarity method that is based on the vectors, on Doc, Span or Token 121 | print(doc_1.similarity(doc_2[0:7])) 122 | ``` 123 | 124 | 125 | 126 | ## Utils 127 | 128 | To build and upload: 129 | ```bash 130 | VERSION=0.1.2 131 | # build the standalone models (17) 132 | ./build_models.sh 133 | # build the archive at dist/spacy_sentence_bert-${VERSION}.tar.gz 134 | python setup.py sdist 135 | # upload to pypi 136 | twine upload dist/spacy_sentence_bert-${VERSION}.tar.gz 137 | ``` --------------------------------------------------------------------------------