├── tests ├── __init__.py ├── common │ ├── __init__.py │ ├── test_util.py │ ├── test_model_utils.py │ └── test_contrastive_utils.py ├── test_predictor.py ├── fixtures │ ├── data │ │ ├── encoder_inputs.txt │ │ └── openwebtext │ │ │ ├── valid.txt │ │ │ └── train.txt │ ├── experiment_mlm_only.jsonnet │ ├── experiment.jsonnet │ ├── experiment_contrastive_only.jsonnet │ ├── experiment_scalar_mix.jsonnet │ ├── experiment_feedforward.jsonnet │ └── common.jsonnet ├── conftest.py ├── test_encoder.py ├── test_model.py └── test_dataset_reader.py ├── .allennlp_plugins ├── declutr ├── common │ ├── __init__.py │ ├── util.py │ ├── masked_lm_utils.py │ ├── model_utils.py │ └── contrastive_utils.py ├── modules │ ├── __init__.py │ ├── text_field_embedders │ │ ├── __init__.py │ │ └── mlm_text_field_embedder.py │ └── token_embedders │ │ ├── __init__.py │ │ └── pretrained_transformer_embedder_mlm.py ├── __init__.py ├── miners │ ├── __init__.py │ └── pytorch_metric_learning.py ├── losses │ ├── __init__.py │ └── pytorch_metric_learning.py ├── predictor.py ├── encoder.py ├── dataset_reader.py └── model.py ├── .coveragerc ├── pytest.ini ├── mypy.ini ├── pyproject.toml ├── .github ├── dependabot.yml └── workflows │ └── build.yml ├── .flake8 ├── training_config ├── transformer_mean.jsonnet ├── transformer_cls.jsonnet ├── declutr_base.jsonnet ├── declutr_small.jsonnet ├── declutr.jsonnet ├── contrastive_only.jsonnet └── mlm_only.jsonnet ├── scripts ├── save_pretrained_hf.py ├── preprocess_wikitext_103.py └── preprocess_openwebtext.py ├── setup.py ├── .gitignore ├── CONTRIBUTING.md ├── notebooks ├── evaluating.ipynb └── training.ipynb ├── LICENSE └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.allennlp_plugins: -------------------------------------------------------------------------------- 1 | declutr -------------------------------------------------------------------------------- /declutr/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /declutr/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = declutr 3 | omit = tests/* \ 4 | *\__init__.py 5 | -------------------------------------------------------------------------------- /declutr/__init__.py: -------------------------------------------------------------------------------- 1 | from declutr.model import DeCLUTR 2 | from declutr.encoder import Encoder 3 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::DeprecationWarning 4 | ignore::PendingDeprecationWarning 5 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = 
true 3 | no_site_packages = true 4 | 5 | [mypy-tests.*] 6 | strict_optional = false -------------------------------------------------------------------------------- /declutr/modules/text_field_embedders/__init__.py: -------------------------------------------------------------------------------- 1 | from declutr.modules.text_field_embedders.mlm_text_field_embedder import MLMTextFieldEmbedder 2 | -------------------------------------------------------------------------------- /declutr/miners/__init__.py: -------------------------------------------------------------------------------- 1 | from declutr.miners.pytorch_metric_learning import ( 2 | PairMarginMiner, 3 | PyTorchMetricLearningMiner, 4 | ) 5 | -------------------------------------------------------------------------------- /declutr/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from declutr.losses.pytorch_metric_learning import ( 2 | CrossBatchMemory, 3 | NTXentLoss, 4 | PyTorchMetricLearningLoss, 5 | ) 6 | -------------------------------------------------------------------------------- /declutr/modules/token_embedders/__init__.py: -------------------------------------------------------------------------------- 1 | from declutr.modules.token_embedders.pretrained_transformer_embedder_mlm import ( 2 | PretrainedTransformerEmbedderMLM, 3 | ) 4 | -------------------------------------------------------------------------------- /declutr/common/util.py: -------------------------------------------------------------------------------- 1 | def sanitize_text(text: str, lowercase: bool = False) -> str: 2 | """Cleans text by removing whitespace, newlines and tabs and (optionally) lowercasing.""" 3 | sanitized_text = " ".join(text.strip().split()) 4 | sanitized_text = sanitized_text.lower() if lowercase else sanitized_text 5 | return sanitized_text 6 | -------------------------------------------------------------------------------- /tests/test_predictor.py: -------------------------------------------------------------------------------- 1 | class TestDeCLUTRPredictor: 2 | def test_json_to_instance(self, predictor) -> None: 3 | json_dict = {"text": "They may take our lives, but they'll never take our freedom!"} 4 | output = predictor._json_to_instance(json_dict) 5 | assert "anchors" in output 6 | assert "positives" not in output 7 | -------------------------------------------------------------------------------- /tests/fixtures/data/encoder_inputs.txt: -------------------------------------------------------------------------------- 1 | "A man inspects the uniform of a figure in some East Asian country." 2 | "The man is sleeping" 3 | "A soccer game with multiple males playing." 4 | "Some men are playing a sport." 5 | "A black race car starts up in front of a crowd of people." 6 | "A man is driving down a lonely road." 7 | "A smiling costumed woman is holding an umbrella." 8 | "A happy woman in a fairy costume holds an umbrella." 
-------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 100 3 | 4 | include = '\.pyi?$' 5 | 6 | exclude = ''' 7 | ( 8 | __pycache__ 9 | | \btutorials\b 10 | | \bbuild\b 11 | | \.git 12 | | \.mypy_cache 13 | | \.pytest_cache 14 | | \.vscode 15 | | \.venv 16 | | \bdist\b 17 | | \bdoc\b 18 | ) 19 | ''' 20 | 21 | [build-system] 22 | requires = ["setuptools", "wheel"] 23 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Please see the documentation for all configuration options: 2 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 3 | 4 | version: 2 5 | updates: 6 | 7 | # Maintain dependencies for GitHub Actions 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | 13 | # Maintain dependencies for pip 14 | - package-ecosystem: "pip" 15 | directory: "/" 16 | schedule: 17 | interval: "daily" 18 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 115 3 | 4 | ignore = 5 | # these rules don't play well with black 6 | E203 # whitespace before : 7 | W503 # line break before binary operator 8 | 9 | per-file-ignores = 10 | # __init__.py files are allowed to have unused imports and lines-too-long 11 | */__init__.py:F401 12 | */**/**/__init__.py:F401,E501 13 | 14 | # tests don't have to respect 15 | # E731: do not assign a lambda expression, use a def 16 | tests/**:E731 17 | 18 | # scripts don't have to respect 19 | # E402: imports not at top of file (because we mess with sys.path) 20 | scripts/**:E402 21 | -------------------------------------------------------------------------------- /declutr/predictor.py: -------------------------------------------------------------------------------- 1 | from overrides import overrides 2 | 3 | from allennlp.common.util import JsonDict 4 | from allennlp.data import Instance 5 | from allennlp.predictors.predictor import Predictor 6 | 7 | 8 | @Predictor.register("declutr") 9 | class DeCLUTRPredictor(Predictor): 10 | """Predictor wrapper for `DeCLUTR` model. 11 | 12 | Registered as a `Predictor` with name "declutr". 
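
    A minimal usage sketch for loading a trained archive and embedding a single piece of text
    (the archive path below is illustrative; the input sentence is taken from the test fixtures):

        from allennlp.models.archival import load_archive
        from allennlp.predictors import Predictor

        archive = load_archive("path/to/model.tar.gz")
        predictor = Predictor.from_archive(archive, predictor_name="declutr")
        output = predictor.predict_json({"text": "A man is driving down a lonely road."})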
13 | """ 14 | 15 | @overrides 16 | def _json_to_instance(self, json_dict: JsonDict) -> Instance: 17 | text = json_dict["text"] 18 | # Context manager ensures that the sample_spans property of our DatasetReader is False 19 | with self._dataset_reader.no_sample(): 20 | return self._dataset_reader.text_to_instance(text=text) 21 | -------------------------------------------------------------------------------- /tests/common/test_util.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from declutr.common import util 4 | from hypothesis import given 5 | from hypothesis.strategies import booleans, text 6 | 7 | 8 | @given(text=text(), lowercase=booleans()) 9 | def test_sanitize_text(text: str, lowercase: bool) -> None: 10 | sanitized_text = util.sanitize_text(text, lowercase=lowercase) 11 | 12 | # There should be no cases of multiple spaces or tabs 13 | assert re.search(r"[ ]{2,}", sanitized_text) is None 14 | assert "\t" not in sanitized_text 15 | # The beginning and end of the string should be stripped of whitespace 16 | assert not sanitized_text.startswith(("\n", " ")) 17 | assert not sanitized_text.endswith(("\n", " ")) 18 | # Sometimes, hypothesis generates text that cannot be lowercased (like latin characters). 19 | # We don't particularly care about this, and it breaks this check. 20 | # Only run if the generated text can be lowercased. 21 | if lowercase and text.lower().islower(): 22 | assert all(not char.isupper() for char in sanitized_text) 23 | -------------------------------------------------------------------------------- /tests/fixtures/experiment_mlm_only.jsonnet: -------------------------------------------------------------------------------- 1 | local COMMON = import 'common.jsonnet'; 2 | local transformer_model = "distilroberta-base"; 3 | 4 | { 5 | "vocabulary": COMMON['vocabulary'], 6 | "dataset_reader": COMMON['dataset_reader'], 7 | "datasets_for_vocab_creation": ["train"], 8 | "train_data_path": COMMON['train_data_path'], 9 | "validation_data_path": COMMON['validation_data_path'], 10 | "model": { 11 | "type": "declutr.DeCLUTR", 12 | "text_field_embedder": { 13 | "type": "declutr.modules.text_field_embedders.mlm_text_field_embedder.MLMTextFieldEmbedder", 14 | "token_embedders": { 15 | "tokens": { 16 | "type": "declutr.modules.token_embedders.pretrained_transformer_embedder_mlm.PretrainedTransformerEmbedderMLM", 17 | "model_name": transformer_model, 18 | "masked_language_modeling": true 19 | }, 20 | }, 21 | }, 22 | "loss": null 23 | }, 24 | "data_loader": COMMON['data_loader'], 25 | "trainer": COMMON['trainer'] 26 | } -------------------------------------------------------------------------------- /tests/fixtures/experiment.jsonnet: -------------------------------------------------------------------------------- 1 | local COMMON = import 'common.jsonnet'; 2 | local transformer_model = "distilroberta-base"; 3 | 4 | { 5 | "vocabulary": COMMON['vocabulary'], 6 | "dataset_reader": COMMON['dataset_reader'], 7 | "datasets_for_vocab_creation": ["train"], 8 | "train_data_path": COMMON['train_data_path'], 9 | "validation_data_path": COMMON['validation_data_path'], 10 | "model": { 11 | "type": "declutr.DeCLUTR", 12 | "text_field_embedder": { 13 | "type": "declutr.modules.text_field_embedders.MLMTextFieldEmbedder", 14 | "token_embedders": { 15 | "tokens": { 16 | "type": "declutr.modules.token_embedders.PretrainedTransformerEmbedderMLM", 17 | "model_name": transformer_model, 18 | "masked_language_modeling": true 19 | }, 20 | }, 21 | }, 
22 | "loss": { 23 | "type": "declutr.losses.pytorch_metric_learning.NTXentLoss", 24 | "temperature": 0.05, 25 | }, 26 | }, 27 | "data_loader": COMMON['data_loader'], 28 | "trainer": COMMON['trainer'] 29 | } -------------------------------------------------------------------------------- /tests/fixtures/experiment_contrastive_only.jsonnet: -------------------------------------------------------------------------------- 1 | local COMMON = import 'common.jsonnet'; 2 | local transformer_model = "distilroberta-base"; 3 | 4 | { 5 | "vocabulary": COMMON['vocabulary'], 6 | "dataset_reader": COMMON['dataset_reader'], 7 | "datasets_for_vocab_creation": ["train"], 8 | "train_data_path": COMMON['train_data_path'], 9 | "validation_data_path": COMMON['validation_data_path'], 10 | "model": { 11 | "type": "declutr.DeCLUTR", 12 | "text_field_embedder": { 13 | "type": "declutr.modules.text_field_embedders.mlm_text_field_embedder.MLMTextFieldEmbedder", 14 | "token_embedders": { 15 | "tokens": { 16 | "type": "declutr.modules.token_embedders.pretrained_transformer_embedder_mlm.PretrainedTransformerEmbedderMLM", 17 | "model_name": transformer_model, 18 | "masked_language_modeling": false 19 | }, 20 | }, 21 | }, 22 | "loss": { 23 | "type": "declutr.losses.pytorch_metric_learning.NTXentLoss", 24 | "temperature": 0.05, 25 | }, 26 | }, 27 | "data_loader": COMMON['data_loader'], 28 | "trainer": COMMON['trainer'] 29 | } -------------------------------------------------------------------------------- /tests/fixtures/experiment_scalar_mix.jsonnet: -------------------------------------------------------------------------------- 1 | local COMMON = import 'common.jsonnet'; 2 | local transformer_model = "distilroberta-base"; 3 | 4 | { 5 | "vocabulary": COMMON['vocabulary'], 6 | "dataset_reader": COMMON['dataset_reader'], 7 | "datasets_for_vocab_creation": ["train"], 8 | "train_data_path": COMMON['train_data_path'], 9 | "validation_data_path": COMMON['validation_data_path'], 10 | "model": { 11 | "type": "declutr.DeCLUTR", 12 | "text_field_embedder": { 13 | "type": "declutr.modules.text_field_embedders.mlm_text_field_embedder.MLMTextFieldEmbedder", 14 | "token_embedders": { 15 | "tokens": { 16 | "type": "declutr.modules.token_embedders.pretrained_transformer_embedder_mlm.PretrainedTransformerEmbedderMLM", 17 | "model_name": transformer_model, 18 | "last_layer_only": false, 19 | "masked_language_modeling": true 20 | }, 21 | }, 22 | }, 23 | "loss": { 24 | "type": "declutr.losses.pytorch_metric_learning.NTXentLoss", 25 | "temperature": 0.05, 26 | }, 27 | }, 28 | "data_loader": COMMON['data_loader'], 29 | "trainer": COMMON['trainer'] 30 | } -------------------------------------------------------------------------------- /tests/fixtures/experiment_feedforward.jsonnet: -------------------------------------------------------------------------------- 1 | local COMMON = import 'common.jsonnet'; 2 | local transformer_model = "distilroberta-base"; 3 | 4 | { 5 | "vocabulary": COMMON['vocabulary'], 6 | "dataset_reader": COMMON['dataset_reader'], 7 | "datasets_for_vocab_creation": ["train"], 8 | "train_data_path": COMMON['train_data_path'], 9 | "validation_data_path": COMMON['validation_data_path'], 10 | "model": { 11 | "type": "declutr.DeCLUTR", 12 | "text_field_embedder": { 13 | "type": "declutr.modules.text_field_embedders.mlm_text_field_embedder.MLMTextFieldEmbedder", 14 | "token_embedders": { 15 | "tokens": { 16 | "type": "declutr.modules.token_embedders.pretrained_transformer_embedder_mlm.PretrainedTransformerEmbedderMLM", 17 | 
"model_name": transformer_model, 18 | "masked_language_modeling": true 19 | }, 20 | }, 21 | }, 22 | "feedforward": { 23 | "input_dim": 768, 24 | "num_layers": 1, 25 | "hidden_dims": 16, 26 | "activations": "relu", 27 | }, 28 | "loss": { 29 | "type": "declutr.losses.pytorch_metric_learning.NTXentLoss", 30 | "temperature": 0.05, 31 | }, 32 | }, 33 | "data_loader": COMMON['data_loader'], 34 | "trainer": COMMON['trainer'] 35 | } -------------------------------------------------------------------------------- /declutr/miners/pytorch_metric_learning.py: -------------------------------------------------------------------------------- 1 | from pytorch_metric_learning import miners 2 | 3 | from allennlp.common import Registrable 4 | 5 | 6 | class PyTorchMetricLearningMiner(Registrable): 7 | """This class just allows us to implement `Registrable` for PyTorch Metric Learning miner functions. 8 | Subclasses of this class should also subclass a miner function from PyTorch Metric Learning 9 | (see: https://kevinmusgrave.github.io/pytorch-metric-learning/miners/), and accept as arguments 10 | to the constructor the same arguments that the miner function does. See `MaximumLossMiner` below 11 | for an example. 12 | """ 13 | 14 | default_implementation = "pair_margin" 15 | 16 | 17 | @PyTorchMetricLearningMiner.register("pair_margin") 18 | class PairMarginMiner(PyTorchMetricLearningMiner, miners.PairMarginMiner): 19 | """Wraps the `PairMarginMiner` implementation from Pytorch Metric Learning: 20 | (https://kevinmusgrave.github.io/pytorch-metric-learning/miners/#pairmarginminer). 21 | 22 | Registered as a `PyTorchMetricLearningMiner` with name "pair_margin". 23 | """ 24 | 25 | def __init__( 26 | self, 27 | pos_margin: float, 28 | neg_margin: float, 29 | use_similarity: bool = True, 30 | squared_distances: bool = False, 31 | ) -> None: 32 | 33 | super().__init__( 34 | pos_margin=pos_margin, 35 | neg_margin=neg_margin, 36 | use_similarity=use_similarity, 37 | squared_distances=squared_distances, 38 | ) 39 | -------------------------------------------------------------------------------- /training_config/transformer_mean.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = std.extVar("TRANSFORMER_MODEL"); 4 | 5 | // This will be used to set the max/min # of tokens in the positive and negative examples. 6 | local max_length = 512; 7 | 8 | { 9 | "vocabulary": { 10 | "type": "empty" 11 | }, 12 | "dataset_reader": { 13 | "type": "declutr", 14 | "lazy": true, 15 | "tokenizer": { 16 | "type": "pretrained_transformer", 17 | "model_name": transformer_model, 18 | // Account for special tokens (e.g. CLS and SEP), otherwise a cryptic error is thrown. 
19 | "max_length": max_length - 2, 20 | }, 21 | "token_indexers": { 22 | "tokens": { 23 | "type": "pretrained_transformer", 24 | "model_name": transformer_model, 25 | }, 26 | }, 27 | }, 28 | "train_data_path": null, 29 | "model": { 30 | "type": "declutr", 31 | "text_field_embedder": { 32 | "type": "mlm", 33 | "token_embedders": { 34 | "tokens": { 35 | "type": "pretrained_transformer_mlm", 36 | "model_name": transformer_model, 37 | "masked_language_modeling": true 38 | }, 39 | }, 40 | }, 41 | }, 42 | "data_loader": { 43 | "batch_size": 16, 44 | "num_workers": 1, 45 | "drop_last": true, 46 | }, 47 | "trainer": { 48 | "type": "no_op" 49 | }, 50 | } -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import List 3 | 4 | import pytest 5 | from allennlp.common import util as common_util 6 | from allennlp.common.file_utils import cached_path 7 | from allennlp.models.archival import Archive, load_archive 8 | from allennlp.predictors import Predictor 9 | 10 | from declutr.encoder import PRETRAINED_MODELS, Encoder 11 | from declutr.predictor import DeCLUTRPredictor 12 | 13 | # Note: Most of these are scoped as "module" to prevent a warning from hypothesis 14 | # about fixtures being reset between function calls. 15 | 16 | 17 | @pytest.fixture(params=["declutr-small", "declutr-base"], scope="module") 18 | def archive(request) -> Archive: 19 | if request.param in PRETRAINED_MODELS: 20 | pretrained_model_name_or_path = PRETRAINED_MODELS[request.param] 21 | common_util.import_module_and_submodules("declutr") 22 | pretrained_model_name_or_path = cached_path(pretrained_model_name_or_path) 23 | return load_archive(pretrained_model_name_or_path) 24 | 25 | 26 | @pytest.fixture(scope="module") 27 | def predictor(archive) -> DeCLUTRPredictor: 28 | return Predictor.from_archive(archive, predictor_name="declutr") 29 | 30 | 31 | @pytest.fixture(params=["declutr-small", "declutr-base"], scope="module") 32 | def encoder(request) -> Encoder: 33 | return Encoder(request.param) 34 | 35 | 36 | @pytest.fixture(scope="module") 37 | def inputs_filepath() -> str: 38 | # Some random examples taken from https://nlp.stanford.edu/projects/snli/ 39 | return "tests/fixtures/data/encoder_inputs.txt" 40 | 41 | 42 | @pytest.fixture(scope="module") 43 | def inputs(inputs_filepath) -> List[str]: 44 | return Path(inputs_filepath).read_text().split("\n") 45 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: build 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | os: [ubuntu-latest, macos-latest] 19 | python-version: [3.6, 3.7, 3.8] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v4.1.0 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install 
--editable ".[dev]" 31 | - name: Format code with black 32 | run: | 33 | black . 34 | - name: Lint with flake8 35 | run: | 36 | # stop the build if there are Python syntax errors or undefined names 37 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 38 | # exit-zero treats all errors as warnings. 39 | flake8 . --count --exit-zero --max-complexity=10 --statistics 40 | - name: Type check with mypy 41 | run: | 42 | mypy . 43 | - name: Test with pytest 44 | run: | 45 | pytest tests --cov ./declutr --cov-report=xml --cov-config=./.coveragerc 46 | - name: Upload coverage to Codecov 47 | uses: codecov/codecov-action@v3 48 | with: 49 | file: ./coverage.xml 50 | # Ignore codecov failures as the codecov server is not 51 | # very reliable but we don't want to report a failure 52 | # in the github UI just because the coverage report failed to 53 | # be published. 54 | fail_ci_if_error: false 55 | -------------------------------------------------------------------------------- /scripts/save_pretrained_hf.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import typer 4 | from allennlp.common import util as common_util 5 | from allennlp.models.archival import load_archive 6 | from allennlp.predictors import Predictor 7 | 8 | # Emoji's used in typer.secho calls 9 | # See: https://github.com/carpedm20/emoji/blob/master/emoji/unicode_codes.py" 10 | SAVING = "\U0001F4BE" 11 | HUGGING_FACE = "\U0001F917" 12 | 13 | 14 | def main(archive_file: str, save_directory: Path) -> None: 15 | """Saves the model and tokenizer from an AllenNLP `archive_file` path pointing to a trained 16 | DeCLUTR model to a format that can be used with HuggingFace Transformers at `save_directory`.""" 17 | save_directory = Path(save_directory) 18 | save_directory.parents[0].mkdir(parents=True, exist_ok=True) 19 | 20 | common_util.import_module_and_submodules("declutr") 21 | # cuda_device -1 places the model onto the CPU before saving. This avoids issues with 22 | # distributed models. 23 | overrides = "{'trainer.cuda_device': -1}" 24 | archive = load_archive(archive_file, overrides=overrides) 25 | predictor = Predictor.from_archive(archive, predictor_name="declutr") 26 | 27 | token_embedder = predictor._model._text_field_embedder._token_embedders["tokens"] 28 | model = token_embedder.transformer_model 29 | tokenizer = token_embedder.tokenizer 30 | 31 | # Casting as a string to avoid this error: https://github.com/huggingface/transformers/pull/4650 32 | # Can be removed after PR is merged and Transformers is updated. 33 | model.save_pretrained(str(save_directory)) 34 | tokenizer.save_pretrained(str(save_directory)) 35 | 36 | typer.secho( 37 | ( 38 | f"{SAVING} {HUGGING_FACE} Transformers compatible model saved to: {save_directory}." 39 | " See https://huggingface.co/transformers/model_sharing.html for instructions on" 40 | f" hosting the model with {HUGGING_FACE} Transformers." 
41 | ), 42 | bold=True, 43 | ) 44 | 45 | 46 | if __name__ == "__main__": 47 | typer.run(main) 48 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="declutr", 8 | version="0.1.0rc1", 9 | author="John Giorgi", 10 | author_email="johnmgiorgi@gmail.com", 11 | description=("DeCLUTR: Deep Contrastive Learning for Unsupervised Textual Representations"), 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/JohnGiorgi/DeCLUTR", 15 | packages=setuptools.find_packages(), 16 | keywords=[ 17 | "universal sentence embeddings", 18 | "contrastive learning", 19 | "natural language processing", 20 | "allennlp", 21 | "pytorch", 22 | "transformers", 23 | "representation learning", 24 | "deep metric learning", 25 | "sentence embeddings", 26 | "sentence similarity", 27 | "semantic similarity", 28 | ], 29 | classifiers=[ 30 | "Development Status :: 1 - Planning", 31 | "Environment :: Console", 32 | "Intended Audience :: Science/Research", 33 | "License :: OSI Approved :: Apache Software License", 34 | "Operating System :: OS Independent", 35 | "Programming Language :: Python :: 3", 36 | "Programming Language :: Python :: 3.6", 37 | "Programming Language :: Python :: 3.7", 38 | "Programming Language :: Python :: 3.8", 39 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 40 | "Typing :: Typed", 41 | ], 42 | python_requires=">=3.6.1", 43 | install_requires=[ 44 | "allennlp>=1.1.0, <1.2.0", 45 | "pytorch-metric-learning>=0.9.98", 46 | "typer>=0.3.2", 47 | "validators>=0.18.2", 48 | ], 49 | extras_require={ 50 | "dev": [ 51 | "black", 52 | "coverage", 53 | "codecov", 54 | "flake8", 55 | "hypothesis", 56 | "pytest", 57 | "pytest-cov", 58 | "mypy", 59 | ] 60 | }, 61 | ) 62 | -------------------------------------------------------------------------------- /tests/fixtures/common.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = "distilroberta-base"; 4 | // This will be used to set the max/min # of tokens in the positive and negative examples. 
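// Note: these values are much smaller than those in training_config/ (512 and 32) so that the
// test fixtures run quickly.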
5 | local max_length = 16; 6 | local min_length = 8; 7 | 8 | { 9 | "vocabulary": { 10 | "type": "empty" 11 | }, 12 | "dataset_reader": { 13 | "type": "declutr.dataset_reader.DeCLUTRDatasetReader", 14 | "lazy": true, 15 | "num_anchors": 2, 16 | "num_positives": 2, 17 | "max_span_len": max_length, 18 | "min_span_len": min_length, 19 | "tokenizer": { 20 | "type": "pretrained_transformer", 21 | "model_name": transformer_model, 22 | "max_length": max_length, 23 | }, 24 | "token_indexers": { 25 | "tokens": { 26 | "type": "pretrained_transformer", 27 | "model_name": transformer_model, 28 | }, 29 | }, 30 | }, 31 | "train_data_path": "tests/fixtures/data/openwebtext/train.txt", 32 | "validation_data_path": "tests/fixtures/data/openwebtext/valid.txt", 33 | "model": { 34 | "type": "declutr.DeCLUTR", 35 | }, 36 | "data_loader": { 37 | "batch_size": 4, 38 | "num_workers": 1, 39 | "drop_last": true 40 | }, 41 | "trainer": { 42 | "optimizer": { 43 | "type": "huggingface_adamw", 44 | "lr": 5e-5, 45 | "weight_decay": 0.1, 46 | "parameter_groups": [ 47 | // Apply weight decay to pre-trained params, excluding LayerNorm params and biases 48 | [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}], 49 | ], 50 | }, 51 | "num_epochs": 1, 52 | "checkpointer": { 53 | "num_serialized_models_to_keep": -1, 54 | }, 55 | "grad_norm": 1.0, 56 | "learning_rate_scheduler": { 57 | "type": "slanted_triangular", 58 | }, 59 | }, 60 | } -------------------------------------------------------------------------------- /training_config/transformer_cls.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = std.extVar("TRANSFORMER_MODEL"); 4 | // The hidden size of the model, which can be found in its config as "hidden_size". 5 | local transformer_dim = std.parseInt(std.extVar("TRANSFORMER_DIM")); 6 | 7 | // This will be used to set the max/min # of tokens in the positive and negative examples. 8 | local max_length = 512; 9 | // Certain transformers use the last special token in the sequence to produce sequence embeddings 10 | // (e.g XLNet). 11 | local cls_is_last_token = false; 12 | 13 | { 14 | "vocabulary": { 15 | "type": "empty" 16 | }, 17 | "dataset_reader": { 18 | "type": "declutr", 19 | "lazy": true, 20 | "tokenizer": { 21 | "type": "pretrained_transformer", 22 | "model_name": transformer_model, 23 | // Account for special tokens (e.g. CLS and SEP), otherwise a cryptic error is thrown. 
24 | "max_length": max_length - 2, 25 | }, 26 | "token_indexers": { 27 | "tokens": { 28 | "type": "pretrained_transformer", 29 | "model_name": transformer_model, 30 | }, 31 | }, 32 | }, 33 | "train_data_path": null, 34 | "model": { 35 | "type": "declutr", 36 | "text_field_embedder": { 37 | "type": "mlm", 38 | "token_embedders": { 39 | "tokens": { 40 | "type": "pretrained_transformer_mlm", 41 | "model_name": transformer_model, 42 | "masked_language_modeling": true 43 | }, 44 | }, 45 | }, 46 | "seq2vec_encoder": { 47 | "type": "cls_pooler", 48 | "embedding_dim": transformer_dim, 49 | "cls_is_last_token": cls_is_last_token 50 | }, 51 | }, 52 | "data_loader": { 53 | "batch_size": 16, 54 | "num_workers": 1, 55 | "drop_last": true, 56 | }, 57 | "trainer": { 58 | "type": "no_op" 59 | }, 60 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # vscode 107 | .vscode/ 108 | 109 | # MacOS stuff: 110 | # General 111 | .DS_Store 112 | .AppleDouble 113 | .LSOverride 114 | # Icon must end with two \r 115 | Icon 116 | # Thumbnails 117 | ._* 118 | # Files that might appear in the root of a volume 119 | .DocumentRevisions-V100 120 | .fseventsd 121 | .Spotlight-V100 122 | .TemporaryItems 123 | .Trashes 124 | .VolumeIcon.icns 125 | .com.apple.timemachine.donotpresent 126 | # Directories potentially created on remote AFP share 127 | .AppleDB 128 | .AppleDesktop 129 | Network Trash Folder 130 | Temporary Items 131 | .apdisk 132 | 133 | # Added by us 134 | datasets 135 | pretrained_models 136 | SentEval -------------------------------------------------------------------------------- /tests/test_encoder.py: -------------------------------------------------------------------------------- 1 | from pathlib import 
Path 2 | from typing import List 3 | 4 | import pytest 5 | import torch 6 | from declutr import Encoder 7 | from hypothesis import given, settings 8 | from hypothesis.strategies import booleans 9 | from torch.nn import CosineSimilarity 10 | 11 | 12 | class TestEncoder: 13 | cosine = CosineSimilarity(dim=-1) 14 | 15 | # The base model will take longer than the small model, which triggers a test timing error. 16 | # Turn off deadlines to avoid this. 17 | @settings(deadline=None) 18 | @given(sphereize=booleans()) 19 | def test_encoder( 20 | self, inputs: List[str], inputs_filepath: Path, encoder: Encoder, sphereize: bool 21 | ) -> None: 22 | # The relative ranking should not change if sphereize is True/False, so run tests with both. 23 | encoder._sphereize = sphereize 24 | 25 | # Run three distinct tests, which should cover all use cases of Encoder: 26 | # 1. A List[str] input where batch_size is not None. 27 | embeddings = encoder(inputs, batch_size=len(inputs)) 28 | embeddings = torch.from_numpy(embeddings) 29 | # These are hard-coded examples that should have the highest cosine similarity. 30 | assert torch.topk(self.cosine(embeddings[2], embeddings), k=2)[-1][-1].item() == 3 31 | assert torch.topk(self.cosine(embeddings[6], embeddings), k=2)[-1][-1].item() == 7 32 | 33 | # 2. A str input where batch_size is None. Check that the expected UserWarning is raised. 34 | embeddings = [] 35 | for text in inputs: 36 | if sphereize: 37 | with pytest.warns(UserWarning): 38 | embeddings.append(encoder(text, batch_size=None)) 39 | else: 40 | embeddings.append(encoder(text, batch_size=None)) 41 | embeddings = torch.as_tensor(embeddings).squeeze(1) 42 | assert torch.topk(self.cosine(embeddings[2], embeddings), k=2)[-1][-1].item() == 3 43 | assert torch.topk(self.cosine(embeddings[6], embeddings), k=2)[-1][-1].item() == 7 44 | 45 | # 3. A filepath input that points to file with one example per line. 46 | embeddings = encoder(inputs_filepath, batch_size=len(inputs)) 47 | embeddings = torch.from_numpy(embeddings) 48 | assert torch.topk(self.cosine(embeddings[2], embeddings), k=2)[-1][-1].item() == 3 49 | assert torch.topk(self.cosine(embeddings[6], embeddings), k=2)[-1][-1].item() == 7 50 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | To submit a pull request, please do the following: 4 | 5 | 1. Fork the [repository](https://github.com/JohnGiorgi/DeCLUTR) by clicking on the 'Fork' button on the repository's page. This creates a copy of the code under your GitHub user account. 6 | 7 | 2. Clone your fork to your local disk, and add the base repository as a remote: 8 | 9 | ```bash 10 | $ git clone git@github.com:/DeCLUTR.git 11 | $ cd DeCLUTR 12 | $ git remote add upstream https://github.com/JohnGiorgi/DeCLUTR.git 13 | ``` 14 | 15 | 3. Create a new branch to hold your development changes: 16 | 17 | ```bash 18 | $ git checkout -b a-descriptive-name-for-my-changes 19 | ``` 20 | 21 | __do not__ work on the `master` branch. 22 | 23 | 4. Set up a development environment by running the following command in a virtual environment: 24 | 25 | ```bash 26 | $ pip install -e ".[dev]" 27 | ``` 28 | 29 | (If the repository was already installed in the virtual environment, remove it with `pip uninstall` before reinstalling it in editable mode with the `-e` flag.) 30 | 31 | 5. Develop the features on your branch. 
32 | 33 | This repository relies on `black` to format its source code 34 | consistently. After you make changes, format them with: 35 | 36 | ```bash 37 | $ black declutr 38 | ``` 39 | 40 | This repository also uses `flake8` to check for coding mistakes. To run the checks locally: 41 | 42 | ```bash 43 | $ flake8 declutr 44 | ``` 45 | 46 | Once you're happy with your changes, add changed files using `git add` and 47 | make a commit with `git commit` to record your changes locally: 48 | 49 | ```bash 50 | $ git add modified_file.py 51 | $ git commit 52 | ``` 53 | 54 | Please write [good commit messages](https://chris.beams.io/posts/git-commit/). 55 | 56 | It is a good idea to sync your copy of the code with the original 57 | repository regularly. This way you can quickly account for changes: 58 | 59 | ```bash 60 | $ git fetch upstream 61 | $ git rebase upstream/master 62 | ``` 63 | 64 | Push the changes to your account using: 65 | 66 | ```bash 67 | $ git push -u origin a-descriptive-name-for-my-changes 68 | ``` 69 | 70 | 6. Once you are satisfied, go to the webpage of your fork on GitHub. 71 | Click on 'Pull request' to send your changes to the project maintainers for review. 72 | 73 | > This is a work in progress. Inspiration for these guidelines were drawn from [here](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md) and [here](https://github.com/nayafia/contributing-template). -------------------------------------------------------------------------------- /declutr/common/masked_lm_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torch 4 | from transformers import PreTrainedTokenizer 5 | 6 | from allennlp.data import TextFieldTensors 7 | 8 | 9 | def _mask_tokens( 10 | inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, mlm_probability: float = 0.15 11 | ) -> Tuple[torch.Tensor, torch.Tensor]: 12 | """Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% 13 | original. Copied from: 14 | https://github.com/huggingface/transformers/blob/master/examples/run_language_modeling.py""" 15 | 16 | if tokenizer.mask_token is None: 17 | raise ValueError( 18 | ( 19 | "This tokenizer does not have a mask token which is necessary for masked language" 20 | " modeling. Remove the --mlm flag if you want to use this tokenizer." 
21 | ) 22 | ) 23 | 24 | labels = inputs.clone() 25 | # We sample a few tokens in each sequence for masked-LM training (with probability 26 | # mlm_probability defaults to 0.15 in Bert/RoBERTa) 27 | probability_matrix = torch.full(labels.shape, mlm_probability) 28 | special_tokens_mask = [ 29 | tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) 30 | for val in labels.tolist() 31 | ] 32 | probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) 33 | if tokenizer._pad_token is not None: 34 | padding_mask = labels.eq(tokenizer.pad_token_id) 35 | probability_matrix.masked_fill_(padding_mask, value=0.0) 36 | masked_indices = torch.bernoulli(probability_matrix).bool() 37 | labels[~masked_indices] = -100 # We only compute loss on masked tokens 38 | 39 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) 40 | indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices 41 | inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token) 42 | 43 | # 10% of the time, we replace masked input tokens with random word 44 | indices_random = ( 45 | torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced 46 | ) 47 | random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long) 48 | inputs[indices_random] = random_words[indices_random] 49 | 50 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged 51 | return inputs, labels 52 | 53 | 54 | def mask_tokens( 55 | tokens: TextFieldTensors, 56 | tokenizer: PreTrainedTokenizer, 57 | mlm_probability: float = 0.15, 58 | ) -> TextFieldTensors: 59 | device = tokens["tokens"]["token_ids"].device 60 | inputs, labels = _mask_tokens( 61 | inputs=tokens["tokens"]["token_ids"].to("cpu"), 62 | tokenizer=tokenizer, 63 | mlm_probability=mlm_probability, 64 | ) 65 | tokens["tokens"]["token_ids"] = inputs.to(device) 66 | tokens["tokens"]["masked_lm_labels"] = labels.to(device) 67 | return tokens 68 | -------------------------------------------------------------------------------- /declutr/common/model_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torch 4 | import torch.distributed as dist 5 | from allennlp.common import util 6 | from allennlp.data import TextFieldTensors 7 | 8 | 9 | def unpack_batch(tokens: TextFieldTensors) -> TextFieldTensors: 10 | """If the tensors of `tokens` are three-dimensional, we reshape them to be two-dimensional 11 | before returning the `TextFieldTensors` object. Otherwise, this is a no-op. 12 | 13 | # Parameters 14 | 15 | tokens : `TextFieldTensors` 16 | A `TextFieldTensors` object containnig the tensors to (possibly) reshape. 17 | 18 | # Returns 19 | 20 | `TextFieldTensors` 21 | Containing the (possibly) reshaped tensors. 22 | """ 23 | for name, tensor in tokens["tokens"].items(): 24 | if len(tensor.size()) == 3: 25 | tokens["tokens"][name] = tensor.reshape(tensor.size(0) * tensor.size(1), tensor.size(2)) 26 | return tokens 27 | 28 | 29 | def all_gather_anchor_positive_pairs( 30 | anchors: torch.Tensor, positives: torch.Tensor 31 | ) -> Tuple[torch.Tensor, torch.Tensor]: 32 | """If training on 2 or more GPUs, `all_gather`s the embeddings produced on each replica, 33 | ensuring that the gradients for the embeddings produced on each replica are not lost. The 34 | returned anchor, positive pairs can be fed to a contrastive loss. 
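    For example, with a per-replica batch size of 16 on 2 GPUs (an effective batch size of 32),
    gathering yields 32 anchor-positive pairs, so each anchor is contrasted against
    2 * (32 - 1) = 62 negatives rather than 2 * (16 - 1) = 30.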
This method is necessary to 35 | ensure that we train against the expected number of negatives 2 * (batch size - 1) per batch, 36 | as a naive implementation would end up training against 2 * (batch size / n_gpus - 1) number of 37 | negatives. If we are not training on 2 or more GPUs, this method is a no-op and returns its 38 | inputs. 39 | 40 | # Parameters 41 | 42 | anchors : torch.Tensor 43 | Embedded text representing the anchors. 44 | positives : TextFieldTensors 45 | Embedded text representing the positives. 46 | 47 | # Returns 48 | 49 | Tuple[torch.Tensor, torch.Tensor] 50 | Embedded anchor, positive pairs that can be fed to a contrastive loss. 51 | """ 52 | 53 | # If we are not using distributed training, this is a no-op. 54 | if not util.is_distributed(): 55 | return anchors, positives 56 | 57 | # Gather the encoded anchors and positives on all replicas 58 | anchors_list = [torch.zeros_like(anchors) for _ in range(dist.get_world_size())] 59 | positives_list = [torch.zeros_like(positives) for _ in range(dist.get_world_size())] 60 | dist.all_gather(anchors_list, anchors.contiguous()) 61 | dist.all_gather(positives_list, positives.contiguous()) 62 | # The gathered copy of the current replicas positive pairs have no gradients, so we overwrite 63 | # them with the positive pairs generated on this replica, which DO have gradients. 64 | anchors_list[dist.get_rank()] = anchors 65 | positives_list[dist.get_rank()] = positives 66 | # Finally, we concatenate the positive pairs so they can be fed to the contrastive loss. 67 | anchors = torch.cat(anchors_list) 68 | positives = torch.cat(positives_list) 69 | 70 | return anchors, positives 71 | -------------------------------------------------------------------------------- /training_config/declutr_base.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = "roberta-base"; 4 | 5 | // This will be used to set the max/min # of tokens in the positive and negative examples. 6 | local max_length = 512; 7 | local min_length = 32; 8 | 9 | { 10 | "vocabulary": { 11 | "type": "empty" 12 | }, 13 | "dataset_reader": { 14 | "type": "declutr", 15 | "lazy": true, 16 | "num_anchors": 2, 17 | "num_positives": 2, 18 | "max_span_len": max_length, 19 | "min_span_len": min_length, 20 | "tokenizer": { 21 | "type": "pretrained_transformer", 22 | "model_name": transformer_model, 23 | // Account for special tokens (e.g. CLS and SEP), otherwise a cryptic error is thrown. 24 | "max_length": max_length - 2, 25 | }, 26 | "token_indexers": { 27 | "tokens": { 28 | "type": "pretrained_transformer", 29 | "model_name": transformer_model, 30 | }, 31 | }, 32 | }, 33 | "train_data_path": null, 34 | "model": { 35 | "type": "declutr", 36 | "text_field_embedder": { 37 | "type": "mlm", 38 | "token_embedders": { 39 | "tokens": { 40 | "type": "pretrained_transformer_mlm", 41 | "model_name": transformer_model, 42 | "masked_language_modeling": true 43 | }, 44 | }, 45 | }, 46 | "loss": { 47 | "type": "nt_xent", 48 | "temperature": 0.05, 49 | }, 50 | // There was a small bug in the original implementation that caused gradients derived from 51 | // the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during 52 | // training. This has been fixed. To reproduce results from the paper, set this to false. 
53 | // Note that this will have no effect if you are not using distributed training with more 54 | // than 1 GPU. 55 | "scale_fix": false 56 | }, 57 | "data_loader": { 58 | "batch_size": 4, 59 | "num_workers": 1, 60 | "drop_last": true, 61 | }, 62 | "trainer": { 63 | // Set use_amp to true to use automatic mixed-precision during training (if your GPU supports it) 64 | "use_amp": true, 65 | "optimizer": { 66 | "type": "huggingface_adamw", 67 | "lr": 5e-5, 68 | "eps": 1e-06, 69 | "correct_bias": false, 70 | "weight_decay": 0.1, 71 | "parameter_groups": [ 72 | // Apply weight decay to pre-trained params, excluding LayerNorm params and biases 73 | [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}], 74 | ], 75 | }, 76 | "num_epochs": 1, 77 | "checkpointer": { 78 | // A value of null or -1 will save the weights of the model at the end of every epoch 79 | "num_serialized_models_to_keep": -1, 80 | }, 81 | "grad_norm": 1.0, 82 | "learning_rate_scheduler": { 83 | "type": "slanted_triangular", 84 | }, 85 | }, 86 | } -------------------------------------------------------------------------------- /training_config/declutr_small.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = "distilroberta-base"; 4 | 5 | // This will be used to set the max/min # of tokens in the positive and negative examples. 6 | local max_length = 512; 7 | local min_length = 32; 8 | 9 | { 10 | "vocabulary": { 11 | "type": "empty" 12 | }, 13 | "dataset_reader": { 14 | "type": "declutr", 15 | "lazy": true, 16 | "num_anchors": 2, 17 | "num_positives": 2, 18 | "max_span_len": max_length, 19 | "min_span_len": min_length, 20 | "tokenizer": { 21 | "type": "pretrained_transformer", 22 | "model_name": transformer_model, 23 | // Account for special tokens (e.g. CLS and SEP), otherwise a cryptic error is thrown. 24 | "max_length": max_length - 2, 25 | }, 26 | "token_indexers": { 27 | "tokens": { 28 | "type": "pretrained_transformer", 29 | "model_name": transformer_model, 30 | }, 31 | }, 32 | }, 33 | "train_data_path": null, 34 | "model": { 35 | "type": "declutr", 36 | "text_field_embedder": { 37 | "type": "mlm", 38 | "token_embedders": { 39 | "tokens": { 40 | "type": "pretrained_transformer_mlm", 41 | "model_name": transformer_model, 42 | "masked_language_modeling": true 43 | }, 44 | }, 45 | }, 46 | "loss": { 47 | "type": "nt_xent", 48 | "temperature": 0.05, 49 | }, 50 | // There was a small bug in the original implementation that caused gradients derived from 51 | // the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during 52 | // training. This has been fixed. To reproduce results from the paper, set this to false. 53 | // Note that this will have no effect if you are not using distributed training with more 54 | // than 1 GPU. 
55 | "scale_fix": false 56 | }, 57 | "data_loader": { 58 | "batch_size": 4, 59 | "num_workers": 1, 60 | "drop_last": true, 61 | }, 62 | "trainer": { 63 | // Set use_amp to true to use automatic mixed-precision during training (if your GPU supports it) 64 | "use_amp": true, 65 | "optimizer": { 66 | "type": "huggingface_adamw", 67 | "lr": 5e-5, 68 | "eps": 1e-06, 69 | "correct_bias": false, 70 | "weight_decay": 0.1, 71 | "parameter_groups": [ 72 | // Apply weight decay to pre-trained params, excluding LayerNorm params and biases 73 | [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}], 74 | ], 75 | }, 76 | "num_epochs": 1, 77 | "checkpointer": { 78 | // A value of null or -1 will save the weights of the model at the end of every epoch 79 | "num_serialized_models_to_keep": -1, 80 | }, 81 | "grad_norm": 1.0, 82 | "learning_rate_scheduler": { 83 | "type": "slanted_triangular", 84 | }, 85 | }, 86 | } -------------------------------------------------------------------------------- /training_config/declutr.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = std.extVar("TRANSFORMER_MODEL"); 4 | 5 | // This will be used to set the max/min # of tokens in the positive and negative examples. 6 | local max_length = 512; 7 | local min_length = 32; 8 | 9 | { 10 | "vocabulary": { 11 | "type": "empty" 12 | }, 13 | "dataset_reader": { 14 | "type": "declutr", 15 | "lazy": true, 16 | "num_anchors": 2, 17 | "num_positives": 2, 18 | "max_span_len": max_length, 19 | "min_span_len": min_length, 20 | "tokenizer": { 21 | "type": "pretrained_transformer", 22 | "model_name": transformer_model, 23 | // Account for special tokens (e.g. CLS and SEP), otherwise a cryptic error is thrown. 24 | "max_length": max_length - 2, 25 | }, 26 | "token_indexers": { 27 | "tokens": { 28 | "type": "pretrained_transformer", 29 | "model_name": transformer_model, 30 | }, 31 | }, 32 | }, 33 | "train_data_path": null, 34 | "model": { 35 | "type": "declutr", 36 | "text_field_embedder": { 37 | "type": "mlm", 38 | "token_embedders": { 39 | "tokens": { 40 | "type": "pretrained_transformer_mlm", 41 | "model_name": transformer_model, 42 | "masked_language_modeling": true 43 | }, 44 | }, 45 | }, 46 | "loss": { 47 | "type": "nt_xent", 48 | "temperature": 0.05, 49 | }, 50 | // There was a small bug in the original implementation that caused gradients derived from 51 | // the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during 52 | // training. This has been fixed. To reproduce results from the paper, set this to false. 53 | // Note that this will have no effect if you are not using distributed training with more 54 | // than 1 GPU. 
55 | "scale_fix": false 56 | }, 57 | "data_loader": { 58 | "batch_size": 4, 59 | "num_workers": 1, 60 | "drop_last": true, 61 | }, 62 | "trainer": { 63 | // Set use_amp to true to use automatic mixed-precision during training (if your GPU supports it) 64 | "use_amp": true, 65 | "optimizer": { 66 | "type": "huggingface_adamw", 67 | "lr": 5e-5, 68 | "eps": 1e-06, 69 | "correct_bias": false, 70 | "weight_decay": 0.1, 71 | "parameter_groups": [ 72 | // Apply weight decay to pre-trained params, excluding LayerNorm params and biases 73 | [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}], 74 | ], 75 | }, 76 | "num_epochs": 1, 77 | "checkpointer": { 78 | // A value of null or -1 will save the weights of the model at the end of every epoch 79 | "num_serialized_models_to_keep": -1, 80 | }, 81 | "grad_norm": 1.0, 82 | "learning_rate_scheduler": { 83 | "type": "slanted_triangular", 84 | }, 85 | }, 86 | } -------------------------------------------------------------------------------- /training_config/contrastive_only.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = std.extVar("TRANSFORMER_MODEL"); 4 | 5 | // This will be used to set the max/min # of tokens in the positive and negative examples. 6 | local max_length = 512; 7 | local min_length = 32; 8 | 9 | { 10 | "vocabulary": { 11 | "type": "empty" 12 | }, 13 | "dataset_reader": { 14 | "type": "declutr", 15 | "lazy": true, 16 | "num_anchors": 2, 17 | "num_positives": 2, 18 | "max_span_len": max_length, 19 | "min_span_len": min_length, 20 | "tokenizer": { 21 | "type": "pretrained_transformer", 22 | "model_name": transformer_model, 23 | // Account for special tokens (e.g. CLS and SEP), otherwise a cryptic error is thrown. 24 | "max_length": max_length - 2, 25 | }, 26 | "token_indexers": { 27 | "tokens": { 28 | "type": "pretrained_transformer", 29 | "model_name": transformer_model, 30 | }, 31 | }, 32 | }, 33 | "train_data_path": null, 34 | "model": { 35 | "type": "declutr", 36 | "text_field_embedder": { 37 | "type": "mlm", 38 | "token_embedders": { 39 | "tokens": { 40 | "type": "pretrained_transformer_mlm", 41 | "model_name": transformer_model, 42 | "masked_language_modeling": false 43 | }, 44 | }, 45 | }, 46 | "loss": { 47 | "type": "nt_xent", 48 | "temperature": 0.05, 49 | }, 50 | // There was a small bug in the original implementation that caused gradients derived from 51 | // the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during 52 | // training. This has been fixed. To reproduce results from the paper, set this to false. 53 | // Note that this will have no effect if you are not using distributed training with more 54 | // than 1 GPU. 
55 | "scale_fix": false 56 | }, 57 | "data_loader": { 58 | "batch_size": 4, 59 | "num_workers": 1, 60 | "drop_last": true, 61 | }, 62 | "trainer": { 63 | // Set use_amp to true to use automatic mixed-precision during training (if your GPU supports it) 64 | "use_amp": true, 65 | "optimizer": { 66 | "type": "huggingface_adamw", 67 | "lr": 5e-5, 68 | "eps": 1e-06, 69 | "correct_bias": false, 70 | "weight_decay": 0.1, 71 | "parameter_groups": [ 72 | // Apply weight decay to pre-trained params, excluding LayerNorm params and biases 73 | [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}], 74 | ], 75 | }, 76 | "num_epochs": 1, 77 | "checkpointer": { 78 | // A value of null or -1 will save the weights of the model at the end of every epoch 79 | "num_serialized_models_to_keep": -1, 80 | }, 81 | "grad_norm": 1.0, 82 | "learning_rate_scheduler": { 83 | "type": "slanted_triangular", 84 | }, 85 | }, 86 | } -------------------------------------------------------------------------------- /training_config/mlm_only.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = std.extVar("TRANSFORMER_MODEL"); 4 | 5 | // This will be used to set the max/min # of tokens in the positive and negative examples. 6 | local max_length = 512; 7 | local min_length = 32; 8 | 9 | { 10 | "vocabulary": { 11 | "type": "empty" 12 | }, 13 | "dataset_reader": { 14 | "type": "declutr", 15 | "lazy": true, 16 | // Technically, we don't need to sample anchors or positives when training with MLM only. 17 | // However, to make this experiment as comparable as possible to the "Contrastive only" 18 | // and "Both" experiments, we sample the same number of anchors and MLM on all of them. 19 | "num_anchors": 2, 20 | "num_positives": 1, 21 | "max_span_len": max_length, 22 | "min_span_len": min_length, 23 | "tokenizer": { 24 | "type": "pretrained_transformer", 25 | "model_name": transformer_model, 26 | // Account for special tokens (e.g. CLS and SEP), otherwise a cryptic error is thrown. 27 | "max_length": max_length - 2, 28 | }, 29 | "token_indexers": { 30 | "tokens": { 31 | "type": "pretrained_transformer", 32 | "model_name": transformer_model, 33 | }, 34 | }, 35 | }, 36 | "train_data_path": null, 37 | "model": { 38 | "type": "declutr", 39 | "text_field_embedder": { 40 | "type": "mlm", 41 | "token_embedders": { 42 | "tokens": { 43 | "type": "pretrained_transformer_mlm", 44 | "model_name": transformer_model, 45 | "masked_language_modeling": true 46 | }, 47 | }, 48 | }, 49 | // There was a small bug in the original implementation that caused gradients derived from 50 | // the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during 51 | // training. This has been fixed. To reproduce results from the paper, set this to false. 52 | // Note that this will have no effect if you are not using distributed training with more 53 | // than 1 GPU. 
54 | "scale_fix": false 55 | }, 56 | "data_loader": { 57 | "batch_size": 4, 58 | "num_workers": 1, 59 | "drop_last": true, 60 | }, 61 | "trainer": { 62 | // Set use_amp to true to use automatic mixed-precision during training (if your GPU supports it) 63 | "use_amp": true, 64 | "optimizer": { 65 | "type": "huggingface_adamw", 66 | "lr": 5e-5, 67 | "eps": 1e-06, 68 | "correct_bias": false, 69 | "weight_decay": 0.1, 70 | "parameter_groups": [ 71 | // Apply weight decay to pre-trained params, excluding LayerNorm params and biases 72 | [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}], 73 | ], 74 | }, 75 | "num_epochs": 1, 76 | "checkpointer": { 77 | // A value of null or -1 will save the weights of the model at the end of every epoch 78 | "num_serialized_models_to_keep": -1, 79 | }, 80 | "grad_norm": 1.0, 81 | "learning_rate_scheduler": { 82 | "type": "slanted_triangular", 83 | }, 84 | }, 85 | } -------------------------------------------------------------------------------- /tests/common/test_model_utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import torch 4 | from allennlp.data import TextFieldTensors 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import integers 7 | 8 | from declutr.common import model_utils 9 | 10 | 11 | class TestModelUtils: 12 | @settings(deadline=None) 13 | @given( 14 | batch_size=integers(min_value=1, max_value=4), 15 | num_anchors=integers(min_value=1, max_value=4), 16 | max_length=integers(min_value=1, max_value=16), 17 | ) 18 | def test_unpack_batch(self, batch_size: int, num_anchors: int, max_length: int) -> None: 19 | # Create some dummy data. 20 | two_dim_tensor = torch.randn(batch_size, max_length) 21 | two_dim_input: TextFieldTensors = { 22 | "tokens": { 23 | "token_ids": two_dim_tensor, 24 | "mask": torch.ones_like(two_dim_tensor), 25 | "type_ids": torch.ones_like(two_dim_tensor), 26 | } 27 | } 28 | three_dim_tensor = torch.randn(batch_size, num_anchors, max_length) 29 | three_dim_input: TextFieldTensors = { 30 | "tokens": { 31 | "token_ids": three_dim_tensor, 32 | "mask": torch.ones_like(three_dim_tensor), 33 | "type_ids": torch.ones_like(three_dim_tensor), 34 | } 35 | } 36 | four_dim_tensor = torch.randn(batch_size, num_anchors, num_anchors, max_length) 37 | four_dim_input: TextFieldTensors = { 38 | "tokens": { 39 | "token_ids": four_dim_tensor, 40 | "mask": torch.ones_like(four_dim_tensor), 41 | "type_ids": torch.ones_like(four_dim_tensor), 42 | } 43 | } 44 | 45 | # Only TextFieldTensors with tensors of three dimensions should be reshaped... 46 | # Tensors are updated in-place, so deepcopy before passing to unpack_batch 47 | actual_three_input_dim = model_utils.unpack_batch(deepcopy(three_dim_input)) 48 | for name, tensor in actual_three_input_dim["tokens"].items(): 49 | assert torch.equal( 50 | tensor, 51 | three_dim_input["tokens"][name].reshape(batch_size * num_anchors, max_length), 52 | ) 53 | # ...unpack_batch is a no-op for TextFieldTensors with tensors less than or greater than 3D. 
54 | actual_two_dim_input = model_utils.unpack_batch(deepcopy(two_dim_input)) 55 | for name, tensor in actual_two_dim_input["tokens"].items(): 56 | assert torch.equal(tensor, two_dim_input["tokens"][name]) 57 | actual_four_dim_input = model_utils.unpack_batch(deepcopy(four_dim_input)) 58 | for name, tensor in actual_four_dim_input["tokens"].items(): 59 | assert torch.equal(tensor, four_dim_input["tokens"][name]) 60 | 61 | def test_all_gather_anchor_positive_pairs_no_op(self) -> None: 62 | """Check that `all_gather_anchor_positive_pairs` is a no-op when not in distributed mode.""" 63 | num_anchors = 2 64 | num_positives = 2 65 | batch_size = 16 66 | embedding_dim = 256 67 | 68 | expected_anchors = torch.randn(num_anchors, batch_size, embedding_dim) 69 | expected_positives = torch.randn(num_positives, batch_size, embedding_dim) 70 | actual_anchors, actual_positives = model_utils.all_gather_anchor_positive_pairs( 71 | expected_anchors, expected_positives 72 | ) 73 | 74 | assert torch.equal(actual_anchors, expected_anchors) 75 | assert torch.equal(actual_positives, expected_positives) 76 | -------------------------------------------------------------------------------- /declutr/modules/text_field_embedders/mlm_text_field_embedder.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from typing import Dict 3 | 4 | import torch 5 | 6 | from allennlp.common.checks import ConfigurationError 7 | from allennlp.data import TextFieldTensors 8 | from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder 9 | from allennlp.modules.text_field_embedders.text_field_embedder import TextFieldEmbedder 10 | from allennlp.modules.time_distributed import TimeDistributed 11 | from allennlp.modules.token_embedders.token_embedder import TokenEmbedder 12 | 13 | 14 | @TextFieldEmbedder.register("mlm") 15 | class MLMTextFieldEmbedder(BasicTextFieldEmbedder): 16 | """ 17 | This is a a simple wrapper around `BasicTextFieldEmbedder` that accounts for the fact that 18 | our custom PretrainedTransformerEmbedderMLM returns a tuple containing the loss for the masked 19 | language modelling objective as well as some embedded text. 20 | 21 | Registered as a `TextFieldEmbedder` with name "mlm". 22 | 23 | # Parameters 24 | 25 | token_embedders : `Dict[str, TokenEmbedder]`, required. 26 | A dictionary mapping token embedder names to implementations. 27 | These names should match the corresponding indexer used to generate 28 | the tensor passed to the TokenEmbedder. 29 | """ 30 | 31 | def __init__(self, token_embedders: Dict[str, TokenEmbedder]) -> None: 32 | super().__init__(token_embedders) 33 | 34 | def forward( 35 | self, text_field_input: TextFieldTensors, num_wrapping_dims: int = 0, **kwargs 36 | ) -> torch.Tensor: 37 | if self._token_embedders.keys() != text_field_input.keys(): 38 | message = "Mismatched token keys: %s and %s" % ( 39 | str(self._token_embedders.keys()), 40 | str(text_field_input.keys()), 41 | ) 42 | raise ConfigurationError(message) 43 | 44 | embedded_representations = [] 45 | for key in self._ordered_embedder_keys: 46 | # Note: need to use getattr here so that the pytorch voodoo 47 | # with submodules works with multiple GPUs. 
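            # (The parent class, `BasicTextFieldEmbedder`, registers each embedder as a submodule
            # named "token_embedder_<key>", which is why it is looked up by attribute name here
            # rather than read from a plain dict.)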
48 | embedder = getattr(self, "token_embedder_{}".format(key)) 49 | forward_params = inspect.signature(embedder.forward).parameters 50 | forward_params_values = {} 51 | missing_tensor_args = set() 52 | for param in forward_params.keys(): 53 | if param in kwargs: 54 | forward_params_values[param] = kwargs[param] 55 | else: 56 | missing_tensor_args.add(param) 57 | 58 | for _ in range(num_wrapping_dims): 59 | embedder = TimeDistributed(embedder) 60 | 61 | tensors: Dict[str, torch.Tensor] = text_field_input[key] 62 | if len(tensors) == 1 and len(missing_tensor_args) == 1: 63 | # If there's only one tensor argument to the embedder, and we just have one tensor 64 | # to embed, we can just pass in that tensor, without requiring a name match. 65 | masked_lm_loss, token_vectors = embedder( 66 | list(tensors.values())[0], **forward_params_values 67 | ) 68 | else: 69 | # If there are multiple tensor arguments, we have to require matching names from 70 | # the TokenIndexer. I don't think there's an easy way around that. 71 | masked_lm_loss, token_vectors = embedder(**tensors, **forward_params_values) 72 | if token_vectors is not None: 73 | # To handle some very rare use cases, we allow the return value of the embedder to 74 | # be None; we just skip it in that case. 75 | embedded_representations.append(token_vectors) 76 | return masked_lm_loss, torch.cat(embedded_representations, dim=-1) 77 | -------------------------------------------------------------------------------- /declutr/losses/pytorch_metric_learning.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import Tuple 3 | 4 | import torch 5 | from allennlp.common import Registrable 6 | 7 | from declutr.miners import PyTorchMetricLearningMiner 8 | from pytorch_metric_learning import losses 9 | 10 | 11 | class PyTorchMetricLearningLoss(Registrable): 12 | """This class allows us to implement `Registrable` for PyTorch Metric Learning loss functions. 13 | Subclasses of this class should also subclass a loss function from PyTorch Metric Learning 14 | (see: https://kevinmusgrave.github.io/pytorch-metric-learning/losses/), and accept as arguments 15 | to the constructor the same arguments that the loss function does. See `NTXentLoss` below for 16 | an example. 17 | """ 18 | 19 | default_implementation = "nt_xent" 20 | 21 | @classmethod 22 | def get_embeddings_and_labels( 23 | self, anchors: torch.Tensor, positives: torch.Tensor 24 | ) -> Tuple[torch.Tensor, torch.Tensor]: 25 | """Formats a pair of anchor, positive embeddings for use with a PyTorch Metric Learning loss 26 | function (https://github.com/KevinMusgrave/pytorch-metric-learning). These loss functions 27 | expect a single embedding tensor, and a corresponding set of labels. Given two tensors: 28 | `anchor_embeddings` and `positive_embeddings` each of shape `(batch_size, embedding_dim)`, 29 | concatenate them along the first dimension to produce a single tensor, `embeddings`, of 30 | shape `(batch_size * 2, embedding_dim)`. Then, generate the corresponding `labels` tensor of 31 | shape `(batch_size * 2)` by assigning a matching integer index to each pair of anchor, 32 | positive embeddings in `embeddings`. 33 | 34 | # Parameters 35 | 36 | anchor_embeddings : `torch.Tensor` 37 | Encoded representations of the anchors. 38 | positive_embeddings : `torch.Tensor` 39 | Encoded representations of the positives. 
40 | 41 | # Returns 42 | 43 | A tuple of embeddings and labels that can be fed directly to any PyTorch Metric Learning 44 | loss function. 45 | """ 46 | embeddings = torch.cat((anchors, positives)) 47 | # When using CrossBatchMemory, labels persist across batches so they need to be unique. 48 | # By choosing a random integer in (0, sys.maxsize) we can be reasonably sure of this. 49 | # Obviously, there are better (i.e. deterministic ways to do this), but I don't have 50 | # access to the current batch id or some other uniquely identifying value. 51 | indices = torch.randint(sys.maxsize, (anchors.size(0),), device=anchors.device) 52 | labels = torch.cat((indices, indices)) 53 | 54 | return embeddings, labels 55 | 56 | 57 | @PyTorchMetricLearningLoss.register("cross_batch_memory") 58 | class CrossBatchMemory(PyTorchMetricLearningLoss, losses.CrossBatchMemory): 59 | """Wraps the `CrossBatchMemory` implementation from Pytorch Metric Learning: 60 | (https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#crossbatchmemory). 61 | 62 | Registered as a `PyTorchMetricLearningLoss` with name "cross_batch_memory". 63 | """ 64 | 65 | def __init__( 66 | self, 67 | loss: PyTorchMetricLearningLoss, 68 | embedding_size: int, 69 | memory_size: int = 1024, 70 | miner: PyTorchMetricLearningMiner = None, 71 | ) -> None: 72 | 73 | super().__init__( 74 | loss=loss, 75 | embedding_size=embedding_size, 76 | memory_size=memory_size, 77 | miner=miner, 78 | ) 79 | 80 | 81 | @PyTorchMetricLearningLoss.register("nt_xent") 82 | class NTXentLoss(PyTorchMetricLearningLoss, losses.NTXentLoss): 83 | """Wraps the `NTXentLoss` implementation from Pytorch Metric Learning: 84 | (https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#ntxentloss). 85 | 86 | Registered as a `PyTorchMetricLearningLoss` with name "nt_xent". 
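
    # Example Usage

    A minimal, illustrative sketch of computing the loss for a batch of anchor, positive pairs.
    The batch size and embedding dimension below are arbitrary values chosen for illustration:

    ```python
    import torch

    anchors = torch.randn(4, 128)    # (batch_size, embedding_dim)
    positives = torch.randn(4, 128)  # (batch_size, embedding_dim)

    loss_fn = NTXentLoss(temperature=0.05)
    # Anchor i and positive i are assigned the same label, i.e. labels[:4] == labels[4:].
    embeddings, labels = NTXentLoss.get_embeddings_and_labels(anchors, positives)
    loss = loss_fn(embeddings, labels)  # embeddings: (8, 128), labels: (8,)
    ```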
87 | """ 88 | 89 | def __init__(self, temperature: float) -> None: 90 | 91 | super().__init__(temperature=temperature) 92 | -------------------------------------------------------------------------------- /tests/test_model.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from allennlp.common.params import Params 6 | from allennlp.common.testing import ModelTestCase 7 | from allennlp.models import Model 8 | 9 | 10 | class TestDeCLUTR(ModelTestCase): 11 | def setup_method(self) -> None: 12 | super().setup_method() 13 | # We need to override the path set by AllenNLP 14 | self.FIXTURES_ROOT = Path("tests/fixtures") 15 | self.set_up_model( 16 | self.FIXTURES_ROOT / "experiment.jsonnet", 17 | self.FIXTURES_ROOT / "data" / "openwebtext" / "train.txt", 18 | ) 19 | 20 | def test_forward_pass_runs_correctly(self) -> None: 21 | training_tensors = self.dataset.as_tensor_dict() 22 | output_dict = self.model(**training_tensors) 23 | output_dict = self.model.make_output_human_readable(output_dict) 24 | assert "loss" in output_dict.keys() 25 | # Embeddings are not added to the output dict when training 26 | assert "embeddings" not in output_dict.keys() 27 | 28 | def test_forward_pass_with_feedforward_runs_correctly(self) -> None: 29 | self.set_up_model( 30 | self.FIXTURES_ROOT / "experiment_feedforward.jsonnet", 31 | self.FIXTURES_ROOT / "data" / "openwebtext" / "train.txt", 32 | ) 33 | training_tensors = self.dataset.as_tensor_dict() 34 | output_dict = self.model(**training_tensors) 35 | output_dict = self.model.make_output_human_readable(output_dict) 36 | assert "loss" in output_dict.keys() 37 | # Embeddings are not added to the output dict when training 38 | assert "embeddings" not in output_dict.keys() 39 | 40 | def test_forward_pass_contrastive_only_runs_correctly(self) -> None: 41 | self.set_up_model( 42 | self.FIXTURES_ROOT / "experiment_contrastive_only.jsonnet", 43 | self.FIXTURES_ROOT / "data" / "openwebtext" / "train.txt", 44 | ) 45 | training_tensors = self.dataset.as_tensor_dict() 46 | output_dict = self.model(**training_tensors) 47 | output_dict = self.model.make_output_human_readable(output_dict) 48 | assert "loss" in output_dict.keys() 49 | # Embeddings are not added to the output dict when training 50 | assert "embeddings" not in output_dict.keys() 51 | 52 | def test_forward_pass_mlm_only_runs_correctly(self) -> None: 53 | self.set_up_model( 54 | self.FIXTURES_ROOT / "experiment_mlm_only.jsonnet", 55 | self.FIXTURES_ROOT / "data" / "openwebtext" / "train.txt", 56 | ) 57 | training_tensors = self.dataset.as_tensor_dict() 58 | output_dict = self.model(**training_tensors) 59 | output_dict = self.model.make_output_human_readable(output_dict) 60 | assert "loss" in output_dict.keys() 61 | # Embeddings are not added to the output dict when training 62 | assert "embeddings" not in output_dict.keys() 63 | 64 | def test_forward_pass_scalar_mix_runs_correctly(self) -> None: 65 | self.set_up_model( 66 | self.FIXTURES_ROOT / "experiment_scalar_mix.jsonnet", 67 | self.FIXTURES_ROOT / "data" / "openwebtext" / "train.txt", 68 | ) 69 | training_tensors = self.dataset.as_tensor_dict() 70 | output_dict = self.model(**training_tensors) 71 | output_dict = self.model.make_output_human_readable(output_dict) 72 | assert "loss" in output_dict.keys() 73 | # Embeddings are not added to the output dict when training 74 | assert "embeddings" not in output_dict.keys() 75 | 76 | def 
test_no_loss_throws_configuration_error(self) -> None: 77 | params = Params.from_file(self.param_file) 78 | params["model"]["loss"] = None 79 | params["model"]["text_field_embedder"]["token_embedders"]["tokens"][ 80 | "masked_language_modeling" 81 | ] = False 82 | with pytest.raises(ValueError): 83 | Model.from_params(vocab=self.vocab, params=params.get("model")) 84 | 85 | @pytest.mark.skip(reason="failing for upstream reasons") 86 | def test_can_train_save_and_load(self) -> None: 87 | self.ensure_model_can_train_save_and_load(self.param_file) 88 | -------------------------------------------------------------------------------- /tests/test_dataset_reader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hypothesis import given, settings 3 | from hypothesis.strategies import integers, text 4 | 5 | from declutr.dataset_reader import DeCLUTRDatasetReader 6 | 7 | 8 | class TestDeCLUTRDatasetReader: 9 | # Not clear why turning off the deadline is necessary, but the test errors out otherwise. 10 | @settings(deadline=None) 11 | @given( 12 | num_anchors=integers(min_value=0, max_value=4), 13 | num_positives=integers(min_value=1, max_value=4), 14 | max_span_len=integers(min_value=32, max_value=64), 15 | min_span_len=integers(min_value=16, max_value=32), 16 | ) 17 | def test_no_sample_context_manager( 18 | self, num_anchors: int, num_positives: int, max_span_len: int, min_span_len: int 19 | ) -> None: 20 | dataset_reader = DeCLUTRDatasetReader( 21 | num_anchors=num_anchors, 22 | num_positives=num_positives, 23 | max_span_len=max_span_len, 24 | min_span_len=min_span_len, 25 | ) 26 | 27 | # While in the scope of the context manager, sample_spans should be false. 28 | # After exiting the context manager, it should return to whatever value it was at 29 | # before entering the context manager.
30 | previous = dataset_reader.sample_spans 31 | with dataset_reader.no_sample(): 32 | assert not dataset_reader.sample_spans 33 | assert dataset_reader.sample_spans == previous 34 | 35 | @given( 36 | num_anchors=integers(min_value=0, max_value=4), 37 | num_positives=integers(min_value=1, max_value=4), 38 | max_span_len=integers(min_value=32, max_value=64), 39 | min_span_len=integers(min_value=16, max_value=32), 40 | ) 41 | def test_init_raises_value_error_sampling_missing_arguments( 42 | self, num_anchors: int, num_positives: int, max_span_len: int, min_span_len: int 43 | ) -> None: 44 | if num_anchors: # should only raise the error when num_anchors is truthy 45 | with pytest.raises(ValueError): 46 | _ = DeCLUTRDatasetReader( 47 | num_anchors=num_anchors, 48 | num_positives=num_positives, 49 | max_span_len=None, 50 | min_span_len=min_span_len, 51 | ) 52 | with pytest.raises(ValueError): 53 | _ = DeCLUTRDatasetReader( 54 | num_anchors=num_anchors, 55 | num_positives=num_positives, 56 | max_span_len=max_span_len, 57 | min_span_len=None, 58 | ) 59 | with pytest.raises(ValueError): 60 | _ = DeCLUTRDatasetReader( 61 | num_anchors=num_anchors, 62 | num_positives=num_positives, 63 | max_span_len=None, 64 | min_span_len=None, 65 | ) 66 | with pytest.raises(ValueError): 67 | _ = DeCLUTRDatasetReader( 68 | num_anchors=num_anchors, 69 | num_positives=None, 70 | max_span_len=max_span_len, 71 | min_span_len=min_span_len, 72 | ) 73 | 74 | @given( 75 | num_anchors=integers(min_value=0, max_value=4), 76 | num_positives=integers(min_value=1, max_value=4), 77 | max_span_len=integers(min_value=32, max_value=64), 78 | min_span_len=integers(min_value=16, max_value=32), 79 | sampling_strategy=text(), 80 | ) 81 | def test_init_raises_value_error_invalid_sampling_strategy( 82 | self, 83 | num_anchors: int, 84 | num_positives: int, 85 | max_span_len: int, 86 | min_span_len: int, 87 | sampling_strategy: str, 88 | ) -> None: 89 | if num_anchors: # should only raise the error when num_spans is truthy 90 | with pytest.raises(ValueError): 91 | _ = DeCLUTRDatasetReader( 92 | num_anchors=num_anchors, 93 | num_positives=num_positives, 94 | max_span_len=max_span_len, 95 | min_span_len=min_span_len, 96 | sampling_strategy=sampling_strategy, 97 | ) 98 | -------------------------------------------------------------------------------- /scripts/preprocess_wikitext_103.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import io 3 | import re 4 | import zipfile 5 | from pathlib import Path 6 | from typing import List, Optional 7 | 8 | import requests 9 | import typer 10 | from declutr.common.util import sanitize_text 11 | 12 | WIKITEXT_103_URL = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip" 13 | 14 | # Emoji's used in typer.secho calls 15 | # See: https://github.com/carpedm20/emoji/blob/master/emoji/unicode_codes.py" 16 | SAVING = "\U0001F4BE" 17 | DOWNLOAD = "\U00002B07" 18 | 19 | 20 | def _write_output_to_disk(text: List[str], output_filepath: Path) -> None: 21 | """Writes a list of documents, `text`, to the file `output_filepath`, one document per line.""" 22 | # Create the directory path if it doesn't exist 23 | output_filepath = Path(output_filepath) 24 | output_filepath.parents[0].mkdir(parents=True, exist_ok=True) 25 | 26 | with open(output_filepath, "w") as f: 27 | # TODO (John): In the future, it might make sense to both batch and shard: 28 | # 1) Batch, meaning write batches of documents to a file as opposed to 1 at a 
time 29 | # 2) Shard, meaning break a file up into shard_size // len(text) files, and return a 30 | # directory instead. Loading a dataset like this is supported in AllenNLP (see: 31 | # https://docs.allennlp.org/master/api/data/dataset_readers/sharded_dataset_reader/) 32 | with typer.progressbar(text, label="Writing to disk") as progress: 33 | for doc in progress: 34 | f.write(doc.strip() + "\n") 35 | typer.secho( 36 | f"{SAVING} {len(text)} preprocessed documents saved to: {output_filepath}", 37 | bold=True, 38 | ) 39 | 40 | 41 | def main( 42 | output_filepath: Path, 43 | segment_sentences: bool = False, 44 | lowercase: bool = False, 45 | min_length: Optional[int] = None, 46 | max_instances: Optional[int] = None, 47 | pretrained_model_name_or_path: Optional[str] = None, 48 | ) -> None: 49 | """Downloads and lightly preprocesses WikiText-103. If `min_length is not None`, only documents 50 | with at least this many tokens are retained. If `pretrained_model_name_or_path` is not None, the 51 | tokenizer will be loaded as `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` 52 | using the HuggingFace Transformers library. Otherwise `str.split()` is used. This argument has 53 | no effect if `min-length is None`. If `segment_sentences` is provided, individual sentences 54 | will be returned instead of documents. You must have the `"en_core_web_sm"` spacy model 55 | installed to segment sentences. 56 | """ 57 | # Setup the pre-trained tokenizer, if specified 58 | if min_length is not None: 59 | if pretrained_model_name_or_path is not None: 60 | # Import transformers here to prevent ImportError errors if the 61 | # user doesn't want to use it. 62 | from transformers import AutoTokenizer 63 | 64 | tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path).tokenize 65 | else: 66 | tokenizer = lambda x: x.split() # noqa 67 | else: 68 | tokenizer = None 69 | 70 | # Setup spacy lang object if we are segmenting sentences 71 | if segment_sentences: 72 | import spacy 73 | 74 | nlp = spacy.load("en_core_web_sm", disable=["ner"]) 75 | 76 | # Download WikiText-103 77 | r = requests.get(WIKITEXT_103_URL, stream=True) 78 | z = zipfile.ZipFile(io.BytesIO(r.content)) 79 | partition_filenames = z.namelist()[1:] 80 | typer.secho(f"{DOWNLOAD} Downloaded WikiText-103", bold=True) 81 | 82 | preprocessed_documents: List[str] = [] 83 | for filename in partition_filenames: 84 | text = z.open(filename).read().decode("utf-8") 85 | 86 | # Strip out subtitles and split the text into documents 87 | no_subtitles = re.sub(r"(=\s){2,5}.*(=\s){2,5}", "", text) 88 | documents = re.split(r"=\s.*\s=", no_subtitles) 89 | 90 | if segment_sentences: 91 | documents = (sent.text for doc in documents for sent in nlp(doc).sents) # type: ignore 92 | 93 | with typer.progressbar( 94 | documents, length=max_instances, label=typer.style("Preprocessing text", bold=True) 95 | ) as progress: 96 | for doc in progress: 97 | doc = sanitize_text(doc, lowercase=lowercase) 98 | if not doc: 99 | continue 100 | 101 | # Retain documents if the length of their shortest document is 102 | # equal to or greater than the minimum specified length 103 | if tokenizer is not None: 104 | num_tokens = len(tokenizer(doc)) 105 | if min_length and num_tokens < min_length: 106 | continue 107 | 108 | if max_instances and len(preprocessed_documents) >= max_instances: 109 | break 110 | preprocessed_documents.append(doc) 111 | progress.update(1) 112 | 113 | _write_output_to_disk(preprocessed_documents, output_filepath) 114 | 115 | 116 | if __name__ 
== "__main__": 117 | typer.run(main) 118 | -------------------------------------------------------------------------------- /notebooks/evaluating.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "evaluating.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "accelerator": "GPU" 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "LY9LO9FnSPIa" 21 | }, 22 | "source": [ 23 | "# Evaluating a model\n", 24 | "\n", 25 | "This notebook will walk you through evaluating a [DeCLUTR](https://github.com/JohnGiorgi/DeCLUTR) model with [SentEval](https://github.com/facebookresearch/SentEval)." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "ZbZ4o1HHSM5t" 32 | }, 33 | "source": [ 34 | "## 🔧 Install the prerequisites\n", 35 | "\n", 36 | "Clone to repository locally so we have access to the evaluation scripts. Then install DeCLUTR" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "metadata": { 42 | "id": "fdyoe-EPSKLN" 43 | }, 44 | "source": [ 45 | "%%bash\n", 46 | "git clone https://github.com/JohnGiorgi/DeCLUTR.git\n", 47 | "cd DeCLUTR\n", 48 | "pip install -e .\n", 49 | "cd ../" 50 | ], 51 | "execution_count": null, 52 | "outputs": [] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "id": "5o1owrdbWDl9" 58 | }, 59 | "source": [ 60 | "Next, we have to clone the SentEval benchmark locally (this will take a few minutes)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "metadata": { 66 | "id": "_9Mg77kOREs7" 67 | }, 68 | "source": [ 69 | "%%bash\n", 70 | "# Clone our fork which has several bug fixes merged\n", 71 | "git clone https://github.com/JohnGiorgi/SentEval.git\n", 72 | "cd SentEval/data/downstream/\n", 73 | "./get_transfer_data.bash\n", 74 | "cd ../../../" 75 | ], 76 | "execution_count": null, 77 | "outputs": [] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "id": "m05mAP5wWU-f" 83 | }, 84 | "source": [ 85 | "Lastly, we need a model to evaluate. We will download `DeCLUTR-small`:\n", 86 | "\n" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "metadata": { 92 | "id": "0Nd8oYGpUn5k" 93 | }, 94 | "source": [ 95 | "!wget https://github.com/JohnGiorgi/DeCLUTR/releases/download/v0.1.0rc1/declutr-small.tar.gz" 96 | ], 97 | "execution_count": null, 98 | "outputs": [] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "id": "3ae7g5puWn8X" 104 | }, 105 | "source": [ 106 | "## 📋 Evaluating the model\n", 107 | "\n", 108 | "Finally, use our provided script to evaluate the model on SentEval.\n", 109 | "\n", 110 | "> Note, the script will evaluate on all 28 SentEval tasks. This can take 7 hours or more on a GPU." 
111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "zLWGGQSJUwW1" 117 | }, 118 | "source": [ 119 | "!python DeCLUTR/scripts/run_senteval.py allennlp \"SentEval\" \"declutr-small.tar.gz\" \\\n", 120 | " --output-filepath \"senteval_results.json\" \\\n", 121 | " --cuda-device 0 \\\n", 122 | " --include-package \"declutr\" \\\n", 123 | " --verbose" 124 | ], 125 | "execution_count": null, 126 | "outputs": [] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": { 131 | "id": "MLRyWa1IXepJ" 132 | }, 133 | "source": [ 134 | "We also provide commands for evaluating other popular sentence encoders. For a list of commands, run:" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "metadata": { 140 | "id": "GWdKsXc_W6TC" 141 | }, 142 | "source": [ 143 | "!python DeCLUTR/scripts/run_senteval.py --help" 144 | ], 145 | "execution_count": null, 146 | "outputs": [] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "id": "OCNuiVzLXozu" 152 | }, 153 | "source": [ 154 | "For help with a specific command, e.g. `transformers`, run:" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "metadata": { 160 | "id": "XEbSmYIVWNoB" 161 | }, 162 | "source": [ 163 | "!python DeCLUTR/scripts/run_senteval.py transformers --help" 164 | ], 165 | "execution_count": null, 166 | "outputs": [] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": { 171 | "id": "AZyXiAmHX3RF" 172 | }, 173 | "source": [ 174 | "Notice that evaluate other popular models, like [Sentence Transformers](https://www.sbert.net/)! Just make sure to install it first:" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "samad0pSbg4N" 181 | }, 182 | "source": [ 183 | "!pip install sentence-transformers" 184 | ], 185 | "execution_count": null, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "metadata": { 191 | "id": "V1kN71RBXtwB" 192 | }, 193 | "source": [ 194 | "!python DeCLUTR/scripts/run_senteval.py sentence-transformers \"SentEval\" \"roberta-base-nli-mean-tokens\" \\\n", 195 | " --output-filepath \"senteval_results.json\" \\\n", 196 | " --cuda-device 0 \\\n", 197 | " --verbose" 198 | ], 199 | "execution_count": null, 200 | "outputs": [] 201 | } 202 | ] 203 | } -------------------------------------------------------------------------------- /scripts/preprocess_openwebtext.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import shutil 3 | import tarfile 4 | from pathlib import Path 5 | from typing import Optional 6 | 7 | import typer 8 | from declutr.common.util import sanitize_text 9 | from more_itertools import chunked 10 | 11 | # Emoji's used in typer.secho calls 12 | # See: https://github.com/carpedm20/emoji/blob/master/emoji/unicode_codes.py" 13 | WARNING = "\U000026A0" 14 | SAVING = "\U0001F4BE" 15 | MINING = "\U000026CF" 16 | 17 | 18 | def main( 19 | openwebtext_path: Path = typer.Argument(..., help="Path to a OpenWebText dump."), 20 | output_filepath: Path = typer.Argument(..., help="Filepath to save the preprocessed text"), 21 | min_length: Optional[int] = typer.Option( 22 | None, help="Minimum token length of documents to retain" 23 | ), 24 | lowercase: bool = typer.Option(True, help="Whether text should be lowercased"), 25 | max_documents: Optional[int] = typer.Option( 26 | None, 27 | help="Maximum number of documents to retain. 
Because of batching, this won't be exact.", 28 | ), 29 | pretrained_model_name_or_path: Optional[str] = typer.Option( 30 | None, 31 | help=( 32 | "Name of the HuggingFace Tokenizer to use when determining the token length of a" 33 | "document. Has no effect if min-length is None" 34 | ), 35 | ), 36 | ) -> None: 37 | """Lightly preprocesses an OpenWebText dump obtained from 38 | https://skylion007.github.io/OpenWebTextCorpus/. If `min-length is not None`, only documents 39 | with at least this many tokens are retained. If `pretrained_model_name_or_path` is not None, 40 | the tokenizer will be loaded as `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` 41 | using the HuggingFace Transformers library. Otherwise `str.split()` is used. This argument has 42 | no effect if `min-length is None`. 43 | """ 44 | openwebtext_path = Path(openwebtext_path) 45 | output_filepath = Path(output_filepath) 46 | output_filepath.parents[0].mkdir(parents=True, exist_ok=True) 47 | 48 | # Setup the pre-trained tokenizer, if specified 49 | if min_length is not None: 50 | if pretrained_model_name_or_path is not None: 51 | # Import transformers here to prevent ImportError errors if the 52 | # user doesn't want to use it. 53 | from transformers import AutoTokenizer 54 | 55 | tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, use_fast=True) 56 | else: 57 | tokenizer = lambda x: x.split() # noqa 58 | else: 59 | tokenizer = None 60 | 61 | processed_docs = 0 62 | skipped_files = 0 63 | typer.secho( 64 | ( 65 | f'{MINING} Scraping {max_documents or "all"} documents' 66 | f' {f"with a minimum token length of {min_length}" if min_length else ""}' 67 | ), 68 | bold=True, 69 | ) 70 | 71 | with typer.progressbar( 72 | length=max_documents or len(list(openwebtext_path.iterdir())), label="Preprocessing text" 73 | ) as progress: 74 | for tar_filepath in openwebtext_path.iterdir(): 75 | # Didn't bother debugging as it only happens for a tiny number (1-2) of tar archives. 76 | # Instead, catch the error and report to the user at the end how many we skipped. 77 | untared_filepath = Path(tar_filepath.stem) 78 | try: 79 | with tarfile.open(tar_filepath) as tf: 80 | tf.extractall(untared_filepath) 81 | except (tarfile.ReadError, IsADirectoryError): 82 | skipped_files += 1 83 | continue 84 | 85 | for text_filepaths in chunked(untared_filepath.iterdir(), 128): 86 | docs = [] 87 | for fp in text_filepaths: 88 | # Some very minimal preprocessing to remove extra whitespace, newlines and tabs. 89 | doc = sanitize_text(fp.read_text(), lowercase=lowercase) 90 | # We add a space in front of the text in order to achieve consistant tokenization 91 | # with certain tokenizers, e.g. the BPE tokenizer used by RoBERTa, GPT and others. 
92 | # See: https://github.com/huggingface/transformers/issues/1196 93 | doc = f" {doc.lstrip()}" 94 | docs.append(doc) 95 | 96 | if tokenizer is not None: 97 | if pretrained_model_name_or_path: 98 | lengths = tokenizer( 99 | docs, add_special_tokens=False, truncation=False, return_length=True 100 | ).length 101 | else: 102 | lengths = [len(tokenizer(doc)) for doc in docs] 103 | docs = [doc for doc, length in zip(docs, lengths) if length > min_length] 104 | 105 | with open(output_filepath, "a") as f: 106 | f.write("\n".join(docs).strip() + "\n") 107 | 108 | if max_documents: 109 | progress.update(len(docs)) 110 | processed_docs += len(docs) 111 | if processed_docs >= max_documents: 112 | break 113 | 114 | # We are using a for-else trick here, see: https://stackoverflow.com/a/3150107/6578628 115 | else: 116 | if max_documents is None: 117 | progress.update(1) 118 | shutil.rmtree(untared_filepath) 119 | # Continue if the inner loop wasn't broken. 120 | continue 121 | shutil.rmtree(untared_filepath) 122 | # Inner loop was broken, break the outer. 123 | break 124 | 125 | if skipped_files: 126 | typer.secho( 127 | f"{WARNING} {skipped_files} tar files were skipped because they couldn't be extracted.", 128 | fg=typer.colors.YELLOW, 129 | bold=True, 130 | ) 131 | 132 | 133 | if __name__ == "__main__": 134 | typer.run(main) 135 | -------------------------------------------------------------------------------- /declutr/encoder.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from operator import itemgetter 3 | from pathlib import Path 4 | from typing import List, Optional, Tuple, Union, cast 5 | 6 | import torch 7 | from allennlp.common import util as common_util 8 | from allennlp.common.file_utils import cached_path 9 | from allennlp.models.archival import load_archive 10 | from allennlp.predictors import Predictor 11 | from validators.url import url 12 | 13 | from declutr.common.util import sanitize_text 14 | 15 | PRETRAINED_MODELS = { 16 | "declutr-small": "https://github.com/JohnGiorgi/DeCLUTR/releases/download/v0.1.0rc1/declutr-small.tar.gz", 17 | "declutr-base": "https://github.com/JohnGiorgi/DeCLUTR/releases/download/v0.1.0rc1/declutr-base.tar.gz", 18 | } 19 | 20 | 21 | class Encoder: 22 | """A simple interface to the model for the purposes of embedding sentences/paragraphs. 23 | 24 | # Example Usage 25 | 26 | ```python 27 | from declutr import Encoder 28 | 29 | # This can be a path on disk to a model you have trained yourself OR 30 | # the name of one of our pretrained models. 31 | pretrained_model_or_path = "declutr-small" 32 | 33 | encoder = Encoder(pretrained_model_or_path) 34 | embeddings = encoder([ 35 | "A smiling costumed woman is holding an umbrella.", 36 | "A happy woman in a fairy costume holds an umbrella." 37 | ]) 38 | ``` 39 | 40 | # Parameters 41 | 42 | pretrained_model_name_or_path : `str`, required 43 | Path to a serialized AllenNLP archive or a model name from: 44 | `declutr.encoder.PRETRAINED_MODELS` 45 | sphereize : `bool`, optional (default = `False`) 46 | If `True`, embeddings will be l2-normalized and shifted by the centroid. Defaults to `False`. 47 | **kwargs : `Dict`, optional 48 | Keyword arguments that will be passed to `allennlp.models.archival.load_archive`. This is 49 | useful, for example, to specify a CUDA device id with `cuda_device`. See: 50 | https://docs.allennlp.org/master/api/models/archival/#load_archive for more details.
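
    A further sketch (illustrative only) of embedding on a GPU with batched inputs. It assumes a
    CUDA device is available; `cuda_device` is simply forwarded to `load_archive`, and
    `batch_size` controls how many inputs are embedded per forward pass:

    ```python
    encoder = Encoder("declutr-small", cuda_device=0, sphereize=True)
    embeddings = encoder(
        [
            "A smiling costumed woman is holding an umbrella.",
            "A happy woman in a fairy costume holds an umbrella.",
        ],
        batch_size=2,
    )
    ```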
51 | """ 52 | 53 | _output_dict_field = "embeddings" 54 | 55 | def __init__( 56 | self, pretrained_model_name_or_path: str, sphereize: bool = False, **kwargs 57 | ) -> None: 58 | if pretrained_model_name_or_path in PRETRAINED_MODELS: 59 | pretrained_model_name_or_path = PRETRAINED_MODELS[pretrained_model_name_or_path] 60 | common_util.import_module_and_submodules("declutr") 61 | archive = load_archive(pretrained_model_name_or_path, **kwargs) 62 | self._predictor = Predictor.from_archive(archive, predictor_name="declutr") 63 | self._sphereize = sphereize 64 | 65 | @torch.no_grad() 66 | def __call__( 67 | self, inputs: Union[str, List[str]], batch_size: Optional[int] = None 68 | ) -> torch.Tensor: 69 | """Returns a numpy array of embeddings, one for each item in `inputs`. 70 | 71 | # Parameters 72 | 73 | inputs : `Union[str, List[str]]`, required 74 | The input text to embed. Can be a string, list of strings, or a filepath/URL to a text 75 | file with one input per line. 76 | batch_size : `int`, optional 77 | If given, the `inputs` will be batched before embedding. 78 | """ 79 | if isinstance(inputs, str): 80 | # Determine if inputs is a path, or text string 81 | try: 82 | is_path = Path(inputs).is_file() 83 | except OSError: 84 | warnings.warn( 85 | "'OSError' raised when checking if 'inputs' is a filepath." 86 | " Assuming it is a string or URL." 87 | ) 88 | else: 89 | is_path = Path(inputs).is_file() or url(inputs) 90 | 91 | if is_path: 92 | inputs = Path(cached_path(inputs)).read_text().split("\n") 93 | else: 94 | inputs = [inputs] 95 | 96 | if batch_size is None: 97 | unsort = False 98 | batch_size = len(inputs) 99 | else: 100 | # Sort the inputs by length, maintaining the original indices so we can un-sort 101 | # before returning the embeddings. This speeds up embedding by minimizing the 102 | # amount of computation performed on pads. Because this sorting happens before 103 | # tokenization, it is only a proxy of the true lengths of the inputs to the model. 104 | # In the future, it would be better to use the built-in bucket sort of AllenNLP, 105 | # which would lead to an even larger speedup. 106 | unsort = True 107 | sorted_indices, inputs = cast( 108 | Tuple[List[int], List[str]], zip(*sorted(enumerate(inputs), key=itemgetter(1))) 109 | ) # tell mypy explicitly the types of items in the unpacked tuple 110 | unsorted_indices, _ = zip(*sorted(enumerate(sorted_indices), key=itemgetter(1))) 111 | 112 | embeddings: torch.FloatTensor = [] # promise mypy we will behave 113 | for i in range(0, len(inputs), batch_size): 114 | batch_json = [{"text": sanitize_text(input_)} for input_ in inputs[i : i + batch_size]] 115 | outputs = self._predictor.predict_batch_json(batch_json) 116 | outputs = torch.as_tensor( 117 | # Accumulating the tensors on the GPU would quickly lead to OOM. 118 | [output[self._output_dict_field] for output in outputs], 119 | device="cpu", 120 | ) 121 | embeddings.append(outputs) 122 | embeddings = torch.cat(embeddings) 123 | # Make sure to unsort the embeddings if they were sorted. 124 | if unsort: 125 | unsorted_indices = torch.as_tensor(unsorted_indices, dtype=torch.long) 126 | embeddings = torch.index_select(embeddings, dim=0, index=unsorted_indices) 127 | if self._sphereize: 128 | if embeddings.size(0) > 1: 129 | centroid = torch.mean(embeddings, dim=0) 130 | embeddings -= centroid 131 | embeddings /= torch.norm(embeddings, dim=1, keepdim=True) 132 | else: 133 | warnings.warn( 134 | "sphereize==True but only a single input sentence was passed." 
135 | " Inputs will not be sphereized." 136 | ) 137 | 138 | return embeddings.numpy() 139 | -------------------------------------------------------------------------------- /tests/common/test_contrastive_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import List, Union 3 | 4 | import pytest 5 | from declutr.common.contrastive_utils import sample_anchor_positive_pairs 6 | from hypothesis import given 7 | from hypothesis.strategies import integers, sampled_from 8 | from transformers import AutoTokenizer 9 | 10 | 11 | class TestContrastiveUtils: 12 | def tokenize(self, text) -> List[str]: 13 | return text.split() 14 | 15 | @given( 16 | num_anchors=integers(min_value=1, max_value=4), 17 | num_positives=integers(min_value=1, max_value=4), 18 | sampling_strategy=sampled_from(["subsuming", "adjacent", None]), 19 | ) 20 | def test_sample_spans( 21 | self, 22 | inputs: List[str], 23 | num_anchors: int, 24 | num_positives: int, 25 | sampling_strategy: Union[str, None], 26 | ) -> None: 27 | 28 | for text in inputs: 29 | tokens = self.tokenize(text) 30 | num_tokens = len(tokens) 31 | 32 | # Really short examples make the tests unreliable. 33 | if num_tokens < 7: 34 | continue 35 | 36 | # These represent sensible defaults 37 | max_span_len = num_tokens // 4 38 | min_span_len = random.randint(1, max_span_len) if max_span_len > 1 else 1 39 | 40 | if num_tokens < num_anchors * max_span_len * 2: 41 | with pytest.raises(ValueError): 42 | _, _ = sample_anchor_positive_pairs( 43 | text, 44 | num_anchors=num_anchors, 45 | num_positives=num_positives, 46 | max_span_len=max_span_len, 47 | min_span_len=min_span_len, 48 | sampling_strategy=sampling_strategy, 49 | ) 50 | else: 51 | anchors, positives = sample_anchor_positive_pairs( 52 | text, 53 | num_anchors=num_anchors, 54 | num_positives=num_positives, 55 | max_span_len=max_span_len, 56 | min_span_len=min_span_len, 57 | sampling_strategy=sampling_strategy, 58 | ) 59 | assert len(anchors) == num_anchors 60 | assert len(positives) == num_anchors * num_positives 61 | for i, anchor in enumerate(anchors): 62 | # Several simple checks for valid anchors. 63 | anchor_tokens = self.tokenize(anchor) 64 | anchor_length = len(anchor_tokens) 65 | assert anchor_length <= max_span_len 66 | assert anchor_length >= min_span_len 67 | # The tokenization process may lead to certain characters (such as escape 68 | # characters) being dropped, so repeat the tokenization process before 69 | # performing this check (otherwise a bunch of tests fail). 70 | assert anchor in " ".join(tokens) 71 | for j in range(i * num_positives, i * num_positives + num_positives): 72 | # Several simple checks for valid positives. 73 | positive = positives[j] 74 | positive_tokens = self.tokenize(positive) 75 | positive_length = len(positive_tokens) 76 | assert positive_length <= max_span_len 77 | assert positive_length >= min_span_len 78 | assert positive in " ".join(tokens) 79 | # Test that specific sampling strategies are obeyed. 
80 | if sampling_strategy == "subsuming": 81 | assert positive in " ".join(anchor_tokens) 82 | elif sampling_strategy == "adjacent": 83 | assert positive not in " ".join(anchor_tokens) 84 | 85 | @given( 86 | num_anchors=integers(min_value=1, max_value=4), 87 | num_positives=integers(min_value=1, max_value=4), 88 | ) 89 | def test_sample_spans_raises_value_error_invalid_min_span_length( 90 | self, num_anchors: int, num_positives: int 91 | ) -> None: 92 | text = "They may take our lives, but they'll never take our freedom!" 93 | num_tokens = len(self.tokenize(text)) 94 | 95 | max_span_len = num_tokens - 1 # This is guaranteed to be valid. 96 | min_span_len = max_span_len + 1 # This is guaranteed to be invalid. 97 | 98 | with pytest.raises(ValueError): 99 | _, _ = sample_anchor_positive_pairs( 100 | text, 101 | num_anchors=num_anchors, 102 | num_positives=num_positives, 103 | max_span_len=max_span_len, 104 | min_span_len=min_span_len, 105 | ) 106 | 107 | @given( 108 | num_anchors=integers(min_value=1, max_value=4), 109 | num_positives=integers(min_value=1, max_value=4), 110 | ) 111 | def test_sample_spans_raises_value_error_invalid_max_span_length( 112 | self, num_anchors: int, num_positives: int 113 | ) -> None: 114 | text = "They may take our lives, but they'll never take our freedom!" 115 | num_tokens = len(self.tokenize(text)) 116 | 117 | max_span_len = num_tokens + 1 # This is guaranteed to be invalid. 118 | min_span_len = max_span_len - 1 # This is guaranteed to be valid. 119 | 120 | with pytest.raises(ValueError): 121 | _, _ = sample_anchor_positive_pairs( 122 | text, 123 | num_anchors=num_anchors, 124 | num_positives=num_positives, 125 | max_span_len=max_span_len, 126 | min_span_len=min_span_len, 127 | ) 128 | 129 | def test_sample_spans_with_hf_tokenizer(self): 130 | text = "They may take our lives, but they'll never take our freedom!" 131 | tokenizer = AutoTokenizer.from_pretrained("distilroberta-base") 132 | num_tokens = len(tokenizer(text)) 133 | 134 | # Arbitrary but valid choices 135 | max_span_len = num_tokens 136 | min_span_len = max_span_len - 1 137 | 138 | anchors, positives = sample_anchor_positive_pairs( 139 | text, 140 | num_anchors=1, 141 | num_positives=1, 142 | max_span_len=max_span_len, 143 | min_span_len=min_span_len, 144 | tokenizer=tokenizer.tokenize, 145 | ) 146 | 147 | for anchor in anchors: 148 | tokens = anchor.split() 149 | assert tokenizer.convert_tokens_to_string(tokens) in text 150 | for positive in positives: 151 | tokens = positive.split() 152 | assert tokenizer.convert_tokens_to_string(tokens) in text 153 | -------------------------------------------------------------------------------- /declutr/common/contrastive_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List, Optional, Tuple 2 | 3 | import numpy as np 4 | 5 | from allennlp.common.logging import AllenNlpLogger 6 | 7 | logger = AllenNlpLogger(__name__) 8 | 9 | 10 | def sample_anchor_positive_pairs( 11 | text: str, 12 | num_anchors: int, 13 | num_positives: int, 14 | max_span_len: int, 15 | min_span_len: int, 16 | sampling_strategy: Optional[str] = None, 17 | tokenizer: Optional[Callable[[str], List[str]]] = None, 18 | ) -> Tuple[List[str], List[str]]: 19 | """Returns a `Tuple` of `List`s, containing `num_anchors` anchor spans and `num_positives` 20 | positive spans sampled from `text`. 21 | 22 | # Parameters 23 | 24 | text : `str`, required 25 | The string to extract anchor and positive spans from. 
26 | num_anchors : `int`, required 27 | The number of spans to sample from `text` to serve as anchors. 28 | num_positives : `int`, required 29 | The number of spans to sample from `text` to serve as positives (per anchor). 30 | max_span_len : `int`, required 31 | The maximum length of spans, after tokenization, to sample. 32 | min_span_len : `int`, required 33 | The minimum length of spans, after tokenization, to sample. 34 | sampling_strategy : `str`, optional (default = `None`) 35 | One of `"subsuming"` or `"adjacent"`. If `"subsuming"`, positive spans are always subsumed 36 | by the anchor. If `"adjacent"`, positive spans are always adjacent to the anchor. If not 37 | provided, positives may be subsumed, adjacent to, or overlapping with the anchor. 38 | tokenizer : `Callable`, optional (default = `None`) 39 | Optional tokenizer to use before sampling spans. If `None`, `text.split()` is used. 40 | """ 41 | # Tokenize the incoming text. Whitespace tokenization is much more straightforward 42 | # (we don't need to worry about chopping up subword tokens), but a user can also provide 43 | # their own tokenization scheme if they want. 44 | tokens = tokenizer(text) if tokenizer is not None else text.split() 45 | tok_method = "tokenizer(text)" if tokenizer else "text.split()" 46 | num_tokens = len(tokens) 47 | 48 | if num_tokens < num_anchors * max_span_len * 2: 49 | raise ValueError( 50 | f"len({tok_method}) should be at least {num_anchors * max_span_len * 2}" 51 | f" (num_anchors * max_span_len * 2), got {num_tokens}." 52 | ) 53 | if min_span_len > max_span_len: 54 | raise ValueError( 55 | f"min_span_len must be less than max_span_len ({max_span_len}), got {min_span_len}." 56 | ) 57 | if max_span_len > num_tokens: 58 | raise ValueError( 59 | ( 60 | f"max_span_len must be less than or equal to" 61 | f" len({tok_method}) ({num_tokens}), got {max_span_len}." 62 | ) 63 | ) 64 | 65 | # Valid anchor starts are token indices which begin a token span of at least max_span_len. 66 | anchors, positives = [], [] 67 | valid_anchor_starts = list(range(0, num_tokens - max_span_len + 1, max_span_len)) 68 | for i in range(num_anchors): 69 | # Sample the anchor length from a beta distribution skewed towards longer spans, the 70 | # intuition being that longer spans have the best chance of being representative of the 71 | # document they are sampled from. 72 | anchor_len = int(np.random.beta(4, 2) * (max_span_len - min_span_len) + min_span_len) 73 | # This check prevents an edge case were we run out of valid_anchor_starts. 74 | if len(valid_anchor_starts) // (num_anchors - i) < num_anchors - i: 75 | anchor_start_idx = np.random.choice([0, len(valid_anchor_starts) - 1]) 76 | else: 77 | anchor_start_idx = np.random.randint(len(valid_anchor_starts)) 78 | # When num_anchors = 1, this is equivalent to uniformly sampling that starting position. 79 | anchor_start = np.random.randint( 80 | valid_anchor_starts[anchor_start_idx], 81 | # randint is high-exclusive 82 | valid_anchor_starts[anchor_start_idx] + max_span_len - anchor_len + 1, 83 | ) 84 | # Once sampled, remove an anchor (and its immediate neighbours) from consideration. 85 | del valid_anchor_starts[max(0, anchor_start_idx - 1) : anchor_start_idx + 2] 86 | anchor_end = anchor_start + anchor_len 87 | anchors.append(" ".join(tokens[anchor_start:anchor_end])) 88 | 89 | # Sample positives from around the anchor. The intuition being that text that appears 90 | # close together is the same document is likely to be semantically similar. 
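        # (Each positive sampled below respects `sampling_strategy`: it is strictly contained in
        # the anchor for "subsuming", borders the anchor for "adjacent", and may fall anywhere
        # around or overlapping the anchor when no strategy is given.)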
91 | for _ in range(num_positives): 92 | # A user can specify a subsuming or adjacent only sampling strategy. 93 | if sampling_strategy == "subsuming": 94 | # To be strictly subsuming, we cannot allow the positive_len > anchor_len. 95 | positive_len = int( 96 | np.random.beta(2, 4) * (anchor_len - min_span_len) + min_span_len 97 | ) 98 | # randint is high-exclusive 99 | positive_start = np.random.randint(anchor_start, anchor_end - positive_len + 1) 100 | elif sampling_strategy == "adjacent": 101 | # Restrict positives to a length that will allow them to be adjacent to the anchor 102 | # without running off the edge of the document. If the anchor has sufficent room on 103 | # either side, this won't be a problem and max_positive_len will equal max_span_len. 104 | max_positive_len = min(max_span_len, max(anchor_start, num_tokens - anchor_end)) 105 | if max_positive_len < max_span_len: 106 | logger.warning_once( 107 | ( 108 | "There is no room to sample an adjacent positive span. Temporarily" 109 | " reducing the maximum span length of positives. This message will not" 110 | " be displayed again." 111 | ) 112 | ) 113 | positive_len = int( 114 | np.random.beta(2, 4) * (max_positive_len - min_span_len) + min_span_len 115 | ) 116 | # There are two types of adjacent positives, those that border the beginning of the 117 | # anchor and those that border the end. The checks above guarantee at least one of 118 | # these is valid. Here we just choose from the valid positive starts at random. 119 | valid_starts = [] 120 | if anchor_start - positive_len > 0: 121 | valid_starts.append(anchor_start - positive_len) 122 | if anchor_end + positive_len <= num_tokens: 123 | valid_starts.append(anchor_end) 124 | positive_start = np.random.choice(valid_starts) 125 | else: 126 | # Sample positive length from a beta distribution skewed towards shorter spans. The 127 | # idea is to promote diversity and minimize the amount of overlapping text. 128 | positive_len = int( 129 | np.random.beta(2, 4) * (max_span_len - min_span_len) + min_span_len 130 | ) 131 | # By default, spans may be adjacent or overlap with each other and the anchor. 132 | # Careful not to run off the edges of the document (this error may pass silently). 
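                # (Concretely, positive_start is drawn uniformly from the inclusive range
                # [max(0, anchor_start - positive_len), min(anchor_end, num_tokens - positive_len)],
                # so a positive may begin before the anchor, inside it, or right where it ends.)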
133 | positive_start = np.random.randint( 134 | max(0, anchor_start - positive_len), 135 | min(anchor_end, num_tokens - positive_len) + 1, # randint is high-exclusive 136 | ) 137 | 138 | positive_end = positive_start + positive_len 139 | positives.append(" ".join(tokens[positive_start:positive_end])) 140 | 141 | return anchors, positives 142 | -------------------------------------------------------------------------------- /notebooks/training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "accelerator": "GPU", 6 | "colab": { 7 | "name": "training.ipynb", 8 | "private_outputs": true, 9 | "provenance": [], 10 | "collapsed_sections": [] 11 | }, 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.8.5" 28 | } 29 | }, 30 | "cells": [ 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "I8jt6ML03DS5" 35 | }, 36 | "source": [ 37 | "# Training your own model\n", 38 | "\n", 39 | "This notebook will walk you through training your own model using [DeCLUTR](https://github.com/JohnGiorgi/DeCLUTR)." 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "id": "SU3Iod2-g0-o" 46 | }, 47 | "source": [ 48 | "## 🔧 Install the prerequisites" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "metadata": { 54 | "id": "sr4r5pN40Kli" 55 | }, 56 | "source": [ 57 | "!pip install git+https://github.com/JohnGiorgi/DeCLUTR.git" 58 | ], 59 | "execution_count": null, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "id": "Zog7ApwuUD7_" 66 | }, 67 | "source": [ 68 | "## 📖 Preparing a dataset" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "uwnLpUmN4Art" 75 | }, 76 | "source": [ 77 | "\n", 78 | "A dataset is simply a file containing one item of text (a document, a scientific paper, etc.) per line. For demonstration purposes, we have provided a script that will download the [WikiText-103](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/) dataset and format it for training with our method.\n", 79 | "\n", 80 | "The only \"gotcha\" is that each piece of text needs to be long enough so that we can sample spans from it. In general, you should collect documents of a minimum length according to the following:\n", 81 | "\n", 82 | "```python\n", 83 | "min_length = num_anchors * max_span_len * 2\n", 84 | "```\n", 85 | "\n", 86 | "In our paper, we set `num_anchors=2` and `max_span_len=512`, so we require documents of `min_length=2048`. 
We simply need to provide this value as an argument when running the script:" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "metadata": { 92 | "id": "q0fwnwq23aAZ" 93 | }, 94 | "source": [ 95 | "import os\n", 96 | "\n", 97 | "train_data_path = \"wikitext_103/train.txt\"\n", 98 | "min_length = 2048\n", 99 | "\n", 100 | "!wget -nc https://raw.githubusercontent.com/JohnGiorgi/DeCLUTR/master/scripts/preprocess_wikitext_103.py\n", 101 | "!python preprocess_wikitext_103.py $train_data_path --min-length $min_length" 102 | ], 103 | "execution_count": null, 104 | "outputs": [] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": { 109 | "id": "yUEFeupP6qy-" 110 | }, 111 | "source": [ 112 | "Lets confirm that our dataset looks as expected." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "metadata": { 118 | "id": "K7ffGXCn7Cpq" 119 | }, 120 | "source": [ 121 | "!wc -l $train_data_path # This should be approximately 17.8K lines" 122 | ], 123 | "execution_count": null, 124 | "outputs": [] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "metadata": { 129 | "id": "10DprWZc9iV6" 130 | }, 131 | "source": [ 132 | "!head -n 1 $train_data_path # This should be a single Wikipedia entry" 133 | ], 134 | "execution_count": null, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "id": "VKYdambZ59nM" 141 | }, 142 | "source": [ 143 | "## 🏃 Training the model\n", 144 | "\n", 145 | "Once you have collected the dataset, you can easily initiate a training session with the `allennlp train` command. An experiment is configured using a [Jsonnet](https://jsonnet.org/) config file. Lets take a look at the config for the DeCLUTR-small model presented in [our paper](https://arxiv.org/abs/2006.03659):" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "metadata": { 151 | "id": "xTaSExh4ba8e" 152 | }, 153 | "source": [ 154 | "!wget -nc https://raw.githubusercontent.com/JohnGiorgi/DeCLUTR/master/training_config/declutr_small.jsonnet\n", 155 | "with open(\"declutr_small.jsonnet\", \"r\") as f:\n", 156 | " print(f.read())" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": { 164 | "id": "-f1HqWSscWOx" 165 | }, 166 | "source": [ 167 | "\n", 168 | "The only thing to configure is the path to the training set (`train_data_path`), which can be passed to `allennlp train` via the `--overrides` argument (but you can also provide it in your config file directly, if you prefer):" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "metadata": { 174 | "id": "YS9VuxESBcr3" 175 | }, 176 | "source": [ 177 | "overrides = (\n", 178 | " f\"{{'train_data_path': '{train_data_path}', \"\n", 179 | " # lower the batch size to be able to train on Colab GPUs\n", 180 | " \"'data_loader.batch_size': 2, \"\n", 181 | " # training examples / batch size. 
Not required, but gives us a more informative progress bar during training\n", 182 | "'data_loader.batches_per_epoch': 8912}\"\n", 183 | ")" 184 | ], 185 | "execution_count": null, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "metadata": { 191 | "id": "2v4tiiXgBC2M" 192 | }, 193 | "source": [ 194 | "overrides" 195 | ], 196 | "execution_count": null, 197 | "outputs": [] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "metadata": { 202 | "id": "Db_cNfZ76KRf" 203 | }, 204 | "source": [ 205 | "!allennlp train \"declutr_small.jsonnet\" \\\n", 206 | " --serialization-dir \"output\" \\\n", 207 | " --overrides \"$overrides\" \\\n", 208 | " --include-package \"declutr\" \\\n", 209 | " -f" 210 | ], 211 | "execution_count": null, 212 | "outputs": [] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": { 217 | "id": "Qsbr6OMv16GQ" 218 | }, 219 | "source": [ 220 | "### 🤗 Exporting a trained model to HuggingFace Transformers\n", 221 | "\n", 222 | "We have provided a simple script to export a trained model so that it can be loaded with [Hugging Face Transformers](https://github.com/huggingface/transformers)." 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "metadata": { 228 | "id": "KqmWVD0y16GQ" 229 | }, 230 | "source": [ 231 | "!wget -nc https://raw.githubusercontent.com/JohnGiorgi/DeCLUTR/master/scripts/save_pretrained_hf.py\n", 232 | "!python save_pretrained_hf.py --archive-file \"output\" --save-directory \"output_transformers\"" 233 | ], 234 | "execution_count": null, 235 | "outputs": [] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": { 240 | "id": "N0-NTFaH16GQ" 241 | }, 242 | "source": [ 243 | "The model, saved to `--save-directory`, can then be loaded using the Hugging Face Transformers library.\n", 244 | "\n", 245 | "> See the [embedding notebook](https://colab.research.google.com/github/JohnGiorgi/DeCLUTR/blob/master/notebooks/embedding.ipynb) for more details on using trained models." 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "metadata": { 251 | "id": "pAl1zIya16GQ" 252 | }, 253 | "source": [ 254 | "from transformers import AutoModel, AutoTokenizer\n", 255 | " \n", 256 | "tokenizer = AutoTokenizer.from_pretrained(\"output_transformers\")\n", 257 | "model = AutoModel.from_pretrained(\"output_transformers\")" 258 | ], 259 | "execution_count": null, 260 | "outputs": [] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": { 265 | "id": "mzQ0G4rp16GQ" 266 | }, 267 | "source": [ 268 | "> If you would like to upload your model to the Hugging Face model repository, follow the instructions [here](https://huggingface.co/transformers/model_sharing.html)." 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": { 274 | "id": "eD5dZo18EE-S" 275 | }, 276 | "source": [ 277 | "## ♻️ Conclusion\n", 278 | "\n", 279 | "That's it! In this notebook, we covered how to collect data for training the model, and specifically how _long_ that text needs to be. We then briefly covered configuring and running a training session. Please see [our paper](https://arxiv.org/abs/2006.03659) and [repo](https://github.com/JohnGiorgi/DeCLUTR) for more details, and don't hesitate to open an issue if you have any trouble!"
280 | ] 281 | } 282 | ] 283 | } -------------------------------------------------------------------------------- /declutr/dataset_reader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | from contextlib import contextmanager 4 | from typing import Any, Dict, Iterable, Iterator, List 5 | 6 | from allennlp.common.file_utils import cached_path 7 | from allennlp.data.dataset_readers import DatasetReader 8 | from allennlp.data.fields import Field, ListField, TextField 9 | from allennlp.data.instance import Instance 10 | from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer 11 | from allennlp.data.tokenizers import PretrainedTransformerTokenizer, SpacyTokenizer, Tokenizer 12 | from overrides import overrides 13 | 14 | from declutr.common.contrastive_utils import sample_anchor_positive_pairs 15 | from declutr.common.util import sanitize_text 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | @DatasetReader.register("declutr") 21 | class DeCLUTRDatasetReader(DatasetReader): 22 | """ 23 | Read a text file containing one instance per line, and create a dataset suitable for a 24 | `DeCLUTR` model. 25 | 26 | The output of `read` is a list of `Instance` s with the field: 27 | tokens : `ListField[TextField]` 28 | if `num_anchors > 0`, else: 29 | tokens : `TextField` 30 | 31 | Registered as a `DatasetReader` with name "declutr". 32 | 33 | # Parameters 34 | 35 | tokenizer : `Tokenizer`, optional (default = `{"tokens": SpacyTokenizer()}`) 36 | Tokenizer to use to split the input text into words or other kinds of tokens. 37 | token_indexers : `Dict[str, TokenIndexer]`, optional 38 | We use this to define the input representation for the text. See :class:`TokenIndexer`. 39 | num_anchors : `int`, optional 40 | The number of spans to sample from each instance to serve as anchors. 41 | num_positives : `int`, optional 42 | The number of spans to sample from each instance to serve as positive examples (per anchor). 43 | Has no effect if `num_anchors` is not provided. 44 | max_span_len : `int`, optional 45 | The maximum length of spans (after tokenization) which should be sampled. Has no effect if 46 | `num_anchors` is not provided. 47 | min_span_len : `int`, optional 48 | The minimum length of spans (after tokenization) which should be sampled. Has no effect if 49 | `num_anchors` is not provided. 50 | sampling_strategy : `str`, optional (default = None) 51 | One of "subsuming" or "adjacent". If "subsuming," positive spans are always subsumed by the 52 | anchor. If "adjacent", positive spans are always adjacent to the anchor. If not provided, 53 | positives may be subsumed, adjacent to, or overlapping with the anchor. Has no effect if 54 | `num_anchors` is not provided. 55 | """ 56 | 57 | def __init__( 58 | self, 59 | tokenizer: Tokenizer = None, 60 | token_indexers: Dict[str, TokenIndexer] = None, 61 | num_anchors: int = None, 62 | num_positives: int = None, 63 | max_span_len: int = None, 64 | min_span_len: int = None, 65 | sampling_strategy: str = None, 66 | **kwargs, 67 | ) -> None: 68 | super().__init__(**kwargs) 69 | self._tokenizer = tokenizer or SpacyTokenizer() 70 | self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} 71 | 72 | # If the user provided us with a number of anchors to sample, we automatically 73 | # check that the other expected values are provided and valid. 
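# A note on scale (cross-referencing the training notebook in this repo): each document needs
# roughly num_anchors * max_span_len * 2 tokens so that anchors and their positives can be
# sampled without running off the end of the text. For example, num_anchors=2 with
# max_span_len=512 implies a minimum document length of about 2048 tokens.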
74 | if num_anchors is not None: 75 | self._num_anchors = num_anchors 76 | self.sample_spans = True 77 | if num_positives is None: 78 | raise ValueError("num_positives must be provided if num_anchors is not None.") 79 | if max_span_len is None: 80 | raise ValueError("max_span_len must be provided if num_anchors is not None.") 81 | if min_span_len is None: 82 | raise ValueError("min_span_len must be provided if num_anchors is not None.") 83 | self._num_positives = num_positives 84 | self._max_span_len = max_span_len 85 | self._min_span_len = min_span_len 86 | self._sampling_strategy = ( 87 | sampling_strategy.lower() if sampling_strategy is not None else sampling_strategy 88 | ) 89 | if ( 90 | self.sample_spans 91 | and self._sampling_strategy is not None 92 | and self._sampling_strategy not in ["subsuming", "adjacent"] 93 | ): 94 | raise ValueError( 95 | ( 96 | 'sampling_strategy must be one of ["subsuming", "adjacent"].' 97 | f" Got {self._sampling_strategy}." 98 | ) 99 | ) 100 | else: 101 | self.sample_spans = False 102 | 103 | @property 104 | def sample_spans(self) -> bool: 105 | return self._sample_spans 106 | 107 | @sample_spans.setter 108 | def sample_spans(self, sample_spans: bool) -> None: 109 | self._sample_spans = sample_spans 110 | 111 | @contextmanager 112 | def no_sample(self) -> Iterator[None]: 113 | """A context manager that temporarily disables sampling of spans. Useful at test time when 114 | we want to embed unseen text. 115 | """ 116 | prev = self.sample_spans 117 | self.sample_spans = False 118 | yield 119 | self.sample_spans = prev 120 | 121 | @overrides 122 | def _read(self, file_path: str) -> Iterable[Instance]: 123 | # if `file_path` is a URL, redirect to the cache 124 | file_path = cached_path(file_path) 125 | 126 | with open(file_path, "r") as data_file: 127 | logger.info("Reading instances from lines in file at: %s", file_path) 128 | 129 | # If we are sampling spans (i.e. we are training) we need to shuffle the data so that 130 | # we don't yield instances in the same order every epoch. Our current solution is to 131 | # read the entire file into memory. This is a little expensive (roughly 1G per 1 million 132 | # docs), so a better solution might be required down the line. 133 | data: Iterable[Any] = [] 134 | if self.sample_spans: 135 | data = list(enumerate(data_file)) 136 | random.shuffle(data) 137 | data = iter(data) 138 | else: 139 | data = enumerate(data_file) 140 | 141 | for _, text in data: 142 | yield self.text_to_instance(text) 143 | 144 | @overrides 145 | def text_to_instance(self, text: str) -> Instance: # type: ignore 146 | """ 147 | # Parameters 148 | 149 | text : `str`, required. 150 | The text to process. 151 | 152 | # Returns 153 | 154 | An `Instance` containing the following fields: 155 | - anchors (`Union[TextField, ListField[TextField]]`) : 156 | If `self.sample_spans`, this will be a `ListField[TextField]` object, containing 157 | each anchor span sampled from `text`. Otherwise, this will be a `TextField` object 158 | containing the tokenized `text`. 159 | - positives (`ListField[TextField]`) : 160 | If `self.sample_spans`, this will be a `ListField[TextField]` object, containing 161 | each positive span sampled from `text`. Otherwise this field will not be included 162 | in the returned `Instance`. 163 | """ 164 | # Some very minimal preprocessing to remove whitespace, newlines and tabs. 165 | # We peform it here as it will cover both training and predicting with the model. 
166 | # We DON'T lowercase by default, but rather allow `self._tokenizer` to decide. 167 | text = sanitize_text(text, lowercase=False) 168 | 169 | fields: Dict[str, Field] = {} 170 | if self.sample_spans: 171 | if isinstance(self._tokenizer, PretrainedTransformerTokenizer): 172 | # We add a space in front of the text in order to achieve consistent tokenization with 173 | # certain tokenizers, e.g. the BPE tokenizer used by RoBERTa, GPT and others. 174 | # See: https://github.com/huggingface/transformers/issues/1196 175 | text = f" {text.lstrip()}" 176 | tokenization_func = self._tokenizer.tokenizer.tokenize 177 | # A call to the `tokenize` method of the AllenNLP tokenizer causes 178 | # subsequent calls to the underlying HuggingFace Tokenizer (if `use_fast`) 179 | # to truncate text. Reset the truncation each time here. 180 | # Note this only appears to happen for transformers<3.1 181 | if self._tokenizer.tokenizer.is_fast: 182 | self._tokenizer.tokenizer._tokenizer.no_truncation() 183 | else: 184 | tokenization_func = None 185 | # Choose the anchor/positives at random. 186 | anchor_spans, positive_spans = sample_anchor_positive_pairs( 187 | text=text, 188 | num_anchors=self._num_anchors, 189 | num_positives=self._num_positives, 190 | max_span_len=self._max_span_len, 191 | min_span_len=self._min_span_len, 192 | sampling_strategy=self._sampling_strategy, 193 | tokenizer=tokenization_func, 194 | ) 195 | 196 | anchors: List[Field] = [] 197 | for span in anchor_spans: 198 | # Sampled spans have already been tokenized and joined by whitespace. 199 | # We need to convert them back to a string to use the AllenNLP tokenizer. 200 | # It would be simpler to use convert_tokens_to_string, but we can't guarantee 201 | # this method is implemented for all HuggingFace Tokenizers. 202 | anchor_text = self._tokenizer.tokenizer.decode( 203 | self._tokenizer.tokenizer.convert_tokens_to_ids(span.split()) 204 | ) 205 | tokens = self._tokenizer.tokenize(anchor_text) 206 | anchors.append(TextField(tokens, self._token_indexers)) 207 | fields["anchors"] = ListField(anchors) 208 | positives: List[Field] = [] 209 | for span in positive_spans: 210 | positive_text = self._tokenizer.tokenizer.decode( 211 | self._tokenizer.tokenizer.convert_tokens_to_ids(span.split()) 212 | ) 213 | tokens = self._tokenizer.tokenize(positive_text) 214 | positives.append(TextField(tokens, self._token_indexers)) 215 | fields["positives"] = ListField(positives) 216 | else: 217 | tokens = self._tokenizer.tokenize(text) 218 | fields["anchors"] = TextField(tokens, self._token_indexers) 219 | return Instance(fields) 220 | -------------------------------------------------------------------------------- /declutr/modules/token_embedders/pretrained_transformer_embedder_mlm.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple, Union 2 | 3 | import torch 4 | from allennlp.data.tokenizers import PretrainedTransformerTokenizer 5 | from allennlp.modules.scalar_mix import ScalarMix 6 | from allennlp.modules.token_embedders import PretrainedTransformerEmbedder 7 | from allennlp.modules.token_embedders.token_embedder import TokenEmbedder 8 | from overrides import overrides 9 | from transformers import AutoConfig, AutoModelForMaskedLM 10 | 11 | 12 | @TokenEmbedder.register("pretrained_transformer_mlm") 13 | class PretrainedTransformerEmbedderMLM(PretrainedTransformerEmbedder): 14 | """ 15 | This is a wrapper around `PretrainedTransformerEmbedder` that allows us to train against
a 16 | masked language modelling objective while we are embedding text. 17 | 18 | Registered as a `TokenEmbedder` with name "pretrained_transformer_mlm". 19 | 20 | # Parameters 21 | 22 | model_name : `str` 23 | The name of the `transformers` model to use. Should be the same as the corresponding 24 | `PretrainedTransformerIndexer`. 25 | max_length : `int`, optional (default = `None`) 26 | If positive, folds input token IDs into multiple segments of this length, pass them 27 | through the transformer model independently, and concatenate the final representations. 28 | Should be set to the same value as the `max_length` option on the 29 | `PretrainedTransformerIndexer`. 30 | sub_module: `str`, optional (default = `None`) 31 | The name of a submodule of the transformer to be used as the embedder. Some transformers naturally act 32 | as embedders such as BERT. However, other models consist of encoder and decoder, in which case we just 33 | want to use the encoder. 34 | train_parameters: `bool`, optional (default = `True`) 35 | If this is `True`, the transformer weights get updated during training. 36 | last_layer_only: `bool`, optional (default = `True`) 37 | When `True` (the default), only the final layer of the pretrained transformer is taken 38 | for the embeddings. But if set to `False`, a scalar mix of all of the layers 39 | is used. 40 | gradient_checkpointing: `bool`, optional (default = `None`) 41 | Enable or disable gradient checkpointing. 42 | tokenizer_kwargs: `Dict[str, Any]`, optional (default = `None`) 43 | Dictionary with 44 | [additional arguments](https://github.com/huggingface/transformers/blob/155c782a2ccd103cf63ad48a2becd7c76a7d2115/transformers/tokenization_utils.py#L691) 45 | for `AutoTokenizer.from_pretrained`. 46 | transformer_kwargs: `Dict[str, Any]`, optional (default = `None`) 47 | Dictionary with 48 | [additional arguments](https://github.com/huggingface/transformers/blob/155c782a2ccd103cf63ad48a2becd7c76a7d2115/transformers/modeling_utils.py#L253) 49 | for `AutoModel.from_pretrained`. 50 | masked_language_modeling: `bool`, optional (default = `True`) 51 | If this is `True` and `masked_lm_labels is not None` in the call to `forward`, the model 52 | will be trained against a masked language modelling objective and the resulting loss will 53 | be returned along with the output tensor. 54 | """ # noqa: E501 55 | 56 | def __init__( 57 | self, 58 | model_name: str, 59 | *, 60 | max_length: int = None, 61 | sub_module: str = None, 62 | train_parameters: bool = True, 63 | last_layer_only: bool = True, 64 | override_weights_file: Optional[str] = None, 65 | override_weights_strip_prefix: Optional[str] = None, 66 | gradient_checkpointing: Optional[bool] = None, 67 | tokenizer_kwargs: Optional[Dict[str, Any]] = None, 68 | transformer_kwargs: Optional[Dict[str, Any]] = None, 69 | masked_language_modeling: bool = True, 70 | ) -> None: 71 | TokenEmbedder.__init__(self) # Call the base class constructor 72 | tokenizer = PretrainedTransformerTokenizer(model_name, tokenizer_kwargs=tokenizer_kwargs) 73 | self.masked_language_modeling = masked_language_modeling 74 | 75 | if self.masked_language_modeling: 76 | self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True) 77 | # We only need access to the HF tokenizer if we are masked language modeling 78 | self.tokenizer = tokenizer.tokenizer 79 | # The only differences when masked language modeling are: 80 | # 1) `output_hidden_states` must be True to get access to token embeddings. 
81 | # 2) We need to use `AutoModelForMaskedLM` to get the correct model 82 | self.transformer_model = AutoModelForMaskedLM.from_pretrained( 83 | model_name, config=self.config, **(transformer_kwargs or {}) 84 | ) 85 | # Eveything after the if statement (including the else) is copied directly from: 86 | # https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/pretrained_transformer_embedder.py 87 | else: 88 | from allennlp.common import cached_transformers 89 | 90 | self.transformer_model = cached_transformers.get( 91 | model_name, True, override_weights_file, override_weights_strip_prefix 92 | ) 93 | self.config = self.transformer_model.config 94 | 95 | if gradient_checkpointing is not None: 96 | self.transformer_model.config.update({"gradient_checkpointing": gradient_checkpointing}) 97 | 98 | if sub_module: 99 | assert hasattr(self.transformer_model, sub_module) 100 | self.transformer_model = getattr(self.transformer_model, sub_module) 101 | self._max_length = max_length 102 | 103 | # I'm not sure if this works for all models; open an issue on github if you find a case 104 | # where it doesn't work. 105 | self.output_dim = self.config.hidden_size 106 | 107 | self._scalar_mix: Optional[ScalarMix] = None 108 | if not last_layer_only: 109 | self._scalar_mix = ScalarMix(self.config.num_hidden_layers) 110 | self.config.output_hidden_states = True 111 | 112 | self._num_added_start_tokens = len(tokenizer.single_sequence_start_tokens) 113 | self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens) 114 | self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens 115 | 116 | if not train_parameters: 117 | for param in self.transformer_model.parameters(): 118 | param.requires_grad = False 119 | 120 | @overrides 121 | def forward( 122 | self, 123 | token_ids: torch.LongTensor, 124 | mask: torch.BoolTensor, 125 | type_ids: Optional[torch.LongTensor] = None, 126 | segment_concat_mask: Optional[torch.BoolTensor] = None, 127 | masked_lm_labels: Optional[torch.LongTensor] = None, 128 | ) -> Union[Tuple[torch.FloatTensor, torch.Tensor], torch.Tensor]: # type: ignore 129 | """ 130 | # Parameters 131 | 132 | token_ids: `torch.LongTensor` 133 | Shape: `[batch_size, num_wordpieces if max_length is None else num_segment_concat_wordpieces]`. 134 | num_segment_concat_wordpieces is num_wordpieces plus special tokens inserted in the 135 | middle, e.g. the length of: "[CLS] A B C [SEP] [CLS] D E F [SEP]" (see indexer logic). 136 | mask: `torch.BoolTensor` 137 | Shape: [batch_size, num_wordpieces]. 138 | type_ids: `Optional[torch.LongTensor]` 139 | Shape: `[batch_size, num_wordpieces if max_length is None else num_segment_concat_wordpieces]`. 140 | segment_concat_mask: `Optional[torch.BoolTensor]` 141 | Shape: `[batch_size, num_segment_concat_wordpieces]`. 142 | masked_lm_labels: `Optional[torch.LongTensor]` 143 | Shape: `[batch_size, num_wordpieces]`. 144 | 145 | # Returns: 146 | 147 | If `self.masked_language_modeling`, returns a `Tuple` of the masked language modeling loss 148 | and a `torch.Tensor` of shape: `[batch_size, num_wordpieces, embedding_size]`. Otherwise, 149 | returns only the `torch.Tensor` of shape: `[batch_size, num_wordpieces, embedding_size]`. 150 | """ 151 | # Some of the huggingface transformers don't support type ids at all and crash when you supply 152 | # them. For others, you can supply a tensor of zeros, and if you don't, they act as if you did. 
153 | # There is no practical difference to the caller, so here we pretend that one case is the same 154 | # as another case. 155 | if type_ids is not None: 156 | max_type_id = type_ids.max() 157 | if max_type_id == 0: 158 | type_ids = None 159 | else: 160 | if max_type_id >= self._number_of_token_type_embeddings(): 161 | raise ValueError("Found type ids too large for the chosen transformer model.") 162 | assert token_ids.shape == type_ids.shape 163 | 164 | fold_long_sequences = self._max_length is not None and token_ids.size(1) > self._max_length 165 | if fold_long_sequences: 166 | batch_size, num_segment_concat_wordpieces = token_ids.size() 167 | token_ids, segment_concat_mask, type_ids = self._fold_long_sequences( 168 | token_ids, segment_concat_mask, type_ids 169 | ) 170 | 171 | transformer_mask = segment_concat_mask if self._max_length is not None else mask 172 | # Shape: [batch_size, num_wordpieces, embedding_size], 173 | # or if self._max_length is not None: 174 | # [batch_size * num_segments, self._max_length, embedding_size] 175 | 176 | # We call this with kwargs because some of the huggingface models don't have the 177 | # token_type_ids parameter and fail even when it's given as None. 178 | # Also, as of transformers v2.5.1, they are taking FloatTensor masks. 179 | parameters = {"input_ids": token_ids, "attention_mask": transformer_mask.float()} # type: ignore 180 | if type_ids is not None: 181 | parameters["token_type_ids"] = type_ids 182 | if masked_lm_labels is not None and self.masked_language_modeling: 183 | parameters["labels"] = masked_lm_labels 184 | 185 | masked_lm_loss = None 186 | transformer_output = self.transformer_model(**parameters) 187 | 188 | if self.config.output_hidden_states: 189 | # Even if masked_language_modeling is True, we may not be masked language modeling on 190 | # the current batch. Check if masked language modeling labels are present in the input. 
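# A note on the indexing below: this code assumes the tuple outputs returned by older versions
# of the transformers library. Under that assumption, when "labels" are passed to an
# AutoModelForMaskedLM, the first element of the output is the masked LM loss, and because
# output_hidden_states is enabled, the last element is the tuple of hidden states (embedding
# layer at index 0, final transformer layer at index -1).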
191 | if "labels" in parameters: 192 | masked_lm_loss = transformer_output[0] 193 | 194 | if self._scalar_mix: 195 | embeddings = self._scalar_mix(transformer_output[-1][1:]) 196 | else: 197 | embeddings = transformer_output[-1][-1] 198 | else: 199 | embeddings = transformer_output[0] 200 | 201 | if fold_long_sequences: 202 | embeddings = self._unfold_long_sequences( 203 | embeddings, segment_concat_mask, batch_size, num_segment_concat_wordpieces 204 | ) 205 | 206 | return masked_lm_loss, embeddings 207 | -------------------------------------------------------------------------------- /declutr/model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | import torch 4 | import torch.distributed as dist 5 | from allennlp.common import util 6 | from allennlp.data import TextFieldTensors, Vocabulary 7 | from allennlp.models.model import Model 8 | from allennlp.modules import FeedForward, Seq2VecEncoder, TextFieldEmbedder 9 | from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder 10 | from allennlp.nn import InitializerApplicator 11 | from allennlp.nn.util import get_text_field_mask 12 | 13 | from declutr.common.masked_lm_utils import mask_tokens 14 | from declutr.common.model_utils import all_gather_anchor_positive_pairs, unpack_batch 15 | from declutr.losses import PyTorchMetricLearningLoss 16 | from declutr.miners import PyTorchMetricLearningMiner 17 | 18 | 19 | @Model.register("declutr") 20 | class DeCLUTR(Model): 21 | """ 22 | This `Model` implements a text encoder trained against a contrastive, self-supervised objective. 23 | After embedding the text with the `text_field_embedder`, the resulting sequence of embeddings 24 | is pooled using a `Seq2VecEncoder` and then optionally passed to a `FeedForward` layer, which 25 | projects the embeddings to a certain size. 26 | 27 | Registered as a `Model` with name "declutr". 28 | 29 | # Parameters 30 | 31 | vocab : `Vocabulary` 32 | text_field_embedder : `TextFieldEmbedder` 33 | Used to embed the input text. 34 | seq2vec_encoder : `Seq2VecEncoder`, optional, (default = `None`) 35 | Seq2Vec encoder layer. This encoder operates directly on the output of the 36 | `text_field_embedder` and pools it into a single vector per instance. 37 | If `None`, defaults to `BagOfEmbeddingsEncoder` with `averaged=True`. 38 | feedforward : `FeedForward`, optional, (default = `None`). 39 | An optional feedforward layer to apply after the seq2vec_encoder. 40 | loss : `PyTorchMetricLearningLoss`, optional (default = `None`). 41 | An optional metric learning loss function. Will be combined with the masked language 42 | modeling objective if 43 | `text_field_embedder.token_embedders["tokens"].masked_language_modeling` is True. Must be 44 | provided if `text_field_embedder.token_embedders["tokens"].masked_language_modeling` is 45 | False. See https://kevinmusgrave.github.io/pytorch-metric-learning/losses/ for a list of 46 | available loss functions. 47 | miner: `PyTorchMetricLearningMiner`, optional (default = `None`). 48 | An optional mining function which will mine hard negatives from each batch before computing 49 | the loss. See https://kevinmusgrave.github.io/pytorch-metric-learning/miners/ for a list 50 | of available mining functions. 51 | initializer : `InitializerApplicator`, optional (default=`InitializerApplicator()`) 52 | If provided, will be used to initialize the model parameters.
53 | """ 54 | 55 | def __init__( 56 | self, 57 | vocab: Vocabulary, 58 | text_field_embedder: TextFieldEmbedder, 59 | seq2vec_encoder: Optional[Seq2VecEncoder] = None, 60 | feedforward: Optional[FeedForward] = None, 61 | miner: Optional[PyTorchMetricLearningMiner] = None, 62 | loss: Optional[PyTorchMetricLearningLoss] = None, 63 | scale_fix: bool = True, 64 | initializer: InitializerApplicator = InitializerApplicator(), 65 | **kwargs, 66 | ) -> None: 67 | 68 | super().__init__(vocab, **kwargs) 69 | self._text_field_embedder = text_field_embedder 70 | # Prevents the user from having to specify the tokenizer / masked language modeling 71 | # objective. In the future it would be great to come up with something more elegant. 72 | token_embedder = self._text_field_embedder._token_embedders["tokens"] 73 | self._masked_language_modeling = token_embedder.masked_language_modeling 74 | if self._masked_language_modeling: 75 | self._tokenizer = token_embedder.tokenizer 76 | 77 | # Default to mean BOW pooler. This performs well and so it serves as a sensible default. 78 | self._seq2vec_encoder = seq2vec_encoder or BagOfEmbeddingsEncoder( 79 | text_field_embedder.get_output_dim(), averaged=True 80 | ) 81 | self._feedforward = feedforward 82 | 83 | self._miner = miner 84 | self._loss = loss 85 | if self._loss is None and not self._masked_language_modeling: 86 | raise ValueError( 87 | ( 88 | "No loss function provided. You must provide a contrastive loss (DeCLUTR.loss)" 89 | " and/or specify `masked_language_modeling=True` in the config when training." 90 | ) 91 | ) 92 | # There was a small bug in the original implementation that caused gradients derived from 93 | # the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during 94 | # training. This has been fixed. To reproduce results from the paper, set `model.scale_fix` 95 | # to `False` in your config. Note that this will have no effect if you are not using 96 | # distributed training with more than 1 GPU. 97 | self._scale_fix = scale_fix 98 | initializer(self) 99 | 100 | def forward( # type: ignore 101 | self, anchors: TextFieldTensors, positives: TextFieldTensors = None 102 | ) -> Dict[str, torch.Tensor]: 103 | 104 | """ 105 | # Parameters 106 | 107 | tokens : TextFieldTensors 108 | From a `TextField` 109 | 110 | # Returns 111 | 112 | An output dictionary consisting of: 113 | 114 | embeddings : torch.FloatTensor 115 | A tensor of shape `(batch_size, self._seq2vec_encoder.get_output_dim())`, which is the 116 | representation for the given `tokens` output by the encoder. The encoder is composed of: 117 | `self._text_field_embedder`, and `self._seq2vec_encoder`, in that order. 118 | projections : torch.FloatTensor 119 | A tensor of shape `(batch_size, self._feedforward.get_output_dim())`, which is the 120 | non-linear projection of the learned representation for the given `anchor_tokens` output 121 | by the projection head. This field will only be included if `self._feedforward` is not 122 | `None`. 123 | loss : torch.FloatTensor, optional 124 | A scalar loss to be optimized. 125 | """ 126 | output_dict: Dict[str, torch.Tensor] = {} 127 | 128 | # If multiple anchors were sampled, we need to unpack them. 129 | anchors = unpack_batch(anchors) 130 | # Mask anchor input ids and get labels required for MLM. 131 | if self.training and self._masked_language_modeling: 132 | anchors = mask_tokens(anchors, self._tokenizer) 133 | # This is the textual representation learned by a model and used for downstream tasks. 
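# Rough shape sketch (assuming unpack_batch folds the anchor dimension into the batch
# dimension): anchors arrive here as (batch_size * num_anchors, num_tokens), so
# embedded_anchors below has shape (batch_size * num_anchors, embedding_dim).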
134 | masked_lm_loss, embedded_anchors = self._forward_internal(anchors, output_dict) 135 | 136 | # If positives are supplied by DataLoader and we are training, compute a contrastive loss. 137 | if self.training: 138 | output_dict["loss"] = 0 139 | # TODO: We should throw a ValueError if no postives provided but loss is not None. 140 | if self._loss is not None: 141 | # Like the anchors, if we sampled multiple positives, we need to unpack them. 142 | positives = unpack_batch(positives) 143 | # Positives are represented by their mean embedding a la 144 | # https://arxiv.org/abs/1902.09229. 145 | _, embedded_positives = self._forward_internal(positives) 146 | # Shape: (num_anchors, num_positives_per_anchor, embedding_dim) 147 | embedded_positives = torch.reshape( 148 | embedded_positives, 149 | (embedded_anchors.size(0), -1, embedded_anchors.size(-1)), 150 | ) 151 | # Shape: (num_anchors, embedding_dim) 152 | embedded_positives = torch.mean(embedded_positives, dim=1) 153 | 154 | # If we are training on multiple GPUs using DistributedDataParallel, then a naive 155 | # application would result in 2 * (batch_size/n_gpus - 1) number of negatives per 156 | # GPU. To avoid this, we need to gather the anchors/positives from each replica on 157 | # every other replica in order to generate the correct number of negatives, 158 | # i.e. 2 * (batch_size - 1), before computing the contrastive loss. 159 | embedded_anchors, embedded_positives = all_gather_anchor_positive_pairs( 160 | embedded_anchors, embedded_positives 161 | ) 162 | # Get embeddings into the format that the PyTorch Metric Learning library expects 163 | # before computing the loss (with an optional mining step). 164 | embeddings, labels = self._loss.get_embeddings_and_labels( 165 | embedded_anchors, embedded_positives 166 | ) 167 | indices_tuple = self._miner(embeddings, labels) if self._miner is not None else None 168 | contrastive_loss = self._loss(embeddings, labels, indices_tuple) 169 | # Loss needs to be scaled by world size when using DistributedDataParallel 170 | # See: https://amsword.medium.com/gradient-backpropagation-with-torch-distributed-all-gather-9f3941a381f8 171 | if util.is_distributed() and self._scale_fix: 172 | contrastive_loss *= dist.get_world_size() 173 | output_dict["loss"] += contrastive_loss 174 | # Loss may be derived from contrastive objective, MLM objective or both. 175 | if masked_lm_loss is not None: 176 | output_dict["loss"] += masked_lm_loss 177 | 178 | return output_dict 179 | 180 | def _forward_internal( 181 | self, 182 | tokens: TextFieldTensors, 183 | output_dict: Optional[Dict[str, torch.Tensor]] = None, 184 | ) -> torch.Tensor: 185 | 186 | masked_lm_loss, embedded_text = self._text_field_embedder(tokens) 187 | mask = get_text_field_mask(tokens).float() 188 | 189 | embedded_text = self._seq2vec_encoder(embedded_text, mask=mask) 190 | # Don't hold on to embeddings or projections during training. 191 | if output_dict is not None and not self.training: 192 | output_dict["embeddings"] = embedded_text.clone().detach() 193 | 194 | # Representations produced by a non-linear projection can be used for training with a 195 | # contrastive loss. Previous works in computer vision have found this projection head to 196 | # improve the quality of the learned embeddings (see: https://arxiv.org/abs/2002.05709). 197 | # When embedding text with a trained model, we want the representation produced by the 198 | # encoder network. 
We therefore call these vectors "projections" to distinguish them from 199 | # the "embeddings". 200 | if self._feedforward is not None: 201 | embedded_text = self._feedforward(embedded_text) 202 | if output_dict is not None and not self.training: 203 | output_dict["projections"] = embedded_text.clone().detach() 204 | 205 | return masked_lm_loss, embedded_text 206 | 207 | default_predictor = "declutr" 208 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /tests/fixtures/data/openwebtext/valid.txt: -------------------------------------------------------------------------------- 1 | Labour and the Greens say they're committed to running budget surpluses, paying debt and keeping spending down. The two parties will present their 'Budget Responsibility Rules' to business leaders, economists, NGOs and academics on Friday morning. 
"People want to know the kinds of principles that we're going to manage the books by," Labour leader Andrew Little told The AM Show. The two parties' economic policies have differed in the past, and while they'll still have separate policies, the rules are a "framework" for working together in government, said Labour finance spokesman Grant Robertson. Among the rules are: running surpluses (barring major economic shocks or natural disasters) keep Crown spending to about 30 percent of GDP a progressive, fairer tax system reduce net debt to 20 percent of GDP in five years increase investment in superannuation, climate change and infrastructure. "We're committed to running surpluses over an economic cycle," says Mr Little. "We're going to be responsible with the government budget. There's stuff we've already committed to - we've got to fix housing, we've got to fix health, we've got to fix education." Crown spending currently is at 30 percent of GDP, according to Treasury figures. The 2016 Budget predicted it would drop to about 28 percent by 2020, under National. It peaked in 2011, in the wake of the global financial crisis, at around 34 percent. The last time Labour was in government, Finance Minister Michael Cullen ran up a series of significant surpluses, but came under fire for not cutting taxes. No tax increases are planned if Labour does win the upcoming election. "Every commitment we've made we can fund out of existing tax revenue," said Mr Little. The Greens went into the last election promising a new top tax rate of 40 percent on income over $140,000. Co-leader James Shaw wouldn't say if that would be the case this year, saying the party's focus wouldn't be on income tax. "A lot of people don't know this, but at the last election we actually promised a tax cut for 97 percent of New Zealanders," he told The AM Show. "That was going to be funded out of a tax on pollution that causes climate change." A capital gains tax would likely be included in the Greens' economic policy, but not Labour's. "It won't be exactly the same as what we've said in the past," said Mr Shaw. Coalition negotiations As for who'll take what job should the left bloc win the election, Mr Shaw says it's wide open. "The largest party in government is guaranteed the Prime Minister and Minister of Finance. Everything else depends on what happens on election night." 2 | DOWNEY, Calif. When Mario Guerra strolls through the streets of downtown Downey, he cant help but play the role of seasoned salesman for the city in southeastern Los Angeles County he adopted more than 35 years ago. Guerra, a Cuban-American immigrant who served eight years on the city council and two terms as mayor, sings the praises of Portos Bakery, a Cuban sandwich and pastry shop that he helped lure to Downey. He shows off the vibrant murals and sleekly designed street sculpture that he commissioned as a city leader to spruce up the downtown area and imbue it with a sense of culture and character. Subsequently, an acquaintance stops Guerra, known about town for his role as a Catholic deacon as well as consummate problem-solver, to ask if Guerra would perform his wedding at the towns annual Dia de los Muertes festival. Guerra, a Republican in a city that is 70 percent Hispanic and leans Democratic, attributes his success as a GOP politician to a laser-like focus on finding solutions, building bridges and approaching every problem with a kind of neighborly compassion. 
Its how Guerra says he came within 5 percentage points of his opponent in his race for a state Senate seat last fall in a district where Democrats have a 24-point voter registration advantage I govern in that 60 to 70 percent where I feel that we can all agree and say, Lets just make things better, he said. Were not going to agree on everything but we can agree on this stuff that we can fix, and its in that space that you can get things done. But to Guerras dismay, its a markedly different philosophy than the one that seems to have taken hold within the Republican Party at the national level in the 2016 presidential cycle, particularly with the rise of Donald Trump. Guerra has watched with a mixture of bewilderment and exasperation as the business mogul has ridden to the top of the polls in part by spewing barbed invective against illegal immigration, and branding Latino immigrants as criminals and rapists. In preparation of going on Spanish-language television to talk about the summer of Trump, Guerra brushed up to make sure he had one specific word in his arsenal payaso clown. For Californians, the narrative playing out at the national level has an air of dj vu. Twenty years ago, a similar wave of anti-immigrant sentiment washed over the Golden State, and voters responded by passing a ballot initiative that blocked undocumented immigrants from receiving a litany of critical state services, including public education and health care. 3 | Let's just get something out of the way: the Phillies are not a good baseball team. Ryan Howard is listed as the number one starter at first base on the depth chart, and it's not 2009. Freddy Galvis and Cesar Hernandez compose the middle infield; Cedric Hunter and Peter Bourjos are the starting corner outfielders. Those are five players that shouldn't be starters, starting, but this is the reality that the Phillies currently inhabit. Fortunately for Philadelphia, and their fans, they have already have the pieces in place to help them make a quantum leap. Through a series of trades, and good drafts, the Phillies are in a position to put a competitive baseball team on the field as early as 2017. Homegrown players Philadelphia has two players that have come through their own farm system and that could be ready as soon as this year: J.P. Crawford and Andrew Knapp. 2015 PA BB% K% ISO wOBA wRC+ Crawford (AA) 405 12.1% 11.1% .142 .348 121 Andrew Knapp (AA) 241 9.1% 17.8% .271 .465 200 Both players spent the majority of their 2015 seasons in AA, and they were fantastic. Knapp, a catcher, showcased more power and a better offensive line overall, but in no way should that diminish Crawford's campaign. FanGraphs rates his future value at 60 (out of 80), and he's expected to be the next shortstop prospect to join the surfeit of budding MLB stars at that position. He's starting the year at AA, but with a current wOBA of .421 and a wRC+ of 164 though 43 at-bats, he's likely going to join Knapp at AAA relatively soon. The Phillies don't want to start anyone's service clock sooner than they have to, but Knapp seems like a safe bet to see time at the major league level this year. Carlos Ruiz (likely) won't be playing for Philadelphia beyond this year, and Cameron Rupp (with his career wOBA of .282 and wRC+ of 75) isn't going to block Knapp from reaching the big leagues. The catcher's spot is up for grabs on the big league club, and Knapp has a chance to entrench himself as the starter. There's also Aaron Nola, who's already at the big league level. 
In the first 91.2 innings of his career, Nola's posted a K/9 of 8.35 and a BB/9 of 1.87, along with an ERA of 3.53, an FIP of 3.81, and an fWAR of 1.3. In two starts this year, he's striking out a significantly higher percentage of batters than he did in 2015, and has yet to walk anybody. He's under team control through the 2021 season, and barring an injury, Nola should be one of the Phillies' front line starters for years to come. Philadelphia also has Aaron Altherr (currently on the DL) and Roman Quinn moving up through their system. 4 | Is it possible to completely eliminate scent? Share this article Every day a new scent control product seems to hit the market. How do you make sense of the marketing tidal wave that hits you every time you turn on the T.V. or open a hunting magazine? Do scent control products actually work? Is it possible to completely eliminate odor? Maybe. In this article, I will uncover the answers to those questions and determine the best way to remain undetected in the field. How animals smell Animals have special membranes in their noses that pick up scent signatures in the air. In fact, deer and elk have more receptors in their noses than even dogs. Deer and elk can also use an additional organ in their mouth, called a vomeronasal organ, to detect smells. This allows them to detect multiple scents simultaneously and makes it possible for them to detect smells that are incredibly far away. This means that the proverbial deck is stacked against a hunter almost immediately because animals can smell you before you ever see them. Types of scent It will be easiest to understand scent control if we first understand scent in general. There are three types of scents that comprise the smells that an animal will pick up: natural, unnatural/foreign and odor. Natural Natural smells are those smells that naturally occur in its given environment. The smell of pine trees, the smell of a deer's coat, etc.; however, natural smells are not just those smells that are found in nature. Human beings also have a natural smell even though we do not actually smell it. This is because our brains recognize it as our own scent and the way we are programmed, it becomes useless information that does not register for us. But that does not mean that it is not there. I could shower with scentless soap and scentless shampoo and I would still have a natural scent. Natural scents cannot be removed; they can only be masked. Unnatural/foreign Unnatural or foreign smells are those that are not naturally occurring. Scented detergents might be mountain fresh, but in the woods in the fall that smell is unnatural. Coffee is an unnatural smell as well as vehicle exhaust. Unnatural scents can be removed with little effort. Simply by being aware of these unnatural or foreign smells is enough to make changes that can drastically reduce or eliminate them. Odor Odor is a specific type of scent because it is caused by the growth of bacteria in an environment. Sweat does not smell like body odor until it stays in a dark, wet area of your body (i.e., your armpit) for an extended period of time and bacteria begins to grow. 5 | Steam Hammer is the first hardcore sandbox-style RPG set in a dark and mysterious steampunk world. Experience the intensity as you try to survive on the mysterious Acribo Islands. Steam Hammer features: a classic Victorian steampunk setting with wondrous mechanisms, machinery, weapons, armor, clothing, andof coursesteam and smoke. 
an open-class system that frees you from arbitrary constraints. Engineer, scientist, farmer, gunsmith, stormtrooper, sharpshooter, and more can all be combined and switched depending on your skill set. a huge open world for you to explore, travel, and terraform. Go where you will and master the land. Craft your Glory. Craft your Victory. Craft your Steam Hammer! Updates: Update #1 Thank you! Update #2 Update #3 Update #4 Two Weeks Left Update #5 Final week Update #6 We are very grateful to all our backers who spread the word about our campaign, and now we can offer you something extra in return as a token of our appreciation. Now with KickBooster you can share our campaign with your friends and you'll get 11% of every dollar you help raise. Click here for more info. The Victorian Empire was at the height of its power. It dominated its neighbors and basked in glory. The Victorians made remarkable technological achievements in steam-powered machinery. Imperial airships flew over the many lands and provinces under its control. The foundation of the Empires power was the celebrium trees growing on the Acribo Islands. Its precious sap fueled its technological wonders, but the empire had a dark secret. Harvesting the sap of the trees drove the workers mad. Day after day, the Acribian laborers slaved away in nightmarish conditions, all for the sake of harvesting the cursed tree. A storm was brewing that could not be seen from above, and one day, everything changed. The ancient gods of the Acribo Islands returned and took revenge on the Victorians exploiting their people. Devastating cataclysms struck the land. Having suffered long under the yoke of the Empire, the Acribians rebelled in a war that nearly drowned the country in blood, and they declared their independence. The world was never the same. Time passed. The crisis was over, but the Empires greatness was gone. So was its industry, and the lives of so many of its citizens. It lost its access to the celebrium trees. But now, Victoria is healing its wounds and it is time to take action. All that remains of its air force is mustering and the Imperial airships are headed West. This is where your story begins. Bring the Empire back to its former glory! You must win whatever it takes. 6 | -------------------------------------------------------------------------------- /tests/fixtures/data/openwebtext/train.txt: -------------------------------------------------------------------------------- 1 | Just two months into the baseball season, the Cubs are exceeding fans' expectations and creating a national buzz with one of the best records in the National League. But, quietly, progress also is being made off the field as team Chairman Tom Ricketts methodically moves to gain further control of the neighborhood streets just beyond the walls of Wrigley Field and win his battle with the rooftop businesses that help define the Wrigley vibe. His latest coup came earlier this month, records show, when an entity controlled by the Ricketts family bought three more rooftop buildings on Sheffield Avenue, bringing to six the number the family now owns. The Ricketts family paid Sheffield Finance an undisclosed price for the buildings at 3637 N. Sheffield which was torn down and rebuilt just for the rooftop business 3617 N. Sheffield and 3619 N. Sheffield, Cook County property records show. The Ricketts family will assume no debt on the buildings. A sale was expected after a federal judge last month dismissed a foreclosure lawsuit against the businesses. 
In November, Fifth Third Bank sued the rooftop operations and their owners, alleging that the businesses owed more than $18 million on mortgages and missed payments. Sheffield Finance later bought a portion of the debt and replaced the bank as plaintiffs in the case. Sheffield Finance is an entity owned by Jerry Lasky and Murray Peretz, partners in Spectrum Real Estate, a Chicago commercial real estate business. "I always felt these rooftops were an extension of Wrigley Field and they belong with the Ricketts family," said Lasky, who called himself a diehard Cubs fan. "It was a natural fit." The Rickettses have held a financial interest in a seventh rooftop operation, Down the Line Rooftop, since 2010. Earlier this year, George Loukas, who helped start the rooftop craze years ago, sold two buildings while James Lourgos and his partners sold another. Just three rooftop businesses on Sheffield aren't owned by the Ricketts family now: Murphy's Rooftop, at the corner of Waveland and Sheffield and above Murphy's Bleachers, and Skybox on Sheffield and Lakeview Baseball Club, which have sued the team in federal court. That suit is pending. According to records, Ricketts tried to buy all of the rooftop club properties shortly after acquiring the Cubs. Today, sports team owners search for new sources of revenue, and there are more changes on the horizon for the Cubs: Future plans include an open-air plaza, a nearby hotel and street fairs similar to the ones the Boston Red Sox host. A Cubs spokesman said Thursday that the newly acquired rooftops will be managed like the three bought in January through an agreement with Loukas, who owns popular bars in Wrigleyville and still has one rooftop business. Lasky said he admired that the team is investing more than $575 million into renovating the 101-year-old stadium and neighborhood, and praised the team's operation of Wrigley. He said he approached team executives there weren't other potential buyers a few weeks ago, completing the deal with Cubs Chairman Tom Ricketts and President of Business Operations Crane Kenney. Ricketts has said little about the team's plans for the rooftops other than that he plans to keep them in operation. In a statement, the Cubs said, "The Ricketts family has said in the past they are interested in reasonable opportunities to purchase rooftop property and are willing to pay a fair price. ... The rooftop situation has been a political and legal morass for more than a decade, and the Ricketts family will remain interested in opportunities which make sound business sense." The team's $375 million overhaul of the stadium continues. The 3,990-square-foot left field video board was ready for the Cubs' home opener, and the left-field bleachers opened earlier this month. 2 | Im surrounded by big spenders at family gatherings. My siblings, their spouses/partners, my uncles and cousins all seem to spend at least every dime they make. The thing is, these arent poor people struggling to get by. The poorest among them brings in probably around $65k/year, and the others are well into the six figures. So they all have two or three refrigerators. One for the kitchen, one for freezing meats and vegetables in the basement and one in the garage just full of beer and soda. They all have cars no more than 2 years old. They have houses with unused rooms filled with unused furniture. When Im with people one on one Ill often bring up the topic of personal finance and investing. So I know that none of these people have any savings what-so-ever. 
Theyre barely into their 30s so they think they dont need to plan for retirement yet. I think they used to all just think I didnt earn much income and was a struggling student or something. So they would rag on me and tell me where I can get a good deal on a much shinier car to replace my 10 year old sedan. Theyre offering 0% interest for the next six weeks, you should go! It would only be like $300/month, even you could swing that. Then, over the past six months or so, word has gotten out that I paid cash for a house. And that Im fixing to do it again a few times over in the next year. Yet I still drive the old sedan around. Theyre realizing their apparent financial superiority has been merely that; apparent. Now, when Im in ear shot, Ill hear things like, I play hard, but I work hard! or, Whats the point of earning it if youre not going to spend it??? and You only live once! I think they are feeling a bit guilty about their behavior and my mere presence is bringing it out. They know that TV they bought over a year ago that theyre still making payments on has lost its novelty. They know they should be putting some money away, at least for a typical retirement when theyre 67. They know they ought to be saving something. The fact that no one else is doing it though makes it easier for them to slide as well. Theyll all be on that sinking ship together at least. I think, if they were honest, their platitudes would sound more like: I know I should save some money for a rainy day, but damn that cars shiny! Or, Ive already resigned myself to working for the next 40 years, I may as well buy some crap that at least makes me happy for a few weeks. Or, Were actually in a contest to see who can spend the most on their daily transportation, I just got a little closer to the winners circle. Ive said my piece many times over the years. When someone mentions theyre thinking about getting a new car, I explain the vast cost savings in getting something at least slightly used that gets good mileage. And if not, I explain the advantages of saving up and paying cash rather than paying all those finance charges. Ive suggested to my brother when he was buying his house that, as a single guy, he didnt really need 4 bedrooms and to consider the cost of heating all those empty rooms through a New England winter. Ive recommended to everyone, without much success, that they at least make use of tax-advantaged retirement accounts. After a while of that I just started to get eye-rolls. Or anticipatory glances when someone brought up some financial topic. Realizing Im just blowing into the wind, now I just say, You guys know what Im going to say. You know it makes sense. But its your money; your future, do what you want with it. My parents are no better. They make a good income and they do a good job of spending it. They make 3 to 5 Caribbean trips every year. I dont think theyve ever not had a car payment. The two of them live in a 6 bedroom house. They order exotic meats through the mail. My dad gambles. They carry way too much insurance. 3 | Colorado cannabis is better than anything Amsterdam's got, and the medicine here is on par with, if not better than, what is coming out of California. While we already knew that here in Colorado, it was still cool to hear those words come out of High Times editor Danny Danko's mouth last night at the 2011 High Times Cannabis Cup award ceremony. I don't think anyone really knew what to expect from the Cannabis Cup. 
Even dispensary owners I spoke with beforehand had only an inkling. Were people really going to be able to light up? Would staffers be able to hand out meds to patients? What coalesced, though, turned out to be easily one of the most amazing cannabis events in Colorado short of passing Amendment 20 nearly twelve years ago. Continue Reading See a photo slide show from the Medical Cannabis Cup Inside Exdo was a miniature version of the massive KushCon II from last December, with dispensaries and bong shops setting up booths. The big difference was that the majority of the dispensaries this weekend were displaying real cannabis on their tables -- something KushCon frowned upon. Half of the large hall was given over to speakers, including talks from Danko on hash making and cultivation tips. All of that was cool, and it definitely had a more relaxed and patient-driven vibe than the corporate-feeling KushCon. But what really made the cup worthwhile was going on outside and down the alley. Marijuana Deals Near You In a warehouse not connected to Exdo, hundreds of medical marijuana patients lit up and created the largest hot box I have ever been a part of. Massive pillows of ganja smoke were billowing over the head of the security guard checking to make sure our wristbands were all in order. Inside was exactly what a cannabis convention should look like. Thick air, hazy eyes and bong load after bong load of smoke being blown around the huge room. Some dispensaries were giving out herb, others were playing it more sly and only displaying their ganja while puffing with patients a few steps away from their booths. At one booth, there was an at least five-foot glass bong being packed up for patients, while across the way, another group was filling up equally as tall Volcano bags and offering a hit to anyone who would walk by. The Cannasseur dispensary had one of the most creative booths, offering a plywood simulation of a first-class private jet lounge and having their budtenders dress like flight attendants. Cannasseur also had some the most delicious samples of herb, with the scantily-clad stewardesses handing out bong rips of Kurple Fantasy from sick 4.0 Glass micro tubes. The Clinic also had a unique setup, letting patients play on a homemade The Price Is Right-like Klinko board for coupons and specials. A lot of booths had oil rigs, so it was fun to walk around and try different waxes and budders -- though some booths neglectfully weren't wiping down pipes with sanitary wipes, and the thought of catching some crap from one of the hundreds of other puffers kept me away on occasion. Still, having that many people together all for cannabis, and to have our community recognized by High Times, was exciting. See a photo slide show from the Medical Cannabis Cup One strange thing, even for a guy in the media, was all of the media. It seems like around every corner, someone being followed by their pet documentary film crew. People seemed to love hamming it up for the cameras, especially when the crew from G4's Attack of the Show would walk by a booth of puffing patients. I also met a few people from a crew in town filming for National Geographic, as well as another independent documentary film. I know it's legal here, and we should have no shame in what we are doing -- but as attorney Warren Edson appropriately asked after the event: "You know those cameras were on, right? 4 | Diesel engines are starting to make a return in the United States but they have been massively popular for decades in Europe. 
Its not just economy cars, either: AMG has built a diesel engine, oil-burning Audis have won the grueling 24 Hours of Le Mans on several occasions and BMW offers a 5-Series with a triple-turbocharged 3.0-liter straight-six diesel worthy of a M badge on the trunk lid. 40 years ago, Peugeot and Mercedes-Benz were among the very few manufacturers that offered a diesel in a passenger car and the thought of a compression ignition engine mounted in the engine bay of a sports car was a daring one that only Mercedes was willing to dabble in with the C111-III, a 230-horsepower experimental sports car that was never given the green light for production. The oil crisis that rocked the 1970s convinced BMWs top brass to take a close look at the diesel engine as a good compromise between power and fuel economy. A team of engineers tasked with studying oil-burning engines was formed at the firms Munich, Germany, headquarters in 1975. Engineers chose to use the M20 straight-six gasoline-burning engine as the starting point for the new diesel. The engines basic structure and belt-driven overhead cam setup were retained, but it featured purpose-designed valves, pistons and crankshaft and, importantly, an exhaust gas-driven turbocharger. BMW also worked with outside suppliers to design a system called Instant Start that shortened the glow time. With a displacement of 2,443 cubic centimeters, the new M21 engine was manufactured in Steyr, Austria, on a production line operated jointly by BMW and Magna-Steyr. The first regular-production diesel-powered BMW, the 524td, made its public debut at the 1983 Frankfurt Motor Show. With 115 horsepower and 154 lb-ft. of torque under the hood, it sprinted from zero to 62 mph (100 km/h) in 12.9 seconds and reached a top speed of 111 mph (180 km/h). At the time, BMW proudly called the car the fastest diesel-burning sedan in the world, though Mercedes turbocharged w123 300D was not far behind. Fuel economy was rated at an impressive 7.1 liters per 100 kilometers (33 mpg U.S., 39 mpg U.K.) in a mixed European cycle. Diesel engines rose to prominence in Europe during the early 1980s and manufacturers who didnt offer at least one oil-burning model often lost sales to competitors. In hindsight, the 524td came at exactly the right moment for BMW and it quickly became one of the most popular variants of the E28 5-Series in Germany. Lincoln burns oil Eager to keep up with rival Cadillac, who offered the Eldorado coupe and the Seville sedan with an Oldsmobile-sourced V8 diesel, Fords Lincoln division equipped its Continental sedan and Mark VII coupe with BMWs 2.4-liter diesel in 1984 but public demand was almost non-existent and the model was axed a year later after a handful of examples were built. Democratizing the diesel The 524tds popularity in Germany and abroad convinced BMW to widen its diesel offering. Launched in 1985, the 324d (E30) was powered by a naturally-aspirated variant of the 2.4-liter that churned out 82 horsepower and 113 lb-ft. of torque. It hit 60 mph from a stop in 16.1 seconds and returned 6.9 liters per 100 kilometers (34 mpg U.S., 40 mpg U.K.) in a mixed European cycle. Like in the 524td, power was sent to the rear wheels via a standard five-speed manual transmission or an optional four-speed automatic. The 324d was a hit in Europe but buyers clamored for more power so BMW quickly offered the turbodiesel 2.4 in the E30, creating the 324td. 
Conversely, the naturally-aspirated mill was installed the e28 and the 524d was popular in heavily-taxed markets like Italy, Spain and France. The original BMW diesel was replaced by a brand new unit presented at the 1991 Frankfurt Motor Show. 5 | New Jersey Democratic Sen. Cory Booker came under attack after his Wednesday night vote against allowing the importation of cheaper drugs from Canada into the United States. The amendment to the budget resolution bill would have encouraged the importing of cheaper pharmaceutical products into the U.S. to lower prescription drug prices. Those exorbitant price tags, which are set by pharmaceutical companies, are putting a financial crunch on families, according to a Consumer Reports survey. Spending on drugs is also taking a huge bite out of not only families' pocketbooks, but also government coffers which could end up coming back to bite taxpayers again. As the outrage grew online, Booker responded to questions about why he joined Republicans and a dozen Democrats in opposing the amendment sponsored by Sen. Amy Klobuchar (D-Minn.) and Sen. Bernie Sanders (I-Vt.). "Any plan to allow the importation of prescription medications should also include consumer protections that ensure foreign protections that ensure foreign drugs meet American safety standards," Booker said in a statement to Jezebel. Back in December, though, Booker voted to weaken federal safety standards that regulate whether a medication can be sold in the U.S. in the name of broader consumer access to drugs, but experimental ones, not necessarily cheaper ones. The 21st Century Cures Act, which Booker vocally supported, passed with resounding bipartisan support and was signed into law last month. The law promised government investments in cancer and Alzheimer's research, allocated funds to fight the opioid epidemic and contained a host of other measures intended to facilitate the modernization of the health care industry. Among those steps was one to roll back the notorious gauntlet of Food and Drug Administration regulations in order to expedite the arrival of experimental medicine and medical equipment to market with the idea of allowing Americans easier access to cutting-edge treatments for what ails them. Despite the overwhelming support in Congress, critics were vocal about flaws in the bill notably the roll back of FDA regulations that would benefit pharmaceutical and medical tech companies. "Big pharma has its hand out for a bunch of special giveaways and favors that are packed together in something called the 21st Century Cures bill," Sen. Elizabeth Warren said during the debate over the law. "When American voters say Congress is owned by big companies, this bill is exactly what they are talking about." Charles Krupa/AP Sen. Elizabeth Warren was a leading critic of the rollbacks on regulations on experimental drugs in the 21st Century Cures Act. "A greater threat" The 21st Century Cures Act was "terrible for drug quality," Peter Maybarduk, who directs progressive watchdog Public Citizen's access to medicines group, said in a phone call Friday. Pulitzer Prize-winning journalist Michael Hiltzik wrote in a column for the Los Angeles Times: "Remarkably, nothing in the measure would address the main problem the public sees with the drug industry excessive prices." Booker's office defended the seeming contradiction between the votes. 
There's a "big difference between adjusting FDA's requirements for medical products [and] experimental medications and a situation where you could have no FDA review of drugs at all," Jeff Giertz, a spokesperson for Booker, said in a phone call Friday. "If the amendment had some more specifics on what it would have spelled out, in terms of a review process, that would have been something he supported," Giertz said. Critics of the the 21st Century Cures Act, though, contended the inconsistencies remain rife. "Twenty-first Century Cures was a greater threat to drug safety and efficacy than the import amendment," Maybarduk said, adding of Booker's objections to the amendment: "It could be legitimate drug quality concerns. But that argument is also used as a fig-leaf when an elected rep doesn't want to break with pharma." 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeCLUTR: Deep Contrastive Learning for Unsupervised Textual Representations 2 | 3 | ![build](https://github.com/JohnGiorgi/declutr/workflows/build/badge.svg?branch=master) 4 | [![codecov](https://codecov.io/gh/JohnGiorgi/DeCLUTR/branch/master/graph/badge.svg)](https://codecov.io/gh/JohnGiorgi/DeCLUTR) 5 | [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) 6 | ![GitHub](https://img.shields.io/github/license/JohnGiorgi/DeCLUTR?color=blue) 7 | 8 | The corresponding code for our paper: [DeCLUTR: Deep Contrastive Learning for Unsupervised Textual Representations](https://aclanthology.org/2021.acl-long.72/). Results on [SentEval](https://github.com/facebookresearch/SentEval) are presented below (as averaged scores on the downstream and probing task test sets), along with existing state-of-the-art methods. 9 | 10 | | Model | Requires labelled data? | Parameters | Embed. dim. | Downstream (-SNLI) | Probing | Δ | 11 | |------------------------------------------------------------------------------------------------------------|:-----------------------:|:----------:|:-----------:|:------------------:|:---------:|:-----:| 12 | | [InferSent V2](https://github.com/facebookresearch/InferSent) | Yes | 38M | 4096 | 76.00 | 72.58 | -3.10 | 13 | | [Universal Sentence Encoder](https://tfhub.dev/google/universal-sentence-encoder-large/5) | Yes | 147M | 512 | 78.89 | 66.70 | -0.21 | 14 | | [Sentence Transformers](https://github.com/UKPLab/sentence-transformers) ("roberta-base-nli-mean-tokens") | Yes | 125M | 768 | 77.19 | 63.22 | -1.91 | 15 | | Transformer-small ([DistilRoBERTa-base](https://huggingface.co/distilroberta-base)) | No | 82M | 768 | 72.58 | 74.57 | -6.52 | 16 | | Transformer-base ([RoBERTa-base](https://huggingface.co/roberta-base)) | No | 125M | 768 | 72.70 | 74.19 | -6.40 | 17 | | DeCLUTR-small ([DistilRoBERTa-base](https://huggingface.co/distilroberta-base)) | No | 82M | 768 | 77.50 | __74.71__ | -1.60 | 18 | | DeCLUTR-base ([RoBERTa-base](https://huggingface.co/roberta-base)) | No | 125M | 768 | __79.10__ | 74.65 | -- | 19 | 20 | > Transformer-* is the same underlying architecture and pretrained weights as DeCLUTR-* _before_ continued pretraining with our contrastive objective. Transformer-* and DeCLUTR-* use mean pooling on their token-level embeddings to produce a fixed-length sentence representation. 
Downstream scores are computed without considering performance on SNLI (denoted "Downstream (-SNLI)") as InferSent, USE and Sentence Transformers all train on SNLI. Δ: difference to DeCLUTR-base downstream score. 21 | 22 | ## Table of contents 23 | 24 | - [Notebooks](#notebooks) 25 | - [Installation](#installation) 26 | - [Usage](#usage) 27 | - [Training](#training) 28 | - [Embedding](#embedding) 29 | - [Evaluating with SentEval](#evaluating-with-senteval) 30 | - [Reproducing results](#reproducing-results) 31 | - [Citing](#citing) 32 | 33 | ## Notebooks 34 | 35 | The easiest way to get started is to follow along with one of our [notebooks](notebooks): 36 | 37 | - Training your own model [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnGiorgi/DeCLUTR/blob/master/notebooks/training.ipynb) 38 | - Embedding text with a pretrained model [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnGiorgi/DeCLUTR/blob/master/notebooks/embedding.ipynb) 39 | - Evaluating a model with [SentEval](https://github.com/facebookresearch/SentEval) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnGiorgi/DeCLUTR/blob/master/notebooks/evaluating.ipynb) 40 | 41 | ## Installation 42 | 43 | This repository requires Python 3.6.1 or later. 44 | 45 | ### Setting up a virtual environment 46 | 47 | Before installing, you should create and activate a Python virtual environment. See [here](https://github.com/allenai/allennlp#installing-via-pip) for detailed instructions. 48 | 49 | ### Installing the library and dependencies 50 | 51 | If you _don't_ plan on modifying the source code, install from `git` using `pip` 52 | 53 | ```bash 54 | pip install git+https://github.com/JohnGiorgi/DeCLUTR.git 55 | ``` 56 | 57 | Otherwise, clone the repository locally and then install 58 | 59 | ```bash 60 | git clone https://github.com/JohnGiorgi/DeCLUTR.git 61 | cd DeCLUTR 62 | pip install --editable . 63 | ``` 64 | 65 | #### Gotchas 66 | 67 | - If you plan on training your own model, you should also install [PyTorch](https://pytorch.org/) with [CUDA](https://developer.nvidia.com/cuda-zone) support by following the instructions for your system [here](https://pytorch.org/get-started/locally/). 68 | 69 | ## Usage 70 | 71 | ### Preparing a dataset 72 | 73 | A dataset is simply a file containing one item of text (a document, a scientific paper, etc.) per line. For demonstration purposes, we have provided a script that will download the [WikiText-103](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/) dataset and apply our minimal preprocessing 74 | 75 | ```bash 76 | python scripts/preprocess_wikitext_103.py path/to/output/wikitext-103/train.txt --min-length 2048 77 | ``` 78 | 79 | > See [scripts/preprocess_openwebtext.py](scripts/preprocess_openwebtext.py) for a script that can be used to recreate the (much larger) dataset used in our paper. 80 | 81 | You can specify the train set path in the [configs](training_config) under `"train_data_path"`. 82 | 83 | #### Gotchas 84 | 85 | - A training dataset should contain documents with a minimum of `num_anchors * max_span_len * 2` whitespace tokens. This is required to sample spans according to our sampling procedure. 
See the [dataset reader](declutr/dataset_reader.py) and/or [our paper](https://aclanthology.org/2021.acl-long.72/) for more details on these hyperparameters. 86 | 87 | ### Training 88 | 89 | To train the model, use the [`allennlp train`](https://docs.allennlp.org/master/api/commands/train/) command with our [`declutr.jsonnet`](training_config/declutr.jsonnet) config. For example, to train DeCLUTR-small, run the following 90 | 91 | ```bash 92 | # This can be (almost) any model from https://huggingface.co/ that supports masked language modelling. 93 | TRANSFORMER_MODEL="distilroberta-base" 94 | 95 | allennlp train "training_config/declutr.jsonnet" \ 96 | --serialization-dir "output" \ 97 | --overrides "{'train_data_path': 'path/to/your/dataset/train.txt'}" \ 98 | --include-package "declutr" 99 | ``` 100 | 101 | The `--overrides` flag allows you to override any field in the config with a JSON-formatted string, but you can equivalently update the config itself if you prefer. During training, models, vocabulary, configuration, and log files will be saved to the directory provided by `--serialization-dir`. This can be changed to any directory you like. 102 | 103 | #### Gotchas 104 | 105 | - There was a small bug in the original implementation that caused gradients derived from the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during training. This has been fixed. To reproduce results from the paper, set `model.scale_fix` to `False` in your config. Note that this will have no effect if you are not using distributed training with more than 1 GPU. 106 | 107 | #### Exporting a trained model to HuggingFace Transformers 108 | 109 | We have provided a simple script to export a trained model so that it can be loaded with [Hugging Face Transformers](https://github.com/huggingface/transformers) 110 | 111 | ```bash 112 | wget -nc https://raw.githubusercontent.com/JohnGiorgi/DeCLUTR/master/scripts/save_pretrained_hf.py 113 | python save_pretrained_hf.py --archive-file "output" --save-directory "output_transformers" 114 | ``` 115 | 116 | The model, saved to `--save-directory`, can then be loaded using the Hugging Face Transformers library (see [Embedding](#hugging-face-transformers) for more details) 117 | 118 | ```python 119 | from transformers import AutoModel, AutoTokenizer 120 | 121 | tokenizer = AutoTokenizer.from_pretrained("output_transformers") 122 | model = AutoModel.from_pretrained("output_transformers") 123 | ``` 124 | 125 | > If you would like to upload your model to the Hugging Face model repository, follow the instructions [here](https://huggingface.co/transformers/model_sharing.html). 126 | 127 | #### Multi-GPU training 128 | 129 | To train on more than one GPU, provide a list of CUDA devices in your call to `allennlp train`. For example, to train with four CUDA devices with IDs `0, 1, 2, 3` 130 | 131 | ```bash 132 | --overrides "{'distributed.cuda_devices': [0, 1, 2, 3]}" 133 | ``` 134 | 135 | #### Training with mixed-precision 136 | 137 | If your GPU supports it, [mixed-precision](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) will be used automatically during training and inference. 139 | 139 | ### Embedding 140 | 141 | You can embed text with a trained model in one of four ways: 142 | 143 | 1. [Sentence Transformers](#sentencetransformers): load our pretrained models with the [SentenceTransformers](https://www.sbert.net/) library (_recommended_). 144 | 2. 
[Hugging Face Transformers](#hugging-face-transformers): load our pretrained models with the [Hugging Face Transformers](https://github.com/huggingface/transformers) library. 145 | 3. [From this repo](#from-this-repo): import and initialize an object from this repo which can be used to embed sentences/paragraphs. 146 | 4. [Bulk embed](#bulk-embed-a-file): embed all text in a given text file with a simple command-line interface. 147 | 148 | The following pre-trained models are available: 149 | 150 | - [johngiorgi/declutr-small](https://huggingface.co/johngiorgi/declutr-small) 151 | - [johngiorgi/declutr-base](https://huggingface.co/johngiorgi/declutr-base) 152 | - [johngiorgi/declutr-sci-base](https://huggingface.co/johngiorgi/declutr-sci-base) 153 | 154 | #### SentenceTransformers 155 | 156 | Our pretrained models are hosted with Hugging Face Transformers, so they can easily be loaded in SentenceTransformers. Just make sure to [install the SentenceTransformers library](https://www.sbert.net/docs/installation.html) first. Here is a simple example 157 | 158 | ```python 159 | from sentence_transformers import SentenceTransformer 160 | 161 | # Load the model 162 | model = SentenceTransformer("johngiorgi/declutr-small") 163 | 164 | # Prepare some text to embed 165 | texts = [ 166 | "A smiling costumed woman is holding an umbrella.", 167 | "A happy woman in a fairy costume holds an umbrella.", 168 | ] 169 | 170 | # Embed the text 171 | embeddings = model.encode(texts) 172 | ``` 173 | 174 | These embeddings can then be used, for example, to compute the semantic similarity between some number of sentences or paragraphs 175 | 176 | ```python 177 | from scipy.spatial.distance import cosine 178 | 179 | semantic_sim = 1 - cosine(embeddings[0], embeddings[1]) 180 | ``` 181 | 182 | #### Hugging Face Transformers 183 | 184 | Alternatively, you can use the models straight from Hugging Face Transformers. This just requires a few extra steps. Here is a simple example 185 | 186 | ```python 187 | import torch 188 | from transformers import AutoModel, AutoTokenizer 189 | 190 | # Load the model 191 | tokenizer = AutoTokenizer.from_pretrained("johngiorgi/declutr-small") 192 | model = AutoModel.from_pretrained("johngiorgi/declutr-small") 193 | 194 | # Prepare some text to embed 195 | texts = [ 196 | "A smiling costumed woman is holding an umbrella.", 197 | "A happy woman in a fairy costume holds an umbrella.", 198 | ] 199 | inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt") 200 | 201 | # Embed the text 202 | with torch.no_grad(): 203 | sequence_output = model(**inputs)[0] 204 | 205 | # Mean pool the token-level embeddings to get sentence-level embeddings 206 | embeddings = torch.sum( 207 | sequence_output * inputs["attention_mask"].unsqueeze(-1), dim=1 208 | ) / torch.clamp(torch.sum(inputs["attention_mask"], dim=1, keepdims=True), min=1e-9) 209 | ``` 210 | 211 | #### From this repo 212 | 213 | To use the model directly from this repo, import `Encoder` and pass it some text (it accepts both strings and lists of strings) 214 | 215 | ```python 216 | from declutr import Encoder 217 | 218 | # This can be a path on disk to a model you have trained yourself OR 219 | # the name of one of our pretrained models. 220 | pretrained_model_or_path = "declutr-small" 221 | 222 | encoder = Encoder(pretrained_model_or_path) 223 | embeddings = encoder([ 224 | "A smiling costumed woman is holding an umbrella.", 225 | "A happy woman in a fairy costume holds an umbrella." 
226 | ]) 227 | ``` 228 | 229 | See the list of available `PRETRAINED_MODELS` in [declutr/encoder.py](declutr/encoder.py) 230 | 231 | ```bash 232 | python -c "from declutr.encoder import PRETRAINED_MODELS ; print(list(PRETRAINED_MODELS.keys()))" 233 | ``` 234 | 235 | #### Bulk embed a file 236 | 237 | To embed all text in a given file with a trained model, run the following command 238 | 239 | ```bash 240 | allennlp predict "output" "path/to/input.txt" \ 241 | --output-file "output/embeddings.jsonl" \ 242 | --batch-size 32 \ 243 | --cuda-device 0 \ 244 | --use-dataset-reader \ 245 | --overrides "{'dataset_reader.num_anchors': null}" \ 246 | --include-package "declutr" 247 | ``` 248 | 249 | This will: 250 | 251 | 1. Load the model serialized to `"output"` with the "best" weights (i.e. the ones that achieved the lowest loss during training). 252 | 2. Use that model to embed the text in the provided input file (`"path/to/input.txt"`). 253 | 3. Save the embeddings to disk as a [JSON lines](http://jsonlines.org/) file (`"output/embeddings.jsonl"`). 254 | 255 | The text embeddings are stored in the field `"embeddings"` in `"output/embeddings.jsonl"`. 256 | 257 | ### Evaluating with SentEval 258 | 259 | [SentEval](https://github.com/facebookresearch/SentEval) is a library for evaluating the quality of sentence embeddings. We provide a script to evaluate our model against SentEval, as well as a [notebook](https://colab.research.google.com/github/JohnGiorgi/DeCLUTR/blob/master/notebooks/evaluating.ipynb) that documents the process of evaluating a trained model. Broadly, the steps are the following: 260 | 261 | First, clone the SentEval repository and download the transfer task datasets (you only need to do this once) 262 | 263 | ```bash 264 | # Clone our fork which has several bug fixes merged 265 | git clone https://github.com/JohnGiorgi/SentEval.git 266 | cd SentEval/data/downstream/ 267 | ./get_transfer_data.bash 268 | cd ../../../ 269 | ``` 270 | 271 | > See the [SentEval](https://github.com/facebookresearch/SentEval) repository for full details. 272 | 273 | Then you can run our [script](scripts/run_senteval.py) to evaluate a trained model against SentEval 274 | 275 | ```bash 276 | python scripts/run_senteval.py allennlp "SentEval" "output" \ 277 | --output-filepath "output/senteval_results.json" \ 278 | --cuda-device 0 \ 279 | --include-package "declutr" 280 | ``` 281 | 282 | The results will be saved to `"output/senteval_results.json"`. This can be changed to any path you like. 283 | 284 | > Pass the flag `--prototyping-config` to get a proxy of the results while dramatically reducing computation time. 285 | 286 | For a list of commands, run 287 | 288 | ```bash 289 | python scripts/run_senteval.py --help 290 | ``` 291 | 292 | For help with a specific command, e.g. `allennlp`, run 293 | 294 | ```bash 295 | python scripts/run_senteval.py allennlp --help 296 | ``` 297 | 298 | ### Reproducing results 299 | 300 | To reproduce results from the paper, first follow the instructions to set up SentEval in [Evaluating with SentEval](#evaluating-with-senteval). Then, run 301 | 302 | ```bash 303 | python scripts/run_senteval.py transformers "SentEval" "johngiorgi/declutr-base" \ 304 | --output-filepath "senteval_results.json" \ 305 | --cuda-device 0 \ 306 | --mean-pool 307 | ``` 308 | 309 | `"johngiorgi/declutr-base"` can be replaced with (almost) any model on the [HuggingFace model hub](https://huggingface.co/models). Evaluation takes approximately 10-12 hours on an NVIDIA Tesla V100 GPU. 
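Once the run finishes, the scores can be inspected programmatically. The snippet below is a minimal sketch (not part of the repository); it assumes the output file is a JSON object mapping SentEval task names to their metrics, so check the file written by `scripts/run_senteval.py` for the exact structure:

```python
import json

# Load the SentEval results written by scripts/run_senteval.py.
# NOTE: this assumes a JSON object mapping task names to metric dicts;
# adjust the keys below to match the actual file contents.
with open("senteval_results.json", "r") as f:
    results = json.load(f)

# Print a one-line summary per task.
for task, metrics in results.items():
    print(f"{task}: {metrics}")
```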
310 | 311 | ## Citing 312 | 313 | If you use DeCLUTR in your work, please consider citing our paper 314 | 315 | ``` 316 | @inproceedings{giorgi-etal-2021-declutr, 317 | title = "{D}e{CLUTR}: Deep Contrastive Learning for Unsupervised Textual Representations", 318 | author = "Giorgi, John and 319 | Nitski, Osvald and 320 | Wang, Bo and 321 | Bader, Gary", 322 | booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)", 323 | month = aug, 324 | year = "2021", 325 | address = "Online", 326 | publisher = "Association for Computational Linguistics", 327 | url = "https://aclanthology.org/2021.acl-long.72", 328 | doi = "10.18653/v1/2021.acl-long.72", 329 | pages = "879--895", 330 | abstract = "Sentence embeddings are an important component of many natural language processing (NLP) systems. Like word embeddings, sentence embeddings are typically learned on large text corpora and then transferred to various downstream tasks, such as clustering and retrieval. Unlike word embeddings, the highest performing solutions for learning sentence embeddings require labelled data, limiting their usefulness to languages and domains where labelled data is abundant. In this paper, we present DeCLUTR: Deep Contrastive Learning for Unsupervised Textual Representations. Inspired by recent advances in deep metric learning (DML), we carefully design a self-supervised objective for learning universal sentence embeddings that does not require labelled training data. When used to extend the pretraining of transformer-based language models, our approach closes the performance gap between unsupervised and supervised pretraining for universal sentence encoders. Importantly, our experiments suggest that the quality of the learned embeddings scale with both the number of trainable parameters and the amount of unlabelled training data. Our code and pretrained models are publicly available and can be easily adapted to new domains or used to embed unseen text.", 331 | } 332 | ``` 333 | --------------------------------------------------------------------------------