├── tests ├── __init__.py ├── common │ ├── __init__.py │ ├── test_util.py │ ├── test_model_utils.py │ └── test_contrastive_utils.py ├── test_predictor.py ├── fixtures │ ├── data │ │ ├── encoder_inputs.txt │ │ └── openwebtext │ │ │ ├── valid.txt │ │ │ └── train.txt │ ├── experiment_mlm_only.jsonnet │ ├── experiment.jsonnet │ ├── experiment_contrastive_only.jsonnet │ ├── experiment_scalar_mix.jsonnet │ ├── experiment_feedforward.jsonnet │ └── common.jsonnet ├── conftest.py ├── test_encoder.py ├── test_model.py └── test_dataset_reader.py ├── .allennlp_plugins ├── declutr ├── common │ ├── __init__.py │ ├── util.py │ ├── masked_lm_utils.py │ ├── model_utils.py │ └── contrastive_utils.py ├── modules │ ├── __init__.py │ ├── text_field_embedders │ │ ├── __init__.py │ │ └── mlm_text_field_embedder.py │ └── token_embedders │ │ ├── __init__.py │ │ └── pretrained_transformer_embedder_mlm.py ├── __init__.py ├── miners │ ├── __init__.py │ └── pytorch_metric_learning.py ├── losses │ ├── __init__.py │ └── pytorch_metric_learning.py ├── predictor.py ├── encoder.py ├── dataset_reader.py └── model.py ├── .coveragerc ├── pytest.ini ├── mypy.ini ├── pyproject.toml ├── .github ├── dependabot.yml └── workflows │ └── build.yml ├── .flake8 ├── training_config ├── transformer_mean.jsonnet ├── transformer_cls.jsonnet ├── declutr_base.jsonnet ├── declutr_small.jsonnet ├── declutr.jsonnet ├── contrastive_only.jsonnet └── mlm_only.jsonnet ├── scripts ├── save_pretrained_hf.py ├── preprocess_wikitext_103.py └── preprocess_openwebtext.py ├── setup.py ├── .gitignore ├── CONTRIBUTING.md ├── notebooks ├── evaluating.ipynb └── training.ipynb ├── LICENSE └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.allennlp_plugins: -------------------------------------------------------------------------------- 1 | declutr -------------------------------------------------------------------------------- /declutr/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /declutr/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = declutr 3 | omit = tests/* \ 4 | *\__init__.py 5 | -------------------------------------------------------------------------------- /declutr/__init__.py: -------------------------------------------------------------------------------- 1 | from declutr.model import DeCLUTR 2 | from declutr.encoder import Encoder 3 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::DeprecationWarning 4 | ignore::PendingDeprecationWarning 5 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = 
true 3 | no_site_packages = true 4 | 5 | [mypy-tests.*] 6 | strict_optional = false -------------------------------------------------------------------------------- /declutr/modules/text_field_embedders/__init__.py: -------------------------------------------------------------------------------- 1 | from declutr.modules.text_field_embedders.mlm_text_field_embedder import MLMTextFieldEmbedder 2 | -------------------------------------------------------------------------------- /declutr/miners/__init__.py: -------------------------------------------------------------------------------- 1 | from declutr.miners.pytorch_metric_learning import ( 2 | PairMarginMiner, 3 | PyTorchMetricLearningMiner, 4 | ) 5 | -------------------------------------------------------------------------------- /declutr/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from declutr.losses.pytorch_metric_learning import ( 2 | CrossBatchMemory, 3 | NTXentLoss, 4 | PyTorchMetricLearningLoss, 5 | ) 6 | -------------------------------------------------------------------------------- /declutr/modules/token_embedders/__init__.py: -------------------------------------------------------------------------------- 1 | from declutr.modules.token_embedders.pretrained_transformer_embedder_mlm import ( 2 | PretrainedTransformerEmbedderMLM, 3 | ) 4 | -------------------------------------------------------------------------------- /declutr/common/util.py: -------------------------------------------------------------------------------- 1 | def sanitize_text(text: str, lowercase: bool = False) -> str: 2 | """Cleans text by removing whitespace, newlines and tabs and (optionally) lowercasing.""" 3 | sanitized_text = " ".join(text.strip().split()) 4 | sanitized_text = sanitized_text.lower() if lowercase else sanitized_text 5 | return sanitized_text 6 | -------------------------------------------------------------------------------- /tests/test_predictor.py: -------------------------------------------------------------------------------- 1 | class TestDeCLUTRPredictor: 2 | def test_json_to_instance(self, predictor) -> None: 3 | json_dict = {"text": "They may take our lives, but they'll never take our freedom!"} 4 | output = predictor._json_to_instance(json_dict) 5 | assert "anchors" in output 6 | assert "positives" not in output 7 | -------------------------------------------------------------------------------- /tests/fixtures/data/encoder_inputs.txt: -------------------------------------------------------------------------------- 1 | "A man inspects the uniform of a figure in some East Asian country." 2 | "The man is sleeping" 3 | "A soccer game with multiple males playing." 4 | "Some men are playing a sport." 5 | "A black race car starts up in front of a crowd of people." 6 | "A man is driving down a lonely road." 7 | "A smiling costumed woman is holding an umbrella." 8 | "A happy woman in a fairy costume holds an umbrella." 
-------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 100 3 | 4 | include = '\.pyi?$' 5 | 6 | exclude = ''' 7 | ( 8 | __pycache__ 9 | | \btutorials\b 10 | | \bbuild\b 11 | | \.git 12 | | \.mypy_cache 13 | | \.pytest_cache 14 | | \.vscode 15 | | \.venv 16 | | \bdist\b 17 | | \bdoc\b 18 | ) 19 | ''' 20 | 21 | [build-system] 22 | requires = ["setuptools", "wheel"] 23 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Please see the documentation for all configuration options: 2 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 3 | 4 | version: 2 5 | updates: 6 | 7 | # Maintain dependencies for GitHub Actions 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | 13 | # Maintain dependencies for pip 14 | - package-ecosystem: "pip" 15 | directory: "/" 16 | schedule: 17 | interval: "daily" 18 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 115 3 | 4 | ignore = 5 | # these rules don't play well with black 6 | E203 # whitespace before : 7 | W503 # line break before binary operator 8 | 9 | per-file-ignores = 10 | # __init__.py files are allowed to have unused imports and lines-too-long 11 | */__init__.py:F401 12 | */**/**/__init__.py:F401,E501 13 | 14 | # tests don't have to respect 15 | # E731: do not assign a lambda expression, use a def 16 | tests/**:E731 17 | 18 | # scripts don't have to respect 19 | # E402: imports not at top of file (because we mess with sys.path) 20 | scripts/**:E402 21 | -------------------------------------------------------------------------------- /declutr/predictor.py: -------------------------------------------------------------------------------- 1 | from overrides import overrides 2 | 3 | from allennlp.common.util import JsonDict 4 | from allennlp.data import Instance 5 | from allennlp.predictors.predictor import Predictor 6 | 7 | 8 | @Predictor.register("declutr") 9 | class DeCLUTRPredictor(Predictor): 10 | """Predictor wrapper for `DeCLUTR` model. 11 | 12 | Registered as a `Predictor` with name "declutr". 
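
    A minimal usage sketch for loading a trained archive and embedding a single piece of text
    (the archive path below is illustrative; the input sentence is taken from the test fixtures):

        from allennlp.models.archival import load_archive
        from allennlp.predictors import Predictor

        archive = load_archive("path/to/model.tar.gz")
        predictor = Predictor.from_archive(archive, predictor_name="declutr")
        output = predictor.predict_json({"text": "A man is driving down a lonely road."})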
13 | """ 14 | 15 | @overrides 16 | def _json_to_instance(self, json_dict: JsonDict) -> Instance: 17 | text = json_dict["text"] 18 | # Context manager ensures that the sample_spans property of our DatasetReader is False 19 | with self._dataset_reader.no_sample(): 20 | return self._dataset_reader.text_to_instance(text=text) 21 | -------------------------------------------------------------------------------- /tests/common/test_util.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from declutr.common import util 4 | from hypothesis import given 5 | from hypothesis.strategies import booleans, text 6 | 7 | 8 | @given(text=text(), lowercase=booleans()) 9 | def test_sanitize_text(text: str, lowercase: bool) -> None: 10 | sanitized_text = util.sanitize_text(text, lowercase=lowercase) 11 | 12 | # There should be no cases of multiple spaces or tabs 13 | assert re.search(r"[ ]{2,}", sanitized_text) is None 14 | assert "\t" not in sanitized_text 15 | # The beginning and end of the string should be stripped of whitespace 16 | assert not sanitized_text.startswith(("\n", " ")) 17 | assert not sanitized_text.endswith(("\n", " ")) 18 | # Sometimes, hypothesis generates text that cannot be lowercased (like latin characters). 19 | # We don't particularly care about this, and it breaks this check. 20 | # Only run if the generated text can be lowercased. 21 | if lowercase and text.lower().islower(): 22 | assert all(not char.isupper() for char in sanitized_text) 23 | -------------------------------------------------------------------------------- /tests/fixtures/experiment_mlm_only.jsonnet: -------------------------------------------------------------------------------- 1 | local COMMON = import 'common.jsonnet'; 2 | local transformer_model = "distilroberta-base"; 3 | 4 | { 5 | "vocabulary": COMMON['vocabulary'], 6 | "dataset_reader": COMMON['dataset_reader'], 7 | "datasets_for_vocab_creation": ["train"], 8 | "train_data_path": COMMON['train_data_path'], 9 | "validation_data_path": COMMON['validation_data_path'], 10 | "model": { 11 | "type": "declutr.DeCLUTR", 12 | "text_field_embedder": { 13 | "type": "declutr.modules.text_field_embedders.mlm_text_field_embedder.MLMTextFieldEmbedder", 14 | "token_embedders": { 15 | "tokens": { 16 | "type": "declutr.modules.token_embedders.pretrained_transformer_embedder_mlm.PretrainedTransformerEmbedderMLM", 17 | "model_name": transformer_model, 18 | "masked_language_modeling": true 19 | }, 20 | }, 21 | }, 22 | "loss": null 23 | }, 24 | "data_loader": COMMON['data_loader'], 25 | "trainer": COMMON['trainer'] 26 | } -------------------------------------------------------------------------------- /tests/fixtures/experiment.jsonnet: -------------------------------------------------------------------------------- 1 | local COMMON = import 'common.jsonnet'; 2 | local transformer_model = "distilroberta-base"; 3 | 4 | { 5 | "vocabulary": COMMON['vocabulary'], 6 | "dataset_reader": COMMON['dataset_reader'], 7 | "datasets_for_vocab_creation": ["train"], 8 | "train_data_path": COMMON['train_data_path'], 9 | "validation_data_path": COMMON['validation_data_path'], 10 | "model": { 11 | "type": "declutr.DeCLUTR", 12 | "text_field_embedder": { 13 | "type": "declutr.modules.text_field_embedders.MLMTextFieldEmbedder", 14 | "token_embedders": { 15 | "tokens": { 16 | "type": "declutr.modules.token_embedders.PretrainedTransformerEmbedderMLM", 17 | "model_name": transformer_model, 18 | "masked_language_modeling": true 19 | }, 20 | }, 21 | }, 
22 | "loss": { 23 | "type": "declutr.losses.pytorch_metric_learning.NTXentLoss", 24 | "temperature": 0.05, 25 | }, 26 | }, 27 | "data_loader": COMMON['data_loader'], 28 | "trainer": COMMON['trainer'] 29 | } -------------------------------------------------------------------------------- /tests/fixtures/experiment_contrastive_only.jsonnet: -------------------------------------------------------------------------------- 1 | local COMMON = import 'common.jsonnet'; 2 | local transformer_model = "distilroberta-base"; 3 | 4 | { 5 | "vocabulary": COMMON['vocabulary'], 6 | "dataset_reader": COMMON['dataset_reader'], 7 | "datasets_for_vocab_creation": ["train"], 8 | "train_data_path": COMMON['train_data_path'], 9 | "validation_data_path": COMMON['validation_data_path'], 10 | "model": { 11 | "type": "declutr.DeCLUTR", 12 | "text_field_embedder": { 13 | "type": "declutr.modules.text_field_embedders.mlm_text_field_embedder.MLMTextFieldEmbedder", 14 | "token_embedders": { 15 | "tokens": { 16 | "type": "declutr.modules.token_embedders.pretrained_transformer_embedder_mlm.PretrainedTransformerEmbedderMLM", 17 | "model_name": transformer_model, 18 | "masked_language_modeling": false 19 | }, 20 | }, 21 | }, 22 | "loss": { 23 | "type": "declutr.losses.pytorch_metric_learning.NTXentLoss", 24 | "temperature": 0.05, 25 | }, 26 | }, 27 | "data_loader": COMMON['data_loader'], 28 | "trainer": COMMON['trainer'] 29 | } -------------------------------------------------------------------------------- /tests/fixtures/experiment_scalar_mix.jsonnet: -------------------------------------------------------------------------------- 1 | local COMMON = import 'common.jsonnet'; 2 | local transformer_model = "distilroberta-base"; 3 | 4 | { 5 | "vocabulary": COMMON['vocabulary'], 6 | "dataset_reader": COMMON['dataset_reader'], 7 | "datasets_for_vocab_creation": ["train"], 8 | "train_data_path": COMMON['train_data_path'], 9 | "validation_data_path": COMMON['validation_data_path'], 10 | "model": { 11 | "type": "declutr.DeCLUTR", 12 | "text_field_embedder": { 13 | "type": "declutr.modules.text_field_embedders.mlm_text_field_embedder.MLMTextFieldEmbedder", 14 | "token_embedders": { 15 | "tokens": { 16 | "type": "declutr.modules.token_embedders.pretrained_transformer_embedder_mlm.PretrainedTransformerEmbedderMLM", 17 | "model_name": transformer_model, 18 | "last_layer_only": false, 19 | "masked_language_modeling": true 20 | }, 21 | }, 22 | }, 23 | "loss": { 24 | "type": "declutr.losses.pytorch_metric_learning.NTXentLoss", 25 | "temperature": 0.05, 26 | }, 27 | }, 28 | "data_loader": COMMON['data_loader'], 29 | "trainer": COMMON['trainer'] 30 | } -------------------------------------------------------------------------------- /tests/fixtures/experiment_feedforward.jsonnet: -------------------------------------------------------------------------------- 1 | local COMMON = import 'common.jsonnet'; 2 | local transformer_model = "distilroberta-base"; 3 | 4 | { 5 | "vocabulary": COMMON['vocabulary'], 6 | "dataset_reader": COMMON['dataset_reader'], 7 | "datasets_for_vocab_creation": ["train"], 8 | "train_data_path": COMMON['train_data_path'], 9 | "validation_data_path": COMMON['validation_data_path'], 10 | "model": { 11 | "type": "declutr.DeCLUTR", 12 | "text_field_embedder": { 13 | "type": "declutr.modules.text_field_embedders.mlm_text_field_embedder.MLMTextFieldEmbedder", 14 | "token_embedders": { 15 | "tokens": { 16 | "type": "declutr.modules.token_embedders.pretrained_transformer_embedder_mlm.PretrainedTransformerEmbedderMLM", 17 | 
"model_name": transformer_model, 18 | "masked_language_modeling": true 19 | }, 20 | }, 21 | }, 22 | "feedforward": { 23 | "input_dim": 768, 24 | "num_layers": 1, 25 | "hidden_dims": 16, 26 | "activations": "relu", 27 | }, 28 | "loss": { 29 | "type": "declutr.losses.pytorch_metric_learning.NTXentLoss", 30 | "temperature": 0.05, 31 | }, 32 | }, 33 | "data_loader": COMMON['data_loader'], 34 | "trainer": COMMON['trainer'] 35 | } -------------------------------------------------------------------------------- /declutr/miners/pytorch_metric_learning.py: -------------------------------------------------------------------------------- 1 | from pytorch_metric_learning import miners 2 | 3 | from allennlp.common import Registrable 4 | 5 | 6 | class PyTorchMetricLearningMiner(Registrable): 7 | """This class just allows us to implement `Registrable` for PyTorch Metric Learning miner functions. 8 | Subclasses of this class should also subclass a miner function from PyTorch Metric Learning 9 | (see: https://kevinmusgrave.github.io/pytorch-metric-learning/miners/), and accept as arguments 10 | to the constructor the same arguments that the miner function does. See `MaximumLossMiner` below 11 | for an example. 12 | """ 13 | 14 | default_implementation = "pair_margin" 15 | 16 | 17 | @PyTorchMetricLearningMiner.register("pair_margin") 18 | class PairMarginMiner(PyTorchMetricLearningMiner, miners.PairMarginMiner): 19 | """Wraps the `PairMarginMiner` implementation from Pytorch Metric Learning: 20 | (https://kevinmusgrave.github.io/pytorch-metric-learning/miners/#pairmarginminer). 21 | 22 | Registered as a `PyTorchMetricLearningMiner` with name "pair_margin". 23 | """ 24 | 25 | def __init__( 26 | self, 27 | pos_margin: float, 28 | neg_margin: float, 29 | use_similarity: bool = True, 30 | squared_distances: bool = False, 31 | ) -> None: 32 | 33 | super().__init__( 34 | pos_margin=pos_margin, 35 | neg_margin=neg_margin, 36 | use_similarity=use_similarity, 37 | squared_distances=squared_distances, 38 | ) 39 | -------------------------------------------------------------------------------- /training_config/transformer_mean.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = std.extVar("TRANSFORMER_MODEL"); 4 | 5 | // This will be used to set the max/min # of tokens in the positive and negative examples. 6 | local max_length = 512; 7 | 8 | { 9 | "vocabulary": { 10 | "type": "empty" 11 | }, 12 | "dataset_reader": { 13 | "type": "declutr", 14 | "lazy": true, 15 | "tokenizer": { 16 | "type": "pretrained_transformer", 17 | "model_name": transformer_model, 18 | // Account for special tokens (e.g. CLS and SEP), otherwise a cryptic error is thrown. 
19 | "max_length": max_length - 2, 20 | }, 21 | "token_indexers": { 22 | "tokens": { 23 | "type": "pretrained_transformer", 24 | "model_name": transformer_model, 25 | }, 26 | }, 27 | }, 28 | "train_data_path": null, 29 | "model": { 30 | "type": "declutr", 31 | "text_field_embedder": { 32 | "type": "mlm", 33 | "token_embedders": { 34 | "tokens": { 35 | "type": "pretrained_transformer_mlm", 36 | "model_name": transformer_model, 37 | "masked_language_modeling": true 38 | }, 39 | }, 40 | }, 41 | }, 42 | "data_loader": { 43 | "batch_size": 16, 44 | "num_workers": 1, 45 | "drop_last": true, 46 | }, 47 | "trainer": { 48 | "type": "no_op" 49 | }, 50 | } -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import List 3 | 4 | import pytest 5 | from allennlp.common import util as common_util 6 | from allennlp.common.file_utils import cached_path 7 | from allennlp.models.archival import Archive, load_archive 8 | from allennlp.predictors import Predictor 9 | 10 | from declutr.encoder import PRETRAINED_MODELS, Encoder 11 | from declutr.predictor import DeCLUTRPredictor 12 | 13 | # Note: Most of these are scoped as "module" to prevent a warning from hypothesis 14 | # about fixtures being reset between function calls. 15 | 16 | 17 | @pytest.fixture(params=["declutr-small", "declutr-base"], scope="module") 18 | def archive(request) -> Archive: 19 | if request.param in PRETRAINED_MODELS: 20 | pretrained_model_name_or_path = PRETRAINED_MODELS[request.param] 21 | common_util.import_module_and_submodules("declutr") 22 | pretrained_model_name_or_path = cached_path(pretrained_model_name_or_path) 23 | return load_archive(pretrained_model_name_or_path) 24 | 25 | 26 | @pytest.fixture(scope="module") 27 | def predictor(archive) -> DeCLUTRPredictor: 28 | return Predictor.from_archive(archive, predictor_name="declutr") 29 | 30 | 31 | @pytest.fixture(params=["declutr-small", "declutr-base"], scope="module") 32 | def encoder(request) -> Encoder: 33 | return Encoder(request.param) 34 | 35 | 36 | @pytest.fixture(scope="module") 37 | def inputs_filepath() -> str: 38 | # Some random examples taken from https://nlp.stanford.edu/projects/snli/ 39 | return "tests/fixtures/data/encoder_inputs.txt" 40 | 41 | 42 | @pytest.fixture(scope="module") 43 | def inputs(inputs_filepath) -> List[str]: 44 | return Path(inputs_filepath).read_text().split("\n") 45 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: build 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | os: [ubuntu-latest, macos-latest] 19 | python-version: [3.6, 3.7, 3.8] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v4.1.0 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install 
--editable ".[dev]" 31 | - name: Format code with black 32 | run: | 33 | black . 34 | - name: Lint with flake8 35 | run: | 36 | # stop the build if there are Python syntax errors or undefined names 37 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 38 | # exit-zero treats all errors as warnings. 39 | flake8 . --count --exit-zero --max-complexity=10 --statistics 40 | - name: Type check with mypy 41 | run: | 42 | mypy . 43 | - name: Test with pytest 44 | run: | 45 | pytest tests --cov ./declutr --cov-report=xml --cov-config=./.coveragerc 46 | - name: Upload coverage to Codecov 47 | uses: codecov/codecov-action@v3 48 | with: 49 | file: ./coverage.xml 50 | # Ignore codecov failures as the codecov server is not 51 | # very reliable but we don't want to report a failure 52 | # in the github UI just because the coverage report failed to 53 | # be published. 54 | fail_ci_if_error: false 55 | -------------------------------------------------------------------------------- /scripts/save_pretrained_hf.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import typer 4 | from allennlp.common import util as common_util 5 | from allennlp.models.archival import load_archive 6 | from allennlp.predictors import Predictor 7 | 8 | # Emoji's used in typer.secho calls 9 | # See: https://github.com/carpedm20/emoji/blob/master/emoji/unicode_codes.py" 10 | SAVING = "\U0001F4BE" 11 | HUGGING_FACE = "\U0001F917" 12 | 13 | 14 | def main(archive_file: str, save_directory: Path) -> None: 15 | """Saves the model and tokenizer from an AllenNLP `archive_file` path pointing to a trained 16 | DeCLUTR model to a format that can be used with HuggingFace Transformers at `save_directory`.""" 17 | save_directory = Path(save_directory) 18 | save_directory.parents[0].mkdir(parents=True, exist_ok=True) 19 | 20 | common_util.import_module_and_submodules("declutr") 21 | # cuda_device -1 places the model onto the CPU before saving. This avoids issues with 22 | # distributed models. 23 | overrides = "{'trainer.cuda_device': -1}" 24 | archive = load_archive(archive_file, overrides=overrides) 25 | predictor = Predictor.from_archive(archive, predictor_name="declutr") 26 | 27 | token_embedder = predictor._model._text_field_embedder._token_embedders["tokens"] 28 | model = token_embedder.transformer_model 29 | tokenizer = token_embedder.tokenizer 30 | 31 | # Casting as a string to avoid this error: https://github.com/huggingface/transformers/pull/4650 32 | # Can be removed after PR is merged and Transformers is updated. 33 | model.save_pretrained(str(save_directory)) 34 | tokenizer.save_pretrained(str(save_directory)) 35 | 36 | typer.secho( 37 | ( 38 | f"{SAVING} {HUGGING_FACE} Transformers compatible model saved to: {save_directory}." 39 | " See https://huggingface.co/transformers/model_sharing.html for instructions on" 40 | f" hosting the model with {HUGGING_FACE} Transformers." 
41 | ), 42 | bold=True, 43 | ) 44 | 45 | 46 | if __name__ == "__main__": 47 | typer.run(main) 48 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="declutr", 8 | version="0.1.0rc1", 9 | author="John Giorgi", 10 | author_email="johnmgiorgi@gmail.com", 11 | description=("DeCLUTR: Deep Contrastive Learning for Unsupervised Textual Representations"), 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/JohnGiorgi/DeCLUTR", 15 | packages=setuptools.find_packages(), 16 | keywords=[ 17 | "universal sentence embeddings", 18 | "contrastive learning", 19 | "natural language processing", 20 | "allennlp", 21 | "pytorch", 22 | "transformers", 23 | "representation learning", 24 | "deep metric learning", 25 | "sentence embeddings", 26 | "sentence similarity", 27 | "semantic similarity", 28 | ], 29 | classifiers=[ 30 | "Development Status :: 1 - Planning", 31 | "Environment :: Console", 32 | "Intended Audience :: Science/Research", 33 | "License :: OSI Approved :: Apache Software License", 34 | "Operating System :: OS Independent", 35 | "Programming Language :: Python :: 3", 36 | "Programming Language :: Python :: 3.6", 37 | "Programming Language :: Python :: 3.7", 38 | "Programming Language :: Python :: 3.8", 39 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 40 | "Typing :: Typed", 41 | ], 42 | python_requires=">=3.6.1", 43 | install_requires=[ 44 | "allennlp>=1.1.0, <1.2.0", 45 | "pytorch-metric-learning>=0.9.98", 46 | "typer>=0.3.2", 47 | "validators>=0.18.2", 48 | ], 49 | extras_require={ 50 | "dev": [ 51 | "black", 52 | "coverage", 53 | "codecov", 54 | "flake8", 55 | "hypothesis", 56 | "pytest", 57 | "pytest-cov", 58 | "mypy", 59 | ] 60 | }, 61 | ) 62 | -------------------------------------------------------------------------------- /tests/fixtures/common.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = "distilroberta-base"; 4 | // This will be used to set the max/min # of tokens in the positive and negative examples. 
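// Note: these values are much smaller than those in training_config/ (512 and 32) so that the
// test fixtures run quickly.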
5 | local max_length = 16; 6 | local min_length = 8; 7 | 8 | { 9 | "vocabulary": { 10 | "type": "empty" 11 | }, 12 | "dataset_reader": { 13 | "type": "declutr.dataset_reader.DeCLUTRDatasetReader", 14 | "lazy": true, 15 | "num_anchors": 2, 16 | "num_positives": 2, 17 | "max_span_len": max_length, 18 | "min_span_len": min_length, 19 | "tokenizer": { 20 | "type": "pretrained_transformer", 21 | "model_name": transformer_model, 22 | "max_length": max_length, 23 | }, 24 | "token_indexers": { 25 | "tokens": { 26 | "type": "pretrained_transformer", 27 | "model_name": transformer_model, 28 | }, 29 | }, 30 | }, 31 | "train_data_path": "tests/fixtures/data/openwebtext/train.txt", 32 | "validation_data_path": "tests/fixtures/data/openwebtext/valid.txt", 33 | "model": { 34 | "type": "declutr.DeCLUTR", 35 | }, 36 | "data_loader": { 37 | "batch_size": 4, 38 | "num_workers": 1, 39 | "drop_last": true 40 | }, 41 | "trainer": { 42 | "optimizer": { 43 | "type": "huggingface_adamw", 44 | "lr": 5e-5, 45 | "weight_decay": 0.1, 46 | "parameter_groups": [ 47 | // Apply weight decay to pre-trained params, excluding LayerNorm params and biases 48 | [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}], 49 | ], 50 | }, 51 | "num_epochs": 1, 52 | "checkpointer": { 53 | "num_serialized_models_to_keep": -1, 54 | }, 55 | "grad_norm": 1.0, 56 | "learning_rate_scheduler": { 57 | "type": "slanted_triangular", 58 | }, 59 | }, 60 | } -------------------------------------------------------------------------------- /training_config/transformer_cls.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = std.extVar("TRANSFORMER_MODEL"); 4 | // The hidden size of the model, which can be found in its config as "hidden_size". 5 | local transformer_dim = std.parseInt(std.extVar("TRANSFORMER_DIM")); 6 | 7 | // This will be used to set the max/min # of tokens in the positive and negative examples. 8 | local max_length = 512; 9 | // Certain transformers use the last special token in the sequence to produce sequence embeddings 10 | // (e.g XLNet). 11 | local cls_is_last_token = false; 12 | 13 | { 14 | "vocabulary": { 15 | "type": "empty" 16 | }, 17 | "dataset_reader": { 18 | "type": "declutr", 19 | "lazy": true, 20 | "tokenizer": { 21 | "type": "pretrained_transformer", 22 | "model_name": transformer_model, 23 | // Account for special tokens (e.g. CLS and SEP), otherwise a cryptic error is thrown. 
24 | "max_length": max_length - 2, 25 | }, 26 | "token_indexers": { 27 | "tokens": { 28 | "type": "pretrained_transformer", 29 | "model_name": transformer_model, 30 | }, 31 | }, 32 | }, 33 | "train_data_path": null, 34 | "model": { 35 | "type": "declutr", 36 | "text_field_embedder": { 37 | "type": "mlm", 38 | "token_embedders": { 39 | "tokens": { 40 | "type": "pretrained_transformer_mlm", 41 | "model_name": transformer_model, 42 | "masked_language_modeling": true 43 | }, 44 | }, 45 | }, 46 | "seq2vec_encoder": { 47 | "type": "cls_pooler", 48 | "embedding_dim": transformer_dim, 49 | "cls_is_last_token": cls_is_last_token 50 | }, 51 | }, 52 | "data_loader": { 53 | "batch_size": 16, 54 | "num_workers": 1, 55 | "drop_last": true, 56 | }, 57 | "trainer": { 58 | "type": "no_op" 59 | }, 60 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # vscode 107 | .vscode/ 108 | 109 | # MacOS stuff: 110 | # General 111 | .DS_Store 112 | .AppleDouble 113 | .LSOverride 114 | # Icon must end with two \r 115 | Icon 116 | # Thumbnails 117 | ._* 118 | # Files that might appear in the root of a volume 119 | .DocumentRevisions-V100 120 | .fseventsd 121 | .Spotlight-V100 122 | .TemporaryItems 123 | .Trashes 124 | .VolumeIcon.icns 125 | .com.apple.timemachine.donotpresent 126 | # Directories potentially created on remote AFP share 127 | .AppleDB 128 | .AppleDesktop 129 | Network Trash Folder 130 | Temporary Items 131 | .apdisk 132 | 133 | # Added by us 134 | datasets 135 | pretrained_models 136 | SentEval -------------------------------------------------------------------------------- /tests/test_encoder.py: -------------------------------------------------------------------------------- 1 | from pathlib import 
Path 2 | from typing import List 3 | 4 | import pytest 5 | import torch 6 | from declutr import Encoder 7 | from hypothesis import given, settings 8 | from hypothesis.strategies import booleans 9 | from torch.nn import CosineSimilarity 10 | 11 | 12 | class TestEncoder: 13 | cosine = CosineSimilarity(dim=-1) 14 | 15 | # The base model will take longer than the small model, which triggers a test timing error. 16 | # Turn off deadlines to avoid this. 17 | @settings(deadline=None) 18 | @given(sphereize=booleans()) 19 | def test_encoder( 20 | self, inputs: List[str], inputs_filepath: Path, encoder: Encoder, sphereize: bool 21 | ) -> None: 22 | # The relative ranking should not change if sphereize is True/False, so run tests with both. 23 | encoder._sphereize = sphereize 24 | 25 | # Run three distinct tests, which should cover all use cases of Encoder: 26 | # 1. A List[str] input where batch_size is not None. 27 | embeddings = encoder(inputs, batch_size=len(inputs)) 28 | embeddings = torch.from_numpy(embeddings) 29 | # These are hard-coded examples that should have the highest cosine similarity. 30 | assert torch.topk(self.cosine(embeddings[2], embeddings), k=2)[-1][-1].item() == 3 31 | assert torch.topk(self.cosine(embeddings[6], embeddings), k=2)[-1][-1].item() == 7 32 | 33 | # 2. A str input where batch_size is None. Check that the expected UserWarning is raised. 34 | embeddings = [] 35 | for text in inputs: 36 | if sphereize: 37 | with pytest.warns(UserWarning): 38 | embeddings.append(encoder(text, batch_size=None)) 39 | else: 40 | embeddings.append(encoder(text, batch_size=None)) 41 | embeddings = torch.as_tensor(embeddings).squeeze(1) 42 | assert torch.topk(self.cosine(embeddings[2], embeddings), k=2)[-1][-1].item() == 3 43 | assert torch.topk(self.cosine(embeddings[6], embeddings), k=2)[-1][-1].item() == 7 44 | 45 | # 3. A filepath input that points to file with one example per line. 46 | embeddings = encoder(inputs_filepath, batch_size=len(inputs)) 47 | embeddings = torch.from_numpy(embeddings) 48 | assert torch.topk(self.cosine(embeddings[2], embeddings), k=2)[-1][-1].item() == 3 49 | assert torch.topk(self.cosine(embeddings[6], embeddings), k=2)[-1][-1].item() == 7 50 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | To submit a pull request, please do the following: 4 | 5 | 1. Fork the [repository](https://github.com/JohnGiorgi/DeCLUTR) by clicking on the 'Fork' button on the repository's page. This creates a copy of the code under your GitHub user account. 6 | 7 | 2. Clone your fork to your local disk, and add the base repository as a remote: 8 | 9 | ```bash 10 | $ git clone git@github.com:/DeCLUTR.git 11 | $ cd DeCLUTR 12 | $ git remote add upstream https://github.com/JohnGiorgi/DeCLUTR.git 13 | ``` 14 | 15 | 3. Create a new branch to hold your development changes: 16 | 17 | ```bash 18 | $ git checkout -b a-descriptive-name-for-my-changes 19 | ``` 20 | 21 | __do not__ work on the `master` branch. 22 | 23 | 4. Set up a development environment by running the following command in a virtual environment: 24 | 25 | ```bash 26 | $ pip install -e ".[dev]" 27 | ``` 28 | 29 | (If the repository was already installed in the virtual environment, remove it with `pip uninstall` before reinstalling it in editable mode with the `-e` flag.) 30 | 31 | 5. Develop the features on your branch. 
32 | 33 | This repository relies on `black` to format its source code 34 | consistently. After you make changes, format them with: 35 | 36 | ```bash 37 | $ black declutr 38 | ``` 39 | 40 | This repository also uses `flake8` to check for coding mistakes. To run the checks locally: 41 | 42 | ```bash 43 | $ flake8 declutr 44 | ``` 45 | 46 | Once you're happy with your changes, add changed files using `git add` and 47 | make a commit with `git commit` to record your changes locally: 48 | 49 | ```bash 50 | $ git add modified_file.py 51 | $ git commit 52 | ``` 53 | 54 | Please write [good commit messages](https://chris.beams.io/posts/git-commit/). 55 | 56 | It is a good idea to sync your copy of the code with the original 57 | repository regularly. This way you can quickly account for changes: 58 | 59 | ```bash 60 | $ git fetch upstream 61 | $ git rebase upstream/master 62 | ``` 63 | 64 | Push the changes to your account using: 65 | 66 | ```bash 67 | $ git push -u origin a-descriptive-name-for-my-changes 68 | ``` 69 | 70 | 6. Once you are satisfied, go to the webpage of your fork on GitHub. 71 | Click on 'Pull request' to send your changes to the project maintainers for review. 72 | 73 | > This is a work in progress. Inspiration for these guidelines were drawn from [here](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md) and [here](https://github.com/nayafia/contributing-template). -------------------------------------------------------------------------------- /declutr/common/masked_lm_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torch 4 | from transformers import PreTrainedTokenizer 5 | 6 | from allennlp.data import TextFieldTensors 7 | 8 | 9 | def _mask_tokens( 10 | inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, mlm_probability: float = 0.15 11 | ) -> Tuple[torch.Tensor, torch.Tensor]: 12 | """Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% 13 | original. Copied from: 14 | https://github.com/huggingface/transformers/blob/master/examples/run_language_modeling.py""" 15 | 16 | if tokenizer.mask_token is None: 17 | raise ValueError( 18 | ( 19 | "This tokenizer does not have a mask token which is necessary for masked language" 20 | " modeling. Remove the --mlm flag if you want to use this tokenizer." 
21 | ) 22 | ) 23 | 24 | labels = inputs.clone() 25 | # We sample a few tokens in each sequence for masked-LM training (with probability 26 | # mlm_probability defaults to 0.15 in Bert/RoBERTa) 27 | probability_matrix = torch.full(labels.shape, mlm_probability) 28 | special_tokens_mask = [ 29 | tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) 30 | for val in labels.tolist() 31 | ] 32 | probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) 33 | if tokenizer._pad_token is not None: 34 | padding_mask = labels.eq(tokenizer.pad_token_id) 35 | probability_matrix.masked_fill_(padding_mask, value=0.0) 36 | masked_indices = torch.bernoulli(probability_matrix).bool() 37 | labels[~masked_indices] = -100 # We only compute loss on masked tokens 38 | 39 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) 40 | indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices 41 | inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token) 42 | 43 | # 10% of the time, we replace masked input tokens with random word 44 | indices_random = ( 45 | torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced 46 | ) 47 | random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long) 48 | inputs[indices_random] = random_words[indices_random] 49 | 50 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged 51 | return inputs, labels 52 | 53 | 54 | def mask_tokens( 55 | tokens: TextFieldTensors, 56 | tokenizer: PreTrainedTokenizer, 57 | mlm_probability: float = 0.15, 58 | ) -> TextFieldTensors: 59 | device = tokens["tokens"]["token_ids"].device 60 | inputs, labels = _mask_tokens( 61 | inputs=tokens["tokens"]["token_ids"].to("cpu"), 62 | tokenizer=tokenizer, 63 | mlm_probability=mlm_probability, 64 | ) 65 | tokens["tokens"]["token_ids"] = inputs.to(device) 66 | tokens["tokens"]["masked_lm_labels"] = labels.to(device) 67 | return tokens 68 | -------------------------------------------------------------------------------- /declutr/common/model_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torch 4 | import torch.distributed as dist 5 | from allennlp.common import util 6 | from allennlp.data import TextFieldTensors 7 | 8 | 9 | def unpack_batch(tokens: TextFieldTensors) -> TextFieldTensors: 10 | """If the tensors of `tokens` are three-dimensional, we reshape them to be two-dimensional 11 | before returning the `TextFieldTensors` object. Otherwise, this is a no-op. 12 | 13 | # Parameters 14 | 15 | tokens : `TextFieldTensors` 16 | A `TextFieldTensors` object containnig the tensors to (possibly) reshape. 17 | 18 | # Returns 19 | 20 | `TextFieldTensors` 21 | Containing the (possibly) reshaped tensors. 22 | """ 23 | for name, tensor in tokens["tokens"].items(): 24 | if len(tensor.size()) == 3: 25 | tokens["tokens"][name] = tensor.reshape(tensor.size(0) * tensor.size(1), tensor.size(2)) 26 | return tokens 27 | 28 | 29 | def all_gather_anchor_positive_pairs( 30 | anchors: torch.Tensor, positives: torch.Tensor 31 | ) -> Tuple[torch.Tensor, torch.Tensor]: 32 | """If training on 2 or more GPUs, `all_gather`s the embeddings produced on each replica, 33 | ensuring that the gradients for the embeddings produced on each replica are not lost. The 34 | returned anchor, positive pairs can be fed to a contrastive loss. 
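    For example, with a per-replica batch size of 16 on 2 GPUs (an effective batch size of 32),
    gathering yields 32 anchor-positive pairs, so each anchor is contrasted against
    2 * (32 - 1) = 62 negatives rather than 2 * (16 - 1) = 30.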
This method is necessary to 35 | ensure that we train against the expected number of negatives 2 * (batch size - 1) per batch, 36 | as a naive implementation would end up training against 2 * (batch size / n_gpus - 1) number of 37 | negatives. If we are not training on 2 or more GPUs, this method is a no-op and returns its 38 | inputs. 39 | 40 | # Parameters 41 | 42 | anchors : torch.Tensor 43 | Embedded text representing the anchors. 44 | positives : TextFieldTensors 45 | Embedded text representing the positives. 46 | 47 | # Returns 48 | 49 | Tuple[torch.Tensor, torch.Tensor] 50 | Embedded anchor, positive pairs that can be fed to a contrastive loss. 51 | """ 52 | 53 | # If we are not using distributed training, this is a no-op. 54 | if not util.is_distributed(): 55 | return anchors, positives 56 | 57 | # Gather the encoded anchors and positives on all replicas 58 | anchors_list = [torch.zeros_like(anchors) for _ in range(dist.get_world_size())] 59 | positives_list = [torch.zeros_like(positives) for _ in range(dist.get_world_size())] 60 | dist.all_gather(anchors_list, anchors.contiguous()) 61 | dist.all_gather(positives_list, positives.contiguous()) 62 | # The gathered copy of the current replicas positive pairs have no gradients, so we overwrite 63 | # them with the positive pairs generated on this replica, which DO have gradients. 64 | anchors_list[dist.get_rank()] = anchors 65 | positives_list[dist.get_rank()] = positives 66 | # Finally, we concatenate the positive pairs so they can be fed to the contrastive loss. 67 | anchors = torch.cat(anchors_list) 68 | positives = torch.cat(positives_list) 69 | 70 | return anchors, positives 71 | -------------------------------------------------------------------------------- /training_config/declutr_base.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = "roberta-base"; 4 | 5 | // This will be used to set the max/min # of tokens in the positive and negative examples. 6 | local max_length = 512; 7 | local min_length = 32; 8 | 9 | { 10 | "vocabulary": { 11 | "type": "empty" 12 | }, 13 | "dataset_reader": { 14 | "type": "declutr", 15 | "lazy": true, 16 | "num_anchors": 2, 17 | "num_positives": 2, 18 | "max_span_len": max_length, 19 | "min_span_len": min_length, 20 | "tokenizer": { 21 | "type": "pretrained_transformer", 22 | "model_name": transformer_model, 23 | // Account for special tokens (e.g. CLS and SEP), otherwise a cryptic error is thrown. 24 | "max_length": max_length - 2, 25 | }, 26 | "token_indexers": { 27 | "tokens": { 28 | "type": "pretrained_transformer", 29 | "model_name": transformer_model, 30 | }, 31 | }, 32 | }, 33 | "train_data_path": null, 34 | "model": { 35 | "type": "declutr", 36 | "text_field_embedder": { 37 | "type": "mlm", 38 | "token_embedders": { 39 | "tokens": { 40 | "type": "pretrained_transformer_mlm", 41 | "model_name": transformer_model, 42 | "masked_language_modeling": true 43 | }, 44 | }, 45 | }, 46 | "loss": { 47 | "type": "nt_xent", 48 | "temperature": 0.05, 49 | }, 50 | // There was a small bug in the original implementation that caused gradients derived from 51 | // the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during 52 | // training. This has been fixed. To reproduce results from the paper, set this to false. 
53 | // Note that this will have no effect if you are not using distributed training with more 54 | // than 1 GPU. 55 | "scale_fix": false 56 | }, 57 | "data_loader": { 58 | "batch_size": 4, 59 | "num_workers": 1, 60 | "drop_last": true, 61 | }, 62 | "trainer": { 63 | // Set use_amp to true to use automatic mixed-precision during training (if your GPU supports it) 64 | "use_amp": true, 65 | "optimizer": { 66 | "type": "huggingface_adamw", 67 | "lr": 5e-5, 68 | "eps": 1e-06, 69 | "correct_bias": false, 70 | "weight_decay": 0.1, 71 | "parameter_groups": [ 72 | // Apply weight decay to pre-trained params, excluding LayerNorm params and biases 73 | [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}], 74 | ], 75 | }, 76 | "num_epochs": 1, 77 | "checkpointer": { 78 | // A value of null or -1 will save the weights of the model at the end of every epoch 79 | "num_serialized_models_to_keep": -1, 80 | }, 81 | "grad_norm": 1.0, 82 | "learning_rate_scheduler": { 83 | "type": "slanted_triangular", 84 | }, 85 | }, 86 | } -------------------------------------------------------------------------------- /training_config/declutr_small.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = "distilroberta-base"; 4 | 5 | // This will be used to set the max/min # of tokens in the positive and negative examples. 6 | local max_length = 512; 7 | local min_length = 32; 8 | 9 | { 10 | "vocabulary": { 11 | "type": "empty" 12 | }, 13 | "dataset_reader": { 14 | "type": "declutr", 15 | "lazy": true, 16 | "num_anchors": 2, 17 | "num_positives": 2, 18 | "max_span_len": max_length, 19 | "min_span_len": min_length, 20 | "tokenizer": { 21 | "type": "pretrained_transformer", 22 | "model_name": transformer_model, 23 | // Account for special tokens (e.g. CLS and SEP), otherwise a cryptic error is thrown. 24 | "max_length": max_length - 2, 25 | }, 26 | "token_indexers": { 27 | "tokens": { 28 | "type": "pretrained_transformer", 29 | "model_name": transformer_model, 30 | }, 31 | }, 32 | }, 33 | "train_data_path": null, 34 | "model": { 35 | "type": "declutr", 36 | "text_field_embedder": { 37 | "type": "mlm", 38 | "token_embedders": { 39 | "tokens": { 40 | "type": "pretrained_transformer_mlm", 41 | "model_name": transformer_model, 42 | "masked_language_modeling": true 43 | }, 44 | }, 45 | }, 46 | "loss": { 47 | "type": "nt_xent", 48 | "temperature": 0.05, 49 | }, 50 | // There was a small bug in the original implementation that caused gradients derived from 51 | // the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during 52 | // training. This has been fixed. To reproduce results from the paper, set this to false. 53 | // Note that this will have no effect if you are not using distributed training with more 54 | // than 1 GPU. 
55 | "scale_fix": false 56 | }, 57 | "data_loader": { 58 | "batch_size": 4, 59 | "num_workers": 1, 60 | "drop_last": true, 61 | }, 62 | "trainer": { 63 | // Set use_amp to true to use automatic mixed-precision during training (if your GPU supports it) 64 | "use_amp": true, 65 | "optimizer": { 66 | "type": "huggingface_adamw", 67 | "lr": 5e-5, 68 | "eps": 1e-06, 69 | "correct_bias": false, 70 | "weight_decay": 0.1, 71 | "parameter_groups": [ 72 | // Apply weight decay to pre-trained params, excluding LayerNorm params and biases 73 | [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}], 74 | ], 75 | }, 76 | "num_epochs": 1, 77 | "checkpointer": { 78 | // A value of null or -1 will save the weights of the model at the end of every epoch 79 | "num_serialized_models_to_keep": -1, 80 | }, 81 | "grad_norm": 1.0, 82 | "learning_rate_scheduler": { 83 | "type": "slanted_triangular", 84 | }, 85 | }, 86 | } -------------------------------------------------------------------------------- /training_config/declutr.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = std.extVar("TRANSFORMER_MODEL"); 4 | 5 | // This will be used to set the max/min # of tokens in the positive and negative examples. 6 | local max_length = 512; 7 | local min_length = 32; 8 | 9 | { 10 | "vocabulary": { 11 | "type": "empty" 12 | }, 13 | "dataset_reader": { 14 | "type": "declutr", 15 | "lazy": true, 16 | "num_anchors": 2, 17 | "num_positives": 2, 18 | "max_span_len": max_length, 19 | "min_span_len": min_length, 20 | "tokenizer": { 21 | "type": "pretrained_transformer", 22 | "model_name": transformer_model, 23 | // Account for special tokens (e.g. CLS and SEP), otherwise a cryptic error is thrown. 24 | "max_length": max_length - 2, 25 | }, 26 | "token_indexers": { 27 | "tokens": { 28 | "type": "pretrained_transformer", 29 | "model_name": transformer_model, 30 | }, 31 | }, 32 | }, 33 | "train_data_path": null, 34 | "model": { 35 | "type": "declutr", 36 | "text_field_embedder": { 37 | "type": "mlm", 38 | "token_embedders": { 39 | "tokens": { 40 | "type": "pretrained_transformer_mlm", 41 | "model_name": transformer_model, 42 | "masked_language_modeling": true 43 | }, 44 | }, 45 | }, 46 | "loss": { 47 | "type": "nt_xent", 48 | "temperature": 0.05, 49 | }, 50 | // There was a small bug in the original implementation that caused gradients derived from 51 | // the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during 52 | // training. This has been fixed. To reproduce results from the paper, set this to false. 53 | // Note that this will have no effect if you are not using distributed training with more 54 | // than 1 GPU. 
55 | "scale_fix": false 56 | }, 57 | "data_loader": { 58 | "batch_size": 4, 59 | "num_workers": 1, 60 | "drop_last": true, 61 | }, 62 | "trainer": { 63 | // Set use_amp to true to use automatic mixed-precision during training (if your GPU supports it) 64 | "use_amp": true, 65 | "optimizer": { 66 | "type": "huggingface_adamw", 67 | "lr": 5e-5, 68 | "eps": 1e-06, 69 | "correct_bias": false, 70 | "weight_decay": 0.1, 71 | "parameter_groups": [ 72 | // Apply weight decay to pre-trained params, excluding LayerNorm params and biases 73 | [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}], 74 | ], 75 | }, 76 | "num_epochs": 1, 77 | "checkpointer": { 78 | // A value of null or -1 will save the weights of the model at the end of every epoch 79 | "num_serialized_models_to_keep": -1, 80 | }, 81 | "grad_norm": 1.0, 82 | "learning_rate_scheduler": { 83 | "type": "slanted_triangular", 84 | }, 85 | }, 86 | } -------------------------------------------------------------------------------- /training_config/contrastive_only.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = std.extVar("TRANSFORMER_MODEL"); 4 | 5 | // This will be used to set the max/min # of tokens in the positive and negative examples. 6 | local max_length = 512; 7 | local min_length = 32; 8 | 9 | { 10 | "vocabulary": { 11 | "type": "empty" 12 | }, 13 | "dataset_reader": { 14 | "type": "declutr", 15 | "lazy": true, 16 | "num_anchors": 2, 17 | "num_positives": 2, 18 | "max_span_len": max_length, 19 | "min_span_len": min_length, 20 | "tokenizer": { 21 | "type": "pretrained_transformer", 22 | "model_name": transformer_model, 23 | // Account for special tokens (e.g. CLS and SEP), otherwise a cryptic error is thrown. 24 | "max_length": max_length - 2, 25 | }, 26 | "token_indexers": { 27 | "tokens": { 28 | "type": "pretrained_transformer", 29 | "model_name": transformer_model, 30 | }, 31 | }, 32 | }, 33 | "train_data_path": null, 34 | "model": { 35 | "type": "declutr", 36 | "text_field_embedder": { 37 | "type": "mlm", 38 | "token_embedders": { 39 | "tokens": { 40 | "type": "pretrained_transformer_mlm", 41 | "model_name": transformer_model, 42 | "masked_language_modeling": false 43 | }, 44 | }, 45 | }, 46 | "loss": { 47 | "type": "nt_xent", 48 | "temperature": 0.05, 49 | }, 50 | // There was a small bug in the original implementation that caused gradients derived from 51 | // the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during 52 | // training. This has been fixed. To reproduce results from the paper, set this to false. 53 | // Note that this will have no effect if you are not using distributed training with more 54 | // than 1 GPU. 
55 | "scale_fix": false 56 | }, 57 | "data_loader": { 58 | "batch_size": 4, 59 | "num_workers": 1, 60 | "drop_last": true, 61 | }, 62 | "trainer": { 63 | // Set use_amp to true to use automatic mixed-precision during training (if your GPU supports it) 64 | "use_amp": true, 65 | "optimizer": { 66 | "type": "huggingface_adamw", 67 | "lr": 5e-5, 68 | "eps": 1e-06, 69 | "correct_bias": false, 70 | "weight_decay": 0.1, 71 | "parameter_groups": [ 72 | // Apply weight decay to pre-trained params, excluding LayerNorm params and biases 73 | [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}], 74 | ], 75 | }, 76 | "num_epochs": 1, 77 | "checkpointer": { 78 | // A value of null or -1 will save the weights of the model at the end of every epoch 79 | "num_serialized_models_to_keep": -1, 80 | }, 81 | "grad_norm": 1.0, 82 | "learning_rate_scheduler": { 83 | "type": "slanted_triangular", 84 | }, 85 | }, 86 | } -------------------------------------------------------------------------------- /training_config/mlm_only.jsonnet: -------------------------------------------------------------------------------- 1 | // This should be a registered name in the Transformers library (see https://huggingface.co/models) 2 | // OR a path on disk to a serialized transformer model. 3 | local transformer_model = std.extVar("TRANSFORMER_MODEL"); 4 | 5 | // This will be used to set the max/min # of tokens in the positive and negative examples. 6 | local max_length = 512; 7 | local min_length = 32; 8 | 9 | { 10 | "vocabulary": { 11 | "type": "empty" 12 | }, 13 | "dataset_reader": { 14 | "type": "declutr", 15 | "lazy": true, 16 | // Technically, we don't need to sample anchors or positives when training with MLM only. 17 | // However, to make this experiment as comparable as possible to the "Contrastive only" 18 | // and "Both" experiments, we sample the same number of anchors and MLM on all of them. 19 | "num_anchors": 2, 20 | "num_positives": 1, 21 | "max_span_len": max_length, 22 | "min_span_len": min_length, 23 | "tokenizer": { 24 | "type": "pretrained_transformer", 25 | "model_name": transformer_model, 26 | // Account for special tokens (e.g. CLS and SEP), otherwise a cryptic error is thrown. 27 | "max_length": max_length - 2, 28 | }, 29 | "token_indexers": { 30 | "tokens": { 31 | "type": "pretrained_transformer", 32 | "model_name": transformer_model, 33 | }, 34 | }, 35 | }, 36 | "train_data_path": null, 37 | "model": { 38 | "type": "declutr", 39 | "text_field_embedder": { 40 | "type": "mlm", 41 | "token_embedders": { 42 | "tokens": { 43 | "type": "pretrained_transformer_mlm", 44 | "model_name": transformer_model, 45 | "masked_language_modeling": true 46 | }, 47 | }, 48 | }, 49 | // There was a small bug in the original implementation that caused gradients derived from 50 | // the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during 51 | // training. This has been fixed. To reproduce results from the paper, set this to false. 52 | // Note that this will have no effect if you are not using distributed training with more 53 | // than 1 GPU. 
54 | "scale_fix": false 55 | }, 56 | "data_loader": { 57 | "batch_size": 4, 58 | "num_workers": 1, 59 | "drop_last": true, 60 | }, 61 | "trainer": { 62 | // Set use_amp to true to use automatic mixed-precision during training (if your GPU supports it) 63 | "use_amp": true, 64 | "optimizer": { 65 | "type": "huggingface_adamw", 66 | "lr": 5e-5, 67 | "eps": 1e-06, 68 | "correct_bias": false, 69 | "weight_decay": 0.1, 70 | "parameter_groups": [ 71 | // Apply weight decay to pre-trained params, excluding LayerNorm params and biases 72 | [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}], 73 | ], 74 | }, 75 | "num_epochs": 1, 76 | "checkpointer": { 77 | // A value of null or -1 will save the weights of the model at the end of every epoch 78 | "num_serialized_models_to_keep": -1, 79 | }, 80 | "grad_norm": 1.0, 81 | "learning_rate_scheduler": { 82 | "type": "slanted_triangular", 83 | }, 84 | }, 85 | } -------------------------------------------------------------------------------- /tests/common/test_model_utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import torch 4 | from allennlp.data import TextFieldTensors 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import integers 7 | 8 | from declutr.common import model_utils 9 | 10 | 11 | class TestModelUtils: 12 | @settings(deadline=None) 13 | @given( 14 | batch_size=integers(min_value=1, max_value=4), 15 | num_anchors=integers(min_value=1, max_value=4), 16 | max_length=integers(min_value=1, max_value=16), 17 | ) 18 | def test_unpack_batch(self, batch_size: int, num_anchors: int, max_length: int) -> None: 19 | # Create some dummy data. 20 | two_dim_tensor = torch.randn(batch_size, max_length) 21 | two_dim_input: TextFieldTensors = { 22 | "tokens": { 23 | "token_ids": two_dim_tensor, 24 | "mask": torch.ones_like(two_dim_tensor), 25 | "type_ids": torch.ones_like(two_dim_tensor), 26 | } 27 | } 28 | three_dim_tensor = torch.randn(batch_size, num_anchors, max_length) 29 | three_dim_input: TextFieldTensors = { 30 | "tokens": { 31 | "token_ids": three_dim_tensor, 32 | "mask": torch.ones_like(three_dim_tensor), 33 | "type_ids": torch.ones_like(three_dim_tensor), 34 | } 35 | } 36 | four_dim_tensor = torch.randn(batch_size, num_anchors, num_anchors, max_length) 37 | four_dim_input: TextFieldTensors = { 38 | "tokens": { 39 | "token_ids": four_dim_tensor, 40 | "mask": torch.ones_like(four_dim_tensor), 41 | "type_ids": torch.ones_like(four_dim_tensor), 42 | } 43 | } 44 | 45 | # Only TextFieldTensors with tensors of three dimensions should be reshaped... 46 | # Tensors are updated in-place, so deepcopy before passing to unpack_batch 47 | actual_three_input_dim = model_utils.unpack_batch(deepcopy(three_dim_input)) 48 | for name, tensor in actual_three_input_dim["tokens"].items(): 49 | assert torch.equal( 50 | tensor, 51 | three_dim_input["tokens"][name].reshape(batch_size * num_anchors, max_length), 52 | ) 53 | # ...unpack_batch is a no-op for TextFieldTensors with tensors less than or greater than 3D. 
54 | actual_two_dim_input = model_utils.unpack_batch(deepcopy(two_dim_input)) 55 | for name, tensor in actual_two_dim_input["tokens"].items(): 56 | assert torch.equal(tensor, two_dim_input["tokens"][name]) 57 | actual_four_dim_input = model_utils.unpack_batch(deepcopy(four_dim_input)) 58 | for name, tensor in actual_four_dim_input["tokens"].items(): 59 | assert torch.equal(tensor, four_dim_input["tokens"][name]) 60 | 61 | def test_all_gather_anchor_positive_pairs_no_op(self) -> None: 62 | """Check that `all_gather_anchor_positive_pairs` is a no-op when not in distributed mode.""" 63 | num_anchors = 2 64 | num_positives = 2 65 | batch_size = 16 66 | embedding_dim = 256 67 | 68 | expected_anchors = torch.randn(num_anchors, batch_size, embedding_dim) 69 | expected_positives = torch.randn(num_positives, batch_size, embedding_dim) 70 | actual_anchors, actual_positives = model_utils.all_gather_anchor_positive_pairs( 71 | expected_anchors, expected_positives 72 | ) 73 | 74 | assert torch.equal(actual_anchors, expected_anchors) 75 | assert torch.equal(actual_positives, expected_positives) 76 | -------------------------------------------------------------------------------- /declutr/modules/text_field_embedders/mlm_text_field_embedder.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from typing import Dict 3 | 4 | import torch 5 | 6 | from allennlp.common.checks import ConfigurationError 7 | from allennlp.data import TextFieldTensors 8 | from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder 9 | from allennlp.modules.text_field_embedders.text_field_embedder import TextFieldEmbedder 10 | from allennlp.modules.time_distributed import TimeDistributed 11 | from allennlp.modules.token_embedders.token_embedder import TokenEmbedder 12 | 13 | 14 | @TextFieldEmbedder.register("mlm") 15 | class MLMTextFieldEmbedder(BasicTextFieldEmbedder): 16 | """ 17 | This is a a simple wrapper around `BasicTextFieldEmbedder` that accounts for the fact that 18 | our custom PretrainedTransformerEmbedderMLM returns a tuple containing the loss for the masked 19 | language modelling objective as well as some embedded text. 20 | 21 | Registered as a `TextFieldEmbedder` with name "mlm". 22 | 23 | # Parameters 24 | 25 | token_embedders : `Dict[str, TokenEmbedder]`, required. 26 | A dictionary mapping token embedder names to implementations. 27 | These names should match the corresponding indexer used to generate 28 | the tensor passed to the TokenEmbedder. 29 | """ 30 | 31 | def __init__(self, token_embedders: Dict[str, TokenEmbedder]) -> None: 32 | super().__init__(token_embedders) 33 | 34 | def forward( 35 | self, text_field_input: TextFieldTensors, num_wrapping_dims: int = 0, **kwargs 36 | ) -> torch.Tensor: 37 | if self._token_embedders.keys() != text_field_input.keys(): 38 | message = "Mismatched token keys: %s and %s" % ( 39 | str(self._token_embedders.keys()), 40 | str(text_field_input.keys()), 41 | ) 42 | raise ConfigurationError(message) 43 | 44 | embedded_representations = [] 45 | for key in self._ordered_embedder_keys: 46 | # Note: need to use getattr here so that the pytorch voodoo 47 | # with submodules works with multiple GPUs. 
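            # (The parent class, `BasicTextFieldEmbedder`, registers each embedder as a submodule
            # named "token_embedder_<key>", which is why it is looked up by attribute name here
            # rather than read from a plain dict.)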
48 | embedder = getattr(self, "token_embedder_{}".format(key)) 49 | forward_params = inspect.signature(embedder.forward).parameters 50 | forward_params_values = {} 51 | missing_tensor_args = set() 52 | for param in forward_params.keys(): 53 | if param in kwargs: 54 | forward_params_values[param] = kwargs[param] 55 | else: 56 | missing_tensor_args.add(param) 57 | 58 | for _ in range(num_wrapping_dims): 59 | embedder = TimeDistributed(embedder) 60 | 61 | tensors: Dict[str, torch.Tensor] = text_field_input[key] 62 | if len(tensors) == 1 and len(missing_tensor_args) == 1: 63 | # If there's only one tensor argument to the embedder, and we just have one tensor 64 | # to embed, we can just pass in that tensor, without requiring a name match. 65 | masked_lm_loss, token_vectors = embedder( 66 | list(tensors.values())[0], **forward_params_values 67 | ) 68 | else: 69 | # If there are multiple tensor arguments, we have to require matching names from 70 | # the TokenIndexer. I don't think there's an easy way around that. 71 | masked_lm_loss, token_vectors = embedder(**tensors, **forward_params_values) 72 | if token_vectors is not None: 73 | # To handle some very rare use cases, we allow the return value of the embedder to 74 | # be None; we just skip it in that case. 75 | embedded_representations.append(token_vectors) 76 | return masked_lm_loss, torch.cat(embedded_representations, dim=-1) 77 | -------------------------------------------------------------------------------- /declutr/losses/pytorch_metric_learning.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import Tuple 3 | 4 | import torch 5 | from allennlp.common import Registrable 6 | 7 | from declutr.miners import PyTorchMetricLearningMiner 8 | from pytorch_metric_learning import losses 9 | 10 | 11 | class PyTorchMetricLearningLoss(Registrable): 12 | """This class allows us to implement `Registrable` for PyTorch Metric Learning loss functions. 13 | Subclasses of this class should also subclass a loss function from PyTorch Metric Learning 14 | (see: https://kevinmusgrave.github.io/pytorch-metric-learning/losses/), and accept as arguments 15 | to the constructor the same arguments that the loss function does. See `NTXentLoss` below for 16 | an example. 17 | """ 18 | 19 | default_implementation = "nt_xent" 20 | 21 | @classmethod 22 | def get_embeddings_and_labels( 23 | self, anchors: torch.Tensor, positives: torch.Tensor 24 | ) -> Tuple[torch.Tensor, torch.Tensor]: 25 | """Formats a pair of anchor, positive embeddings for use with a PyTorch Metric Learning loss 26 | function (https://github.com/KevinMusgrave/pytorch-metric-learning). These loss functions 27 | expect a single embedding tensor, and a corresponding set of labels. Given two tensors: 28 | `anchor_embeddings` and `positive_embeddings` each of shape `(batch_size, embedding_dim)`, 29 | concatenate them along the first dimension to produce a single tensor, `embeddings`, of 30 | shape `(batch_size * 2, embedding_dim)`. Then, generate the corresponding `labels` tensor of 31 | shape `(batch_size * 2)` by assigning a matching integer index to each pair of anchor, 32 | positive embeddings in `embeddings`. 33 | 34 | # Parameters 35 | 36 | anchor_embeddings : `torch.Tensor` 37 | Encoded representations of the anchors. 38 | positive_embeddings : `torch.Tensor` 39 | Encoded representations of the positives. 
40 | 41 | # Returns 42 | 43 | A tuple of embeddings and labels that can be fed directly to any PyTorch Metric Learning 44 | loss function. 45 | """ 46 | embeddings = torch.cat((anchors, positives)) 47 | # When using CrossBatchMemory, labels persist across batches so they need to be unique. 48 | # By choosing a random integer in (0, sys.maxsize) we can be reasonably sure of this. 49 | # Obviously, there are better (i.e. deterministic ways to do this), but I don't have 50 | # access to the current batch id or some other uniquely identifying value. 51 | indices = torch.randint(sys.maxsize, (anchors.size(0),), device=anchors.device) 52 | labels = torch.cat((indices, indices)) 53 | 54 | return embeddings, labels 55 | 56 | 57 | @PyTorchMetricLearningLoss.register("cross_batch_memory") 58 | class CrossBatchMemory(PyTorchMetricLearningLoss, losses.CrossBatchMemory): 59 | """Wraps the `CrossBatchMemory` implementation from Pytorch Metric Learning: 60 | (https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#crossbatchmemory). 61 | 62 | Registered as a `PyTorchMetricLearningLoss` with name "cross_batch_memory". 63 | """ 64 | 65 | def __init__( 66 | self, 67 | loss: PyTorchMetricLearningLoss, 68 | embedding_size: int, 69 | memory_size: int = 1024, 70 | miner: PyTorchMetricLearningMiner = None, 71 | ) -> None: 72 | 73 | super().__init__( 74 | loss=loss, 75 | embedding_size=embedding_size, 76 | memory_size=memory_size, 77 | miner=miner, 78 | ) 79 | 80 | 81 | @PyTorchMetricLearningLoss.register("nt_xent") 82 | class NTXentLoss(PyTorchMetricLearningLoss, losses.NTXentLoss): 83 | """Wraps the `NTXentLoss` implementation from Pytorch Metric Learning: 84 | (https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#ntxentloss). 85 | 86 | Registered as a `PyTorchMetricLearningLoss` with name "nt_xent". 
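
    # Example Usage

    A minimal, illustrative sketch of computing the loss for a batch of anchor, positive pairs.
    The batch size and embedding dimension below are arbitrary values chosen for illustration:

    ```python
    import torch

    anchors = torch.randn(4, 128)    # (batch_size, embedding_dim)
    positives = torch.randn(4, 128)  # (batch_size, embedding_dim)

    loss_fn = NTXentLoss(temperature=0.05)
    # Anchor i and positive i are assigned the same label, i.e. labels[:4] == labels[4:].
    embeddings, labels = NTXentLoss.get_embeddings_and_labels(anchors, positives)
    loss = loss_fn(embeddings, labels)  # embeddings: (8, 128), labels: (8,)
    ```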
87 | """ 88 | 89 | def __init__(self, temperature: float) -> None: 90 | 91 | super().__init__(temperature=temperature) 92 | -------------------------------------------------------------------------------- /tests/test_model.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from allennlp.common.params import Params 6 | from allennlp.common.testing import ModelTestCase 7 | from allennlp.models import Model 8 | 9 | 10 | class TestDeCLUTR(ModelTestCase): 11 | def setup_method(self) -> None: 12 | super().setup_method() 13 | # We need to override the path set by AllenNLP 14 | self.FIXTURES_ROOT = Path("tests/fixtures") 15 | self.set_up_model( 16 | self.FIXTURES_ROOT / "experiment.jsonnet", 17 | self.FIXTURES_ROOT / "data" / "openwebtext" / "train.txt", 18 | ) 19 | 20 | def test_forward_pass_runs_correctly(self) -> None: 21 | training_tensors = self.dataset.as_tensor_dict() 22 | output_dict = self.model(**training_tensors) 23 | output_dict = self.model.make_output_human_readable(output_dict) 24 | assert "loss" in output_dict.keys() 25 | # Embeddings are not added to the output dict when training 26 | assert "embeddings" not in output_dict.keys() 27 | 28 | def test_forward_pass_with_feedforward_runs_correctly(self) -> None: 29 | self.set_up_model( 30 | self.FIXTURES_ROOT / "experiment_feedforward.jsonnet", 31 | self.FIXTURES_ROOT / "data" / "openwebtext" / "train.txt", 32 | ) 33 | training_tensors = self.dataset.as_tensor_dict() 34 | output_dict = self.model(**training_tensors) 35 | output_dict = self.model.make_output_human_readable(output_dict) 36 | assert "loss" in output_dict.keys() 37 | # Embeddings are not added to the output dict when training 38 | assert "embeddings" not in output_dict.keys() 39 | 40 | def test_forward_pass_contrastive_only_runs_correctly(self) -> None: 41 | self.set_up_model( 42 | self.FIXTURES_ROOT / "experiment_contrastive_only.jsonnet", 43 | self.FIXTURES_ROOT / "data" / "openwebtext" / "train.txt", 44 | ) 45 | training_tensors = self.dataset.as_tensor_dict() 46 | output_dict = self.model(**training_tensors) 47 | output_dict = self.model.make_output_human_readable(output_dict) 48 | assert "loss" in output_dict.keys() 49 | # Embeddings are not added to the output dict when training 50 | assert "embeddings" not in output_dict.keys() 51 | 52 | def test_forward_pass_mlm_only_runs_correctly(self) -> None: 53 | self.set_up_model( 54 | self.FIXTURES_ROOT / "experiment_mlm_only.jsonnet", 55 | self.FIXTURES_ROOT / "data" / "openwebtext" / "train.txt", 56 | ) 57 | training_tensors = self.dataset.as_tensor_dict() 58 | output_dict = self.model(**training_tensors) 59 | output_dict = self.model.make_output_human_readable(output_dict) 60 | assert "loss" in output_dict.keys() 61 | # Embeddings are not added to the output dict when training 62 | assert "embeddings" not in output_dict.keys() 63 | 64 | def test_forward_pass_scalar_mix_runs_correctly(self) -> None: 65 | self.set_up_model( 66 | self.FIXTURES_ROOT / "experiment_scalar_mix.jsonnet", 67 | self.FIXTURES_ROOT / "data" / "openwebtext" / "train.txt", 68 | ) 69 | training_tensors = self.dataset.as_tensor_dict() 70 | output_dict = self.model(**training_tensors) 71 | output_dict = self.model.make_output_human_readable(output_dict) 72 | assert "loss" in output_dict.keys() 73 | # Embeddings are not added to the output dict when training 74 | assert "embeddings" not in output_dict.keys() 75 | 76 | def 
test_no_loss_throws_configuration_error(self) -> None: 77 | params = Params.from_file(self.param_file) 78 | params["model"]["loss"] = None 79 | params["model"]["text_field_embedder"]["token_embedders"]["tokens"][ 80 | "masked_language_modeling" 81 | ] = False 82 | with pytest.raises(ValueError): 83 | Model.from_params(vocab=self.vocab, params=params.get("model")) 84 | 85 | @pytest.mark.skip(reason="failing for upstream reasons") 86 | def test_can_train_save_and_load(self) -> None: 87 | self.ensure_model_can_train_save_and_load(self.param_file) 88 | -------------------------------------------------------------------------------- /tests/test_dataset_reader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from hypothesis import given, settings 3 | from hypothesis.strategies import integers, text 4 | 5 | from declutr.dataset_reader import DeCLUTRDatasetReader 6 | 7 | 8 | class TestDeCLUTRDatasetReader: 9 | # Not clear why turning off the deadline is necessary, but the test errors out otherwise. 10 | @settings(deadline=None) 11 | @given( 12 | num_anchors=integers(min_value=0, max_value=4), 13 | num_positives=integers(min_value=1, max_value=4), 14 | max_span_len=integers(min_value=32, max_value=64), 15 | min_span_len=integers(min_value=16, max_value=32), 16 | ) 17 | def test_no_sample_context_manager( 18 | self, num_anchors: int, num_positives: int, max_span_len: int, min_span_len: int 19 | ) -> None: 20 | dataset_reader = DeCLUTRDatasetReader( 21 | num_anchors=num_anchors, 22 | num_positives=num_positives, 23 | max_span_len=max_span_len, 24 | min_span_len=min_span_len, 25 | ) 26 | 27 | # While in the scope of the context manager, sample_spans should be false. 28 | # After exiting the context manager, it should return to whatever value it was at 29 | # before entering the context manager.
30 | previous = dataset_reader.sample_spans 31 | with dataset_reader.no_sample(): 32 | assert not dataset_reader.sample_spans 33 | assert dataset_reader.sample_spans == previous 34 | 35 | @given( 36 | num_anchors=integers(min_value=0, max_value=4), 37 | num_positives=integers(min_value=1, max_value=4), 38 | max_span_len=integers(min_value=32, max_value=64), 39 | min_span_len=integers(min_value=16, max_value=32), 40 | ) 41 | def test_init_raises_value_error_sampling_missing_arguments( 42 | self, num_anchors: int, num_positives: int, max_span_len: int, min_span_len: int 43 | ) -> None: 44 | if num_anchors: # should only raise the error when num_anchors is truthy 45 | with pytest.raises(ValueError): 46 | _ = DeCLUTRDatasetReader( 47 | num_anchors=num_anchors, 48 | num_positives=num_positives, 49 | max_span_len=None, 50 | min_span_len=min_span_len, 51 | ) 52 | with pytest.raises(ValueError): 53 | _ = DeCLUTRDatasetReader( 54 | num_anchors=num_anchors, 55 | num_positives=num_positives, 56 | max_span_len=max_span_len, 57 | min_span_len=None, 58 | ) 59 | with pytest.raises(ValueError): 60 | _ = DeCLUTRDatasetReader( 61 | num_anchors=num_anchors, 62 | num_positives=num_positives, 63 | max_span_len=None, 64 | min_span_len=None, 65 | ) 66 | with pytest.raises(ValueError): 67 | _ = DeCLUTRDatasetReader( 68 | num_anchors=num_anchors, 69 | num_positives=None, 70 | max_span_len=max_span_len, 71 | min_span_len=min_span_len, 72 | ) 73 | 74 | @given( 75 | num_anchors=integers(min_value=0, max_value=4), 76 | num_positives=integers(min_value=1, max_value=4), 77 | max_span_len=integers(min_value=32, max_value=64), 78 | min_span_len=integers(min_value=16, max_value=32), 79 | sampling_strategy=text(), 80 | ) 81 | def test_init_raises_value_error_invalid_sampling_strategy( 82 | self, 83 | num_anchors: int, 84 | num_positives: int, 85 | max_span_len: int, 86 | min_span_len: int, 87 | sampling_strategy: str, 88 | ) -> None: 89 | if num_anchors: # should only raise the error when num_spans is truthy 90 | with pytest.raises(ValueError): 91 | _ = DeCLUTRDatasetReader( 92 | num_anchors=num_anchors, 93 | num_positives=num_positives, 94 | max_span_len=max_span_len, 95 | min_span_len=min_span_len, 96 | sampling_strategy=sampling_strategy, 97 | ) 98 | -------------------------------------------------------------------------------- /scripts/preprocess_wikitext_103.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import io 3 | import re 4 | import zipfile 5 | from pathlib import Path 6 | from typing import List, Optional 7 | 8 | import requests 9 | import typer 10 | from declutr.common.util import sanitize_text 11 | 12 | WIKITEXT_103_URL = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip" 13 | 14 | # Emoji's used in typer.secho calls 15 | # See: https://github.com/carpedm20/emoji/blob/master/emoji/unicode_codes.py" 16 | SAVING = "\U0001F4BE" 17 | DOWNLOAD = "\U00002B07" 18 | 19 | 20 | def _write_output_to_disk(text: List[str], output_filepath: Path) -> None: 21 | """Writes a list of documents, `text`, to the file `output_filepath`, one document per line.""" 22 | # Create the directory path if it doesn't exist 23 | output_filepath = Path(output_filepath) 24 | output_filepath.parents[0].mkdir(parents=True, exist_ok=True) 25 | 26 | with open(output_filepath, "w") as f: 27 | # TODO (John): In the future, it might make sense to both batch and shard: 28 | # 1) Batch, meaning write batches of documents to a file as opposed to 1 at a 
time 29 | # 2) Shard, meaning break a file up into shard_size // len(text) files, and return a 30 | # directory instead. Loading a dataset like this is supported in AllenNLP (see: 31 | # https://docs.allennlp.org/master/api/data/dataset_readers/sharded_dataset_reader/) 32 | with typer.progressbar(text, label="Writing to disk") as progress: 33 | for doc in progress: 34 | f.write(doc.strip() + "\n") 35 | typer.secho( 36 | f"{SAVING} {len(text)} preprocessed documents saved to: {output_filepath}", 37 | bold=True, 38 | ) 39 | 40 | 41 | def main( 42 | output_filepath: Path, 43 | segment_sentences: bool = False, 44 | lowercase: bool = False, 45 | min_length: Optional[int] = None, 46 | max_instances: Optional[int] = None, 47 | pretrained_model_name_or_path: Optional[str] = None, 48 | ) -> None: 49 | """Downloads and lightly preprocesses WikiText-103. If `min_length is not None`, only documents 50 | with at least this many tokens are retained. If `pretrained_model_name_or_path` is not None, the 51 | tokenizer will be loaded as `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` 52 | using the HuggingFace Transformers library. Otherwise `str.split()` is used. This argument has 53 | no effect if `min-length is None`. If `segment_sentences` is provided, individual sentences 54 | will be returned instead of documents. You must have the `"en_core_web_sm"` spacy model 55 | installed to segment sentences. 56 | """ 57 | # Setup the pre-trained tokenizer, if specified 58 | if min_length is not None: 59 | if pretrained_model_name_or_path is not None: 60 | # Import transformers here to prevent ImportError errors if the 61 | # user doesn't want to use it. 62 | from transformers import AutoTokenizer 63 | 64 | tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path).tokenize 65 | else: 66 | tokenizer = lambda x: x.split() # noqa 67 | else: 68 | tokenizer = None 69 | 70 | # Setup spacy lang object if we are segmenting sentences 71 | if segment_sentences: 72 | import spacy 73 | 74 | nlp = spacy.load("en_core_web_sm", disable=["ner"]) 75 | 76 | # Download WikiText-103 77 | r = requests.get(WIKITEXT_103_URL, stream=True) 78 | z = zipfile.ZipFile(io.BytesIO(r.content)) 79 | partition_filenames = z.namelist()[1:] 80 | typer.secho(f"{DOWNLOAD} Downloaded WikiText-103", bold=True) 81 | 82 | preprocessed_documents: List[str] = [] 83 | for filename in partition_filenames: 84 | text = z.open(filename).read().decode("utf-8") 85 | 86 | # Strip out subtitles and split the text into documents 87 | no_subtitles = re.sub(r"(=\s){2,5}.*(=\s){2,5}", "", text) 88 | documents = re.split(r"=\s.*\s=", no_subtitles) 89 | 90 | if segment_sentences: 91 | documents = (sent.text for doc in documents for sent in nlp(doc).sents) # type: ignore 92 | 93 | with typer.progressbar( 94 | documents, length=max_instances, label=typer.style("Preprocessing text", bold=True) 95 | ) as progress: 96 | for doc in progress: 97 | doc = sanitize_text(doc, lowercase=lowercase) 98 | if not doc: 99 | continue 100 | 101 | # Retain documents if the length of their shortest document is 102 | # equal to or greater than the minimum specified length 103 | if tokenizer is not None: 104 | num_tokens = len(tokenizer(doc)) 105 | if min_length and num_tokens < min_length: 106 | continue 107 | 108 | if max_instances and len(preprocessed_documents) >= max_instances: 109 | break 110 | preprocessed_documents.append(doc) 111 | progress.update(1) 112 | 113 | _write_output_to_disk(preprocessed_documents, output_filepath) 114 | 115 | 116 | if __name__ 
== "__main__": 117 | typer.run(main) 118 | -------------------------------------------------------------------------------- /notebooks/evaluating.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "evaluating.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "accelerator": "GPU" 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "LY9LO9FnSPIa" 21 | }, 22 | "source": [ 23 | "# Evaluating a model\n", 24 | "\n", 25 | "This notebook will walk you through evaluating a [DeCLUTR](https://github.com/JohnGiorgi/DeCLUTR) model with [SentEval](https://github.com/facebookresearch/SentEval)." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "ZbZ4o1HHSM5t" 32 | }, 33 | "source": [ 34 | "## 🔧 Install the prerequisites\n", 35 | "\n", 36 | "Clone to repository locally so we have access to the evaluation scripts. Then install DeCLUTR" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "metadata": { 42 | "id": "fdyoe-EPSKLN" 43 | }, 44 | "source": [ 45 | "%%bash\n", 46 | "git clone https://github.com/JohnGiorgi/DeCLUTR.git\n", 47 | "cd DeCLUTR\n", 48 | "pip install -e .\n", 49 | "cd ../" 50 | ], 51 | "execution_count": null, 52 | "outputs": [] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "id": "5o1owrdbWDl9" 58 | }, 59 | "source": [ 60 | "Next, we have to clone the SentEval benchmark locally (this will take a few minutes)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "metadata": { 66 | "id": "_9Mg77kOREs7" 67 | }, 68 | "source": [ 69 | "%%bash\n", 70 | "# Clone our fork which has several bug fixes merged\n", 71 | "git clone https://github.com/JohnGiorgi/SentEval.git\n", 72 | "cd SentEval/data/downstream/\n", 73 | "./get_transfer_data.bash\n", 74 | "cd ../../../" 75 | ], 76 | "execution_count": null, 77 | "outputs": [] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "id": "m05mAP5wWU-f" 83 | }, 84 | "source": [ 85 | "Lastly, we need a model to evaluate. We will download `DeCLUTR-small`:\n", 86 | "\n" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "metadata": { 92 | "id": "0Nd8oYGpUn5k" 93 | }, 94 | "source": [ 95 | "!wget https://github.com/JohnGiorgi/DeCLUTR/releases/download/v0.1.0rc1/declutr-small.tar.gz" 96 | ], 97 | "execution_count": null, 98 | "outputs": [] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "id": "3ae7g5puWn8X" 104 | }, 105 | "source": [ 106 | "## 📋 Evaluating the model\n", 107 | "\n", 108 | "Finally, use our provided script to evaluate the model on SentEval.\n", 109 | "\n", 110 | "> Note, the script will evaluate on all 28 SentEval tasks. This can take 7 hours or more on a GPU." 
111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "zLWGGQSJUwW1" 117 | }, 118 | "source": [ 119 | "!python DeCLUTR/scripts/run_senteval.py allennlp \"SentEval\" \"declutr-small.tar.gz\" \\\n", 120 | " --output-filepath \"senteval_results.json\" \\\n", 121 | " --cuda-device 0 \\\n", 122 | " --include-package \"declutr\" \\\n", 123 | " --verbose" 124 | ], 125 | "execution_count": null, 126 | "outputs": [] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": { 131 | "id": "MLRyWa1IXepJ" 132 | }, 133 | "source": [ 134 | "We also provide commands for evaluating other popular sentence encoders. For a list of commands, run:" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "metadata": { 140 | "id": "GWdKsXc_W6TC" 141 | }, 142 | "source": [ 143 | "!python DeCLUTR/scripts/run_senteval.py --help" 144 | ], 145 | "execution_count": null, 146 | "outputs": [] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "id": "OCNuiVzLXozu" 152 | }, 153 | "source": [ 154 | "For help with a specific command, e.g. `transformers`, run:" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "metadata": { 160 | "id": "XEbSmYIVWNoB" 161 | }, 162 | "source": [ 163 | "!python DeCLUTR/scripts/run_senteval.py transformers --help" 164 | ], 165 | "execution_count": null, 166 | "outputs": [] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": { 171 | "id": "AZyXiAmHX3RF" 172 | }, 173 | "source": [ 174 | "Notice that evaluate other popular models, like [Sentence Transformers](https://www.sbert.net/)! Just make sure to install it first:" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "samad0pSbg4N" 181 | }, 182 | "source": [ 183 | "!pip install sentence-transformers" 184 | ], 185 | "execution_count": null, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "metadata": { 191 | "id": "V1kN71RBXtwB" 192 | }, 193 | "source": [ 194 | "!python DeCLUTR/scripts/run_senteval.py sentence-transformers \"SentEval\" \"roberta-base-nli-mean-tokens\" \\\n", 195 | " --output-filepath \"senteval_results.json\" \\\n", 196 | " --cuda-device 0 \\\n", 197 | " --verbose" 198 | ], 199 | "execution_count": null, 200 | "outputs": [] 201 | } 202 | ] 203 | } -------------------------------------------------------------------------------- /scripts/preprocess_openwebtext.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import shutil 3 | import tarfile 4 | from pathlib import Path 5 | from typing import Optional 6 | 7 | import typer 8 | from declutr.common.util import sanitize_text 9 | from more_itertools import chunked 10 | 11 | # Emoji's used in typer.secho calls 12 | # See: https://github.com/carpedm20/emoji/blob/master/emoji/unicode_codes.py" 13 | WARNING = "\U000026A0" 14 | SAVING = "\U0001F4BE" 15 | MINING = "\U000026CF" 16 | 17 | 18 | def main( 19 | openwebtext_path: Path = typer.Argument(..., help="Path to a OpenWebText dump."), 20 | output_filepath: Path = typer.Argument(..., help="Filepath to save the preprocessed text"), 21 | min_length: Optional[int] = typer.Option( 22 | None, help="Minimum token length of documents to retain" 23 | ), 24 | lowercase: bool = typer.Option(True, help="Whether text should be lowercased"), 25 | max_documents: Optional[int] = typer.Option( 26 | None, 27 | help="Maximum number of documents to retain. 
Because of batching, this won't be exact.", 28 | ), 29 | pretrained_model_name_or_path: Optional[str] = typer.Option( 30 | None, 31 | help=( 32 | "Name of the HuggingFace Tokenizer to use when determining the token length of a" 33 | "document. Has no effect if min-length is None" 34 | ), 35 | ), 36 | ) -> None: 37 | """Lightly preprocesses an OpenWebText dump obtained from 38 | https://skylion007.github.io/OpenWebTextCorpus/. If `min-length is not None`, only documents 39 | with at least this many tokens are retained. If `pretrained_model_name_or_path` is not None, 40 | the tokenizer will be loaded as `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` 41 | using the HuggingFace Transformers library. Otherwise `str.split()` is used. This argument has 42 | no effect if `min-length is None`. 43 | """ 44 | openwebtext_path = Path(openwebtext_path) 45 | output_filepath = Path(output_filepath) 46 | output_filepath.parents[0].mkdir(parents=True, exist_ok=True) 47 | 48 | # Setup the pre-trained tokenizer, if specified 49 | if min_length is not None: 50 | if pretrained_model_name_or_path is not None: 51 | # Import transformers here to prevent ImportError errors if the 52 | # user doesn't want to use it. 53 | from transformers import AutoTokenizer 54 | 55 | tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, use_fast=True) 56 | else: 57 | tokenizer = lambda x: x.split() # noqa 58 | else: 59 | tokenizer = None 60 | 61 | processed_docs = 0 62 | skipped_files = 0 63 | typer.secho( 64 | ( 65 | f'{MINING} Scraping {max_documents or "all"} documents' 66 | f' {f"with a minimum token length of {min_length}" if min_length else ""}' 67 | ), 68 | bold=True, 69 | ) 70 | 71 | with typer.progressbar( 72 | length=max_documents or len(list(openwebtext_path.iterdir())), label="Preprocessing text" 73 | ) as progress: 74 | for tar_filepath in openwebtext_path.iterdir(): 75 | # Didn't bother debugging as it only happens for a tiny number (1-2) of tar archives. 76 | # Instead, catch the error and report to the user at the end how many we skipped. 77 | untared_filepath = Path(tar_filepath.stem) 78 | try: 79 | with tarfile.open(tar_filepath) as tf: 80 | tf.extractall(untared_filepath) 81 | except (tarfile.ReadError, IsADirectoryError): 82 | skipped_files += 1 83 | continue 84 | 85 | for text_filepaths in chunked(untared_filepath.iterdir(), 128): 86 | docs = [] 87 | for fp in text_filepaths: 88 | # Some very minimal preprocessing to remove extra whitespace, newlines and tabs. 89 | doc = sanitize_text(fp.read_text(), lowercase=lowercase) 90 | # We add a space in front of the text in order to achieve consistant tokenization 91 | # with certain tokenizers, e.g. the BPE tokenizer used by RoBERTa, GPT and others. 
92 | # See: https://github.com/huggingface/transformers/issues/1196 93 | doc = f" {doc.lstrip()}" 94 | docs.append(doc) 95 | 96 | if tokenizer is not None: 97 | if pretrained_model_name_or_path: 98 | lengths = tokenizer( 99 | docs, add_special_tokens=False, truncation=False, return_length=True 100 | ).length 101 | else: 102 | lengths = [len(tokenizer(doc)) for doc in docs] 103 | docs = [doc for doc, length in zip(docs, lengths) if length > min_length] 104 | 105 | with open(output_filepath, "a") as f: 106 | f.write("\n".join(docs).strip() + "\n") 107 | 108 | if max_documents: 109 | progress.update(len(docs)) 110 | processed_docs += len(docs) 111 | if processed_docs >= max_documents: 112 | break 113 | 114 | # We are using a for-else trick here, see: https://stackoverflow.com/a/3150107/6578628 115 | else: 116 | if max_documents is None: 117 | progress.update(1) 118 | shutil.rmtree(untared_filepath) 119 | # Continue if the inner loop wasn't broken. 120 | continue 121 | shutil.rmtree(untared_filepath) 122 | # Inner loop was broken, break the outer. 123 | break 124 | 125 | if skipped_files: 126 | typer.secho( 127 | f"{WARNING} {skipped_files} tar files were skipped because they couldn't be extracted.", 128 | fg=typer.colors.YELLOW, 129 | bold=True, 130 | ) 131 | 132 | 133 | if __name__ == "__main__": 134 | typer.run(main) 135 | -------------------------------------------------------------------------------- /declutr/encoder.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from operator import itemgetter 3 | from pathlib import Path 4 | from typing import List, Optional, Tuple, Union, cast 5 | 6 | import torch 7 | from allennlp.common import util as common_util 8 | from allennlp.common.file_utils import cached_path 9 | from allennlp.models.archival import load_archive 10 | from allennlp.predictors import Predictor 11 | from validators.url import url 12 | 13 | from declutr.common.util import sanitize_text 14 | 15 | PRETRAINED_MODELS = { 16 | "declutr-small": "https://github.com/JohnGiorgi/DeCLUTR/releases/download/v0.1.0rc1/declutr-small.tar.gz", 17 | "declutr-base": "https://github.com/JohnGiorgi/DeCLUTR/releases/download/v0.1.0rc1/declutr-base.tar.gz", 18 | } 19 | 20 | 21 | class Encoder: 22 | """A simple interface to the model for the purposes of embedding sentences/paragraphs. 23 | 24 | # Example Usage 25 | 26 | ```python 27 | from declutr import Encoder 28 | 29 | # This can be a path on disk to a model you have trained yourself OR 30 | # the name of one of our pretrained models. 31 | pretrained_model_or_path = "declutr-small" 32 | 33 | encoder = Encoder(pretrained_model_or_path) 34 | embeddings = encoder([ 35 | "A smiling costumed woman is holding an umbrella.", 36 | "A happy woman in a fairy costume holds an umbrella." 37 | ]) 38 | ``` 39 | 40 | # Parameters 41 | 42 | pretrained_model_name_or_path : `str`, required 43 | Path to a serialized AllenNLP archive or a model name from: 44 | `declutr.encoder.PRETRAINED_MODELS` 45 | sphereize : `bool`, optional (default = `False`) 46 | If `True`, embeddings will be l2-normalized and shifted by the centroid. Defaults to `False`. 47 | **kwargs : `Dict`, optional 48 | Keyword arguments that will be passed to `allennlp.models.archival.load_archive`. This is 49 | useful, for example, to specify a CUDA device id with `cuda_device`. See: 50 | https://docs.allennlp.org/master/api/models/archival/#load_archive for more details.
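
    A further sketch (illustrative only) of embedding on a GPU with batched inputs. It assumes a
    CUDA device is available; `cuda_device` is simply forwarded to `load_archive`, and
    `batch_size` controls how many inputs are embedded per forward pass:

    ```python
    encoder = Encoder("declutr-small", cuda_device=0, sphereize=True)
    embeddings = encoder(
        [
            "A smiling costumed woman is holding an umbrella.",
            "A happy woman in a fairy costume holds an umbrella.",
        ],
        batch_size=2,
    )
    ```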
51 | """ 52 | 53 | _output_dict_field = "embeddings" 54 | 55 | def __init__( 56 | self, pretrained_model_name_or_path: str, sphereize: bool = False, **kwargs 57 | ) -> None: 58 | if pretrained_model_name_or_path in PRETRAINED_MODELS: 59 | pretrained_model_name_or_path = PRETRAINED_MODELS[pretrained_model_name_or_path] 60 | common_util.import_module_and_submodules("declutr") 61 | archive = load_archive(pretrained_model_name_or_path, **kwargs) 62 | self._predictor = Predictor.from_archive(archive, predictor_name="declutr") 63 | self._sphereize = sphereize 64 | 65 | @torch.no_grad() 66 | def __call__( 67 | self, inputs: Union[str, List[str]], batch_size: Optional[int] = None 68 | ) -> torch.Tensor: 69 | """Returns a numpy array of embeddings, one for each item in `inputs`. 70 | 71 | # Parameters 72 | 73 | inputs : `Union[str, List[str]]`, required 74 | The input text to embed. Can be a string, list of strings, or a filepath/URL to a text 75 | file with one input per line. 76 | batch_size : `int`, optional 77 | If given, the `inputs` will be batched before embedding. 78 | """ 79 | if isinstance(inputs, str): 80 | # Determine if inputs is a path, or text string 81 | try: 82 | is_path = Path(inputs).is_file() 83 | except OSError: 84 | warnings.warn( 85 | "'OSError' raised when checking if 'inputs' is a filepath." 86 | " Assuming it is a string or URL." 87 | ) 88 | else: 89 | is_path = Path(inputs).is_file() or url(inputs) 90 | 91 | if is_path: 92 | inputs = Path(cached_path(inputs)).read_text().split("\n") 93 | else: 94 | inputs = [inputs] 95 | 96 | if batch_size is None: 97 | unsort = False 98 | batch_size = len(inputs) 99 | else: 100 | # Sort the inputs by length, maintaining the original indices so we can un-sort 101 | # before returning the embeddings. This speeds up embedding by minimizing the 102 | # amount of computation performed on pads. Because this sorting happens before 103 | # tokenization, it is only a proxy of the true lengths of the inputs to the model. 104 | # In the future, it would be better to use the built-in bucket sort of AllenNLP, 105 | # which would lead to an even larger speedup. 106 | unsort = True 107 | sorted_indices, inputs = cast( 108 | Tuple[List[int], List[str]], zip(*sorted(enumerate(inputs), key=itemgetter(1))) 109 | ) # tell mypy explicitly the types of items in the unpacked tuple 110 | unsorted_indices, _ = zip(*sorted(enumerate(sorted_indices), key=itemgetter(1))) 111 | 112 | embeddings: torch.FloatTensor = [] # promise mypy we will behave 113 | for i in range(0, len(inputs), batch_size): 114 | batch_json = [{"text": sanitize_text(input_)} for input_ in inputs[i : i + batch_size]] 115 | outputs = self._predictor.predict_batch_json(batch_json) 116 | outputs = torch.as_tensor( 117 | # Accumulating the tensors on the GPU would quickly lead to OOM. 118 | [output[self._output_dict_field] for output in outputs], 119 | device="cpu", 120 | ) 121 | embeddings.append(outputs) 122 | embeddings = torch.cat(embeddings) 123 | # Make sure to unsort the embeddings if they were sorted. 124 | if unsort: 125 | unsorted_indices = torch.as_tensor(unsorted_indices, dtype=torch.long) 126 | embeddings = torch.index_select(embeddings, dim=0, index=unsorted_indices) 127 | if self._sphereize: 128 | if embeddings.size(0) > 1: 129 | centroid = torch.mean(embeddings, dim=0) 130 | embeddings -= centroid 131 | embeddings /= torch.norm(embeddings, dim=1, keepdim=True) 132 | else: 133 | warnings.warn( 134 | "sphereize==True but only a single input sentence was passed." 
135 | " Inputs will not be sphereized." 136 | ) 137 | 138 | return embeddings.numpy() 139 | -------------------------------------------------------------------------------- /tests/common/test_contrastive_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import List, Union 3 | 4 | import pytest 5 | from declutr.common.contrastive_utils import sample_anchor_positive_pairs 6 | from hypothesis import given 7 | from hypothesis.strategies import integers, sampled_from 8 | from transformers import AutoTokenizer 9 | 10 | 11 | class TestContrastiveUtils: 12 | def tokenize(self, text) -> List[str]: 13 | return text.split() 14 | 15 | @given( 16 | num_anchors=integers(min_value=1, max_value=4), 17 | num_positives=integers(min_value=1, max_value=4), 18 | sampling_strategy=sampled_from(["subsuming", "adjacent", None]), 19 | ) 20 | def test_sample_spans( 21 | self, 22 | inputs: List[str], 23 | num_anchors: int, 24 | num_positives: int, 25 | sampling_strategy: Union[str, None], 26 | ) -> None: 27 | 28 | for text in inputs: 29 | tokens = self.tokenize(text) 30 | num_tokens = len(tokens) 31 | 32 | # Really short examples make the tests unreliable. 33 | if num_tokens < 7: 34 | continue 35 | 36 | # These represent sensible defaults 37 | max_span_len = num_tokens // 4 38 | min_span_len = random.randint(1, max_span_len) if max_span_len > 1 else 1 39 | 40 | if num_tokens < num_anchors * max_span_len * 2: 41 | with pytest.raises(ValueError): 42 | _, _ = sample_anchor_positive_pairs( 43 | text, 44 | num_anchors=num_anchors, 45 | num_positives=num_positives, 46 | max_span_len=max_span_len, 47 | min_span_len=min_span_len, 48 | sampling_strategy=sampling_strategy, 49 | ) 50 | else: 51 | anchors, positives = sample_anchor_positive_pairs( 52 | text, 53 | num_anchors=num_anchors, 54 | num_positives=num_positives, 55 | max_span_len=max_span_len, 56 | min_span_len=min_span_len, 57 | sampling_strategy=sampling_strategy, 58 | ) 59 | assert len(anchors) == num_anchors 60 | assert len(positives) == num_anchors * num_positives 61 | for i, anchor in enumerate(anchors): 62 | # Several simple checks for valid anchors. 63 | anchor_tokens = self.tokenize(anchor) 64 | anchor_length = len(anchor_tokens) 65 | assert anchor_length <= max_span_len 66 | assert anchor_length >= min_span_len 67 | # The tokenization process may lead to certain characters (such as escape 68 | # characters) being dropped, so repeat the tokenization process before 69 | # performing this check (otherwise a bunch of tests fail). 70 | assert anchor in " ".join(tokens) 71 | for j in range(i * num_positives, i * num_positives + num_positives): 72 | # Several simple checks for valid positives. 73 | positive = positives[j] 74 | positive_tokens = self.tokenize(positive) 75 | positive_length = len(positive_tokens) 76 | assert positive_length <= max_span_len 77 | assert positive_length >= min_span_len 78 | assert positive in " ".join(tokens) 79 | # Test that specific sampling strategies are obeyed. 
80 | if sampling_strategy == "subsuming": 81 | assert positive in " ".join(anchor_tokens) 82 | elif sampling_strategy == "adjacent": 83 | assert positive not in " ".join(anchor_tokens) 84 | 85 | @given( 86 | num_anchors=integers(min_value=1, max_value=4), 87 | num_positives=integers(min_value=1, max_value=4), 88 | ) 89 | def test_sample_spans_raises_value_error_invalid_min_span_length( 90 | self, num_anchors: int, num_positives: int 91 | ) -> None: 92 | text = "They may take our lives, but they'll never take our freedom!" 93 | num_tokens = len(self.tokenize(text)) 94 | 95 | max_span_len = num_tokens - 1 # This is guaranteed to be valid. 96 | min_span_len = max_span_len + 1 # This is guaranteed to be invalid. 97 | 98 | with pytest.raises(ValueError): 99 | _, _ = sample_anchor_positive_pairs( 100 | text, 101 | num_anchors=num_anchors, 102 | num_positives=num_positives, 103 | max_span_len=max_span_len, 104 | min_span_len=min_span_len, 105 | ) 106 | 107 | @given( 108 | num_anchors=integers(min_value=1, max_value=4), 109 | num_positives=integers(min_value=1, max_value=4), 110 | ) 111 | def test_sample_spans_raises_value_error_invalid_max_span_length( 112 | self, num_anchors: int, num_positives: int 113 | ) -> None: 114 | text = "They may take our lives, but they'll never take our freedom!" 115 | num_tokens = len(self.tokenize(text)) 116 | 117 | max_span_len = num_tokens + 1 # This is guaranteed to be invalid. 118 | min_span_len = max_span_len - 1 # This is guaranteed to be valid. 119 | 120 | with pytest.raises(ValueError): 121 | _, _ = sample_anchor_positive_pairs( 122 | text, 123 | num_anchors=num_anchors, 124 | num_positives=num_positives, 125 | max_span_len=max_span_len, 126 | min_span_len=min_span_len, 127 | ) 128 | 129 | def test_sample_spans_with_hf_tokenizer(self): 130 | text = "They may take our lives, but they'll never take our freedom!" 131 | tokenizer = AutoTokenizer.from_pretrained("distilroberta-base") 132 | num_tokens = len(tokenizer(text)) 133 | 134 | # Arbitrary but valid choices 135 | max_span_len = num_tokens 136 | min_span_len = max_span_len - 1 137 | 138 | anchors, positives = sample_anchor_positive_pairs( 139 | text, 140 | num_anchors=1, 141 | num_positives=1, 142 | max_span_len=max_span_len, 143 | min_span_len=min_span_len, 144 | tokenizer=tokenizer.tokenize, 145 | ) 146 | 147 | for anchor in anchors: 148 | tokens = anchor.split() 149 | assert tokenizer.convert_tokens_to_string(tokens) in text 150 | for positive in positives: 151 | tokens = positive.split() 152 | assert tokenizer.convert_tokens_to_string(tokens) in text 153 | -------------------------------------------------------------------------------- /declutr/common/contrastive_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List, Optional, Tuple 2 | 3 | import numpy as np 4 | 5 | from allennlp.common.logging import AllenNlpLogger 6 | 7 | logger = AllenNlpLogger(__name__) 8 | 9 | 10 | def sample_anchor_positive_pairs( 11 | text: str, 12 | num_anchors: int, 13 | num_positives: int, 14 | max_span_len: int, 15 | min_span_len: int, 16 | sampling_strategy: Optional[str] = None, 17 | tokenizer: Optional[Callable[[str], List[str]]] = None, 18 | ) -> Tuple[List[str], List[str]]: 19 | """Returns a `Tuple` of `List`s, containing `num_anchors` anchor spans and `num_positives` 20 | positive spans sampled from `text`. 21 | 22 | # Parameters 23 | 24 | text : `str`, required 25 | The string to extract anchor and positive spans from. 
26 | num_anchors : `int`, required 27 | The number of spans to sample from `text` to serve as anchors. 28 | num_positives : `int`, required 29 | The number of spans to sample from `text` to serve as positives (per anchor). 30 | max_span_len : `int`, required 31 | The maximum length of spans, after tokenization, to sample. 32 | min_span_len : `int`, required 33 | The minimum length of spans, after tokenization, to sample. 34 | sampling_strategy : `str`, optional (default = `None`) 35 | One of `"subsuming"` or `"adjacent"`. If `"subsuming"`, positive spans are always subsumed 36 | by the anchor. If `"adjacent"`, positive spans are always adjacent to the anchor. If not 37 | provided, positives may be subsumed, adjacent to, or overlapping with the anchor. 38 | tokenizer : `Callable`, optional (default = `None`) 39 | Optional tokenizer to use before sampling spans. If `None`, `text.split()` is used. 40 | """ 41 | # Tokenize the incoming text. Whitespace tokenization is much more straightforward 42 | # (we don't need to worry about chopping up subword tokens), but a user can also provide 43 | # their own tokenization scheme if they want. 44 | tokens = tokenizer(text) if tokenizer is not None else text.split() 45 | tok_method = "tokenizer(text)" if tokenizer else "text.split()" 46 | num_tokens = len(tokens) 47 | 48 | if num_tokens < num_anchors * max_span_len * 2: 49 | raise ValueError( 50 | f"len({tok_method}) should be at least {num_anchors * max_span_len * 2}" 51 | f" (num_anchors * max_span_len * 2), got {num_tokens}." 52 | ) 53 | if min_span_len > max_span_len: 54 | raise ValueError( 55 | f"min_span_len must be less than max_span_len ({max_span_len}), got {min_span_len}." 56 | ) 57 | if max_span_len > num_tokens: 58 | raise ValueError( 59 | ( 60 | f"max_span_len must be less than or equal to" 61 | f" len({tok_method}) ({num_tokens}), got {max_span_len}." 62 | ) 63 | ) 64 | 65 | # Valid anchor starts are token indices which begin a token span of at least max_span_len. 66 | anchors, positives = [], [] 67 | valid_anchor_starts = list(range(0, num_tokens - max_span_len + 1, max_span_len)) 68 | for i in range(num_anchors): 69 | # Sample the anchor length from a beta distribution skewed towards longer spans, the 70 | # intuition being that longer spans have the best chance of being representative of the 71 | # document they are sampled from. 72 | anchor_len = int(np.random.beta(4, 2) * (max_span_len - min_span_len) + min_span_len) 73 | # This check prevents an edge case were we run out of valid_anchor_starts. 74 | if len(valid_anchor_starts) // (num_anchors - i) < num_anchors - i: 75 | anchor_start_idx = np.random.choice([0, len(valid_anchor_starts) - 1]) 76 | else: 77 | anchor_start_idx = np.random.randint(len(valid_anchor_starts)) 78 | # When num_anchors = 1, this is equivalent to uniformly sampling that starting position. 79 | anchor_start = np.random.randint( 80 | valid_anchor_starts[anchor_start_idx], 81 | # randint is high-exclusive 82 | valid_anchor_starts[anchor_start_idx] + max_span_len - anchor_len + 1, 83 | ) 84 | # Once sampled, remove an anchor (and its immediate neighbours) from consideration. 85 | del valid_anchor_starts[max(0, anchor_start_idx - 1) : anchor_start_idx + 2] 86 | anchor_end = anchor_start + anchor_len 87 | anchors.append(" ".join(tokens[anchor_start:anchor_end])) 88 | 89 | # Sample positives from around the anchor. The intuition being that text that appears 90 | # close together is the same document is likely to be semantically similar. 
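        # (Each positive sampled below respects `sampling_strategy`: it is strictly contained in
        # the anchor for "subsuming", borders the anchor for "adjacent", and may fall anywhere
        # around or overlapping the anchor when no strategy is given.)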
91 | for _ in range(num_positives): 92 | # A user can specify a subsuming or adjacent only sampling strategy. 93 | if sampling_strategy == "subsuming": 94 | # To be strictly subsuming, we cannot allow the positive_len > anchor_len. 95 | positive_len = int( 96 | np.random.beta(2, 4) * (anchor_len - min_span_len) + min_span_len 97 | ) 98 | # randint is high-exclusive 99 | positive_start = np.random.randint(anchor_start, anchor_end - positive_len + 1) 100 | elif sampling_strategy == "adjacent": 101 | # Restrict positives to a length that will allow them to be adjacent to the anchor 102 | # without running off the edge of the document. If the anchor has sufficent room on 103 | # either side, this won't be a problem and max_positive_len will equal max_span_len. 104 | max_positive_len = min(max_span_len, max(anchor_start, num_tokens - anchor_end)) 105 | if max_positive_len < max_span_len: 106 | logger.warning_once( 107 | ( 108 | "There is no room to sample an adjacent positive span. Temporarily" 109 | " reducing the maximum span length of positives. This message will not" 110 | " be displayed again." 111 | ) 112 | ) 113 | positive_len = int( 114 | np.random.beta(2, 4) * (max_positive_len - min_span_len) + min_span_len 115 | ) 116 | # There are two types of adjacent positives, those that border the beginning of the 117 | # anchor and those that border the end. The checks above guarantee at least one of 118 | # these is valid. Here we just choose from the valid positive starts at random. 119 | valid_starts = [] 120 | if anchor_start - positive_len > 0: 121 | valid_starts.append(anchor_start - positive_len) 122 | if anchor_end + positive_len <= num_tokens: 123 | valid_starts.append(anchor_end) 124 | positive_start = np.random.choice(valid_starts) 125 | else: 126 | # Sample positive length from a beta distribution skewed towards shorter spans. The 127 | # idea is to promote diversity and minimize the amount of overlapping text. 128 | positive_len = int( 129 | np.random.beta(2, 4) * (max_span_len - min_span_len) + min_span_len 130 | ) 131 | # By default, spans may be adjacent or overlap with each other and the anchor. 132 | # Careful not to run off the edges of the document (this error may pass silently). 
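                # (Concretely, positive_start is drawn uniformly from the inclusive range
                # [max(0, anchor_start - positive_len), min(anchor_end, num_tokens - positive_len)],
                # so a positive may begin before the anchor, inside it, or right where it ends.)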
133 | positive_start = np.random.randint( 134 | max(0, anchor_start - positive_len), 135 | min(anchor_end, num_tokens - positive_len) + 1, # randint is high-exclusive 136 | ) 137 | 138 | positive_end = positive_start + positive_len 139 | positives.append(" ".join(tokens[positive_start:positive_end])) 140 | 141 | return anchors, positives 142 | -------------------------------------------------------------------------------- /notebooks/training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "accelerator": "GPU", 6 | "colab": { 7 | "name": "training.ipynb", 8 | "private_outputs": true, 9 | "provenance": [], 10 | "collapsed_sections": [] 11 | }, 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.8.5" 28 | } 29 | }, 30 | "cells": [ 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "I8jt6ML03DS5" 35 | }, 36 | "source": [ 37 | "# Training your own model\n", 38 | "\n", 39 | "This notebook will walk you through training your own model using [DeCLUTR](https://github.com/JohnGiorgi/DeCLUTR)." 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "id": "SU3Iod2-g0-o" 46 | }, 47 | "source": [ 48 | "## 🔧 Install the prerequisites" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "metadata": { 54 | "id": "sr4r5pN40Kli" 55 | }, 56 | "source": [ 57 | "!pip install git+https://github.com/JohnGiorgi/DeCLUTR.git" 58 | ], 59 | "execution_count": null, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "id": "Zog7ApwuUD7_" 66 | }, 67 | "source": [ 68 | "## 📖 Preparing a dataset" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "uwnLpUmN4Art" 75 | }, 76 | "source": [ 77 | "\n", 78 | "A dataset is simply a file containing one item of text (a document, a scientific paper, etc.) per line. For demonstration purposes, we have provided a script that will download the [WikiText-103](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/) dataset and format it for training with our method.\n", 79 | "\n", 80 | "The only \"gotcha\" is that each piece of text needs to be long enough so that we can sample spans from it. In general, you should collect documents of a minimum length according to the following:\n", 81 | "\n", 82 | "```python\n", 83 | "min_length = num_anchors * max_span_len * 2\n", 84 | "```\n", 85 | "\n", 86 | "In our paper, we set `num_anchors=2` and `max_span_len=512`, so we require documents of `min_length=2048`. 
We simply need to provide this value as an argument when running the script:" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "metadata": { 92 | "id": "q0fwnwq23aAZ" 93 | }, 94 | "source": [ 95 | "import os\n", 96 | "\n", 97 | "train_data_path = \"wikitext_103/train.txt\"\n", 98 | "min_length = 2048\n", 99 | "\n", 100 | "!wget -nc https://raw.githubusercontent.com/JohnGiorgi/DeCLUTR/master/scripts/preprocess_wikitext_103.py\n", 101 | "!python preprocess_wikitext_103.py $train_data_path --min-length $min_length" 102 | ], 103 | "execution_count": null, 104 | "outputs": [] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": { 109 | "id": "yUEFeupP6qy-" 110 | }, 111 | "source": [ 112 | "Lets confirm that our dataset looks as expected." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "metadata": { 118 | "id": "K7ffGXCn7Cpq" 119 | }, 120 | "source": [ 121 | "!wc -l $train_data_path # This should be approximately 17.8K lines" 122 | ], 123 | "execution_count": null, 124 | "outputs": [] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "metadata": { 129 | "id": "10DprWZc9iV6" 130 | }, 131 | "source": [ 132 | "!head -n 1 $train_data_path # This should be a single Wikipedia entry" 133 | ], 134 | "execution_count": null, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "id": "VKYdambZ59nM" 141 | }, 142 | "source": [ 143 | "## 🏃 Training the model\n", 144 | "\n", 145 | "Once you have collected the dataset, you can easily initiate a training session with the `allennlp train` command. An experiment is configured using a [Jsonnet](https://jsonnet.org/) config file. Lets take a look at the config for the DeCLUTR-small model presented in [our paper](https://arxiv.org/abs/2006.03659):" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "metadata": { 151 | "id": "xTaSExh4ba8e" 152 | }, 153 | "source": [ 154 | "!wget -nc https://raw.githubusercontent.com/JohnGiorgi/DeCLUTR/master/training_config/declutr_small.jsonnet\n", 155 | "with open(\"declutr_small.jsonnet\", \"r\") as f:\n", 156 | " print(f.read())" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": { 164 | "id": "-f1HqWSscWOx" 165 | }, 166 | "source": [ 167 | "\n", 168 | "The only thing to configure is the path to the training set (`train_data_path`), which can be passed to `allennlp train` via the `--overrides` argument (but you can also provide it in your config file directly, if you prefer):" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "metadata": { 174 | "id": "YS9VuxESBcr3" 175 | }, 176 | "source": [ 177 | "overrides = (\n", 178 | " f\"{{'train_data_path': '{train_data_path}', \"\n", 179 | " # lower the batch size to be able to train on Colab GPUs\n", 180 | " \"'data_loader.batch_size': 2, \"\n", 181 | " # training examples / batch size. 
Not required, but gives us a more informative progress bar during training\n", 182 | "'data_loader.batches_per_epoch': 8912}\"\n", 183 | ")" 184 | ], 185 | "execution_count": null, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "metadata": { 191 | "id": "2v4tiiXgBC2M" 192 | }, 193 | "source": [ 194 | "overrides" 195 | ], 196 | "execution_count": null, 197 | "outputs": [] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "metadata": { 202 | "id": "Db_cNfZ76KRf" 203 | }, 204 | "source": [ 205 | "!allennlp train \"declutr_small.jsonnet\" \\\n", 206 | " --serialization-dir \"output\" \\\n", 207 | " --overrides \"$overrides\" \\\n", 208 | " --include-package \"declutr\" \\\n", 209 | " -f" 210 | ], 211 | "execution_count": null, 212 | "outputs": [] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": { 217 | "id": "Qsbr6OMv16GQ" 218 | }, 219 | "source": [ 220 | "### 🤗 Exporting a trained model to HuggingFace Transformers\n", 221 | "\n", 222 | "We have provided a simple script to export a trained model so that it can be loaded with [Hugging Face Transformers](https://github.com/huggingface/transformers)." 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "metadata": { 228 | "id": "KqmWVD0y16GQ" 229 | }, 230 | "source": [ 231 | "!wget -nc https://raw.githubusercontent.com/JohnGiorgi/DeCLUTR/master/scripts/save_pretrained_hf.py\n", 232 | "!python save_pretrained_hf.py --archive-file \"output\" --save-directory \"output_transformers\"" 233 | ], 234 | "execution_count": null, 235 | "outputs": [] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": { 240 | "id": "N0-NTFaH16GQ" 241 | }, 242 | "source": [ 243 | "The model, saved to `--save-directory`, can then be loaded using the Hugging Face Transformers library.\n", 244 | "\n", 245 | "> See the [embedding notebook](https://colab.research.google.com/github/JohnGiorgi/DeCLUTR/blob/master/notebooks/embedding.ipynb) for more details on using trained models." 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "metadata": { 251 | "id": "pAl1zIya16GQ" 252 | }, 253 | "source": [ 254 | "from transformers import AutoModel, AutoTokenizer\n", 255 | " \n", 256 | "tokenizer = AutoTokenizer.from_pretrained(\"output_transformers\")\n", 257 | "model = AutoModel.from_pretrained(\"output_transformers\")" 258 | ], 259 | "execution_count": null, 260 | "outputs": [] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": { 265 | "id": "mzQ0G4rp16GQ" 266 | }, 267 | "source": [ 268 | "> If you would like to upload your model to the Hugging Face model repository, follow the instructions [here](https://huggingface.co/transformers/model_sharing.html)." 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": { 274 | "id": "eD5dZo18EE-S" 275 | }, 276 | "source": [ 277 | "## ♻️ Conclusion\n", 278 | "\n", 279 | "That's it! In this notebook, we covered how to collect data for training the model, and specifically how _long_ that text needs to be. We then briefly covered configuring and running a training session. Please see [our paper](https://arxiv.org/abs/2006.03659) and [repo](https://github.com/JohnGiorgi/DeCLUTR) for more details, and don't hesitate to open an issue if you have any trouble!"
280 | ] 281 | } 282 | ] 283 | } -------------------------------------------------------------------------------- /declutr/dataset_reader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | from contextlib import contextmanager 4 | from typing import Any, Dict, Iterable, Iterator, List 5 | 6 | from allennlp.common.file_utils import cached_path 7 | from allennlp.data.dataset_readers import DatasetReader 8 | from allennlp.data.fields import Field, ListField, TextField 9 | from allennlp.data.instance import Instance 10 | from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer 11 | from allennlp.data.tokenizers import PretrainedTransformerTokenizer, SpacyTokenizer, Tokenizer 12 | from overrides import overrides 13 | 14 | from declutr.common.contrastive_utils import sample_anchor_positive_pairs 15 | from declutr.common.util import sanitize_text 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | @DatasetReader.register("declutr") 21 | class DeCLUTRDatasetReader(DatasetReader): 22 | """ 23 | Read a text file containing one instance per line, and create a dataset suitable for a 24 | `DeCLUTR` model. 25 | 26 | The output of `read` is a list of `Instance` s with the field: 27 | tokens : `ListField[TextField]` 28 | if `num_anchors > 0`, else: 29 | tokens : `TextField` 30 | 31 | Registered as a `DatasetReader` with name "declutr". 32 | 33 | # Parameters 34 | 35 | tokenizer : `Tokenizer`, optional (default = `{"tokens": SpacyTokenizer()}`) 36 | Tokenizer to use to split the input text into words or other kinds of tokens. 37 | token_indexers : `Dict[str, TokenIndexer]`, optional 38 | We use this to define the input representation for the text. See :class:`TokenIndexer`. 39 | num_anchors : `int`, optional 40 | The number of spans to sample from each instance to serve as anchors. 41 | num_positives : `int`, optional 42 | The number of spans to sample from each instance to serve as positive examples (per anchor). 43 | Has no effect if `num_anchors` is not provided. 44 | max_span_len : `int`, optional 45 | The maximum length of spans (after tokenization) which should be sampled. Has no effect if 46 | `num_anchors` is not provided. 47 | min_span_len : `int`, optional 48 | The minimum length of spans (after tokenization) which should be sampled. Has no effect if 49 | `num_anchors` is not provided. 50 | sampling_strategy : `str`, optional (default = None) 51 | One of "subsuming" or "adjacent". If "subsuming," positive spans are always subsumed by the 52 | anchor. If "adjacent", positive spans are always adjacent to the anchor. If not provided, 53 | positives may be subsumed, adjacent to, or overlapping with the anchor. Has no effect if 54 | `num_anchors` is not provided. 55 | """ 56 | 57 | def __init__( 58 | self, 59 | tokenizer: Tokenizer = None, 60 | token_indexers: Dict[str, TokenIndexer] = None, 61 | num_anchors: int = None, 62 | num_positives: int = None, 63 | max_span_len: int = None, 64 | min_span_len: int = None, 65 | sampling_strategy: str = None, 66 | **kwargs, 67 | ) -> None: 68 | super().__init__(**kwargs) 69 | self._tokenizer = tokenizer or SpacyTokenizer() 70 | self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} 71 | 72 | # If the user provided us with a number of anchors to sample, we automatically 73 | # check that the other expected values are provided and valid. 
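# A note on scale (cross-referencing the training notebook in this repo): each document needs
# roughly num_anchors * max_span_len * 2 tokens so that anchors and their positives can be
# sampled without running off the end of the text. For example, num_anchors=2 with
# max_span_len=512 implies a minimum document length of about 2048 tokens.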
74 | if num_anchors is not None: 75 | self._num_anchors = num_anchors 76 | self.sample_spans = True 77 | if num_positives is None: 78 | raise ValueError("num_positives must be provided if num_anchors is not None.") 79 | if max_span_len is None: 80 | raise ValueError("max_span_len must be provided if num_anchors is not None.") 81 | if min_span_len is None: 82 | raise ValueError("min_span_len must be provided if num_anchors is not None.") 83 | self._num_positives = num_positives 84 | self._max_span_len = max_span_len 85 | self._min_span_len = min_span_len 86 | self._sampling_strategy = ( 87 | sampling_strategy.lower() if sampling_strategy is not None else sampling_strategy 88 | ) 89 | if ( 90 | self.sample_spans 91 | and self._sampling_strategy is not None 92 | and self._sampling_strategy not in ["subsuming", "adjacent"] 93 | ): 94 | raise ValueError( 95 | ( 96 | 'sampling_strategy must be one of ["subsuming", "adjacent"].' 97 | f" Got {self._sampling_strategy}." 98 | ) 99 | ) 100 | else: 101 | self.sample_spans = False 102 | 103 | @property 104 | def sample_spans(self) -> bool: 105 | return self._sample_spans 106 | 107 | @sample_spans.setter 108 | def sample_spans(self, sample_spans: bool) -> None: 109 | self._sample_spans = sample_spans 110 | 111 | @contextmanager 112 | def no_sample(self) -> Iterator[None]: 113 | """A context manager that temporarily disables sampling of spans. Useful at test time when 114 | we want to embed unseen text. 115 | """ 116 | prev = self.sample_spans 117 | self.sample_spans = False 118 | yield 119 | self.sample_spans = prev 120 | 121 | @overrides 122 | def _read(self, file_path: str) -> Iterable[Instance]: 123 | # if `file_path` is a URL, redirect to the cache 124 | file_path = cached_path(file_path) 125 | 126 | with open(file_path, "r") as data_file: 127 | logger.info("Reading instances from lines in file at: %s", file_path) 128 | 129 | # If we are sampling spans (i.e. we are training) we need to shuffle the data so that 130 | # we don't yield instances in the same order every epoch. Our current solution is to 131 | # read the entire file into memory. This is a little expensive (roughly 1G per 1 million 132 | # docs), so a better solution might be required down the line. 133 | data: Iterable[Any] = [] 134 | if self.sample_spans: 135 | data = list(enumerate(data_file)) 136 | random.shuffle(data) 137 | data = iter(data) 138 | else: 139 | data = enumerate(data_file) 140 | 141 | for _, text in data: 142 | yield self.text_to_instance(text) 143 | 144 | @overrides 145 | def text_to_instance(self, text: str) -> Instance: # type: ignore 146 | """ 147 | # Parameters 148 | 149 | text : `str`, required. 150 | The text to process. 151 | 152 | # Returns 153 | 154 | An `Instance` containing the following fields: 155 | - anchors (`Union[TextField, ListField[TextField]]`) : 156 | If `self.sample_spans`, this will be a `ListField[TextField]` object, containing 157 | each anchor span sampled from `text`. Otherwise, this will be a `TextField` object 158 | containing the tokenized `text`. 159 | - positives (`ListField[TextField]`) : 160 | If `self.sample_spans`, this will be a `ListField[TextField]` object, containing 161 | each positive span sampled from `text`. Otherwise this field will not be included 162 | in the returned `Instance`. 163 | """ 164 | # Some very minimal preprocessing to remove whitespace, newlines and tabs. 165 | # We peform it here as it will cover both training and predicting with the model. 
166 | # We DON'T lowercase by default, but rather allow `self._tokenizer` to decide. 167 | text = sanitize_text(text, lowercase=False) 168 | 169 | fields: Dict[str, Field] = {} 170 | if self.sample_spans: 171 | if isinstance(self._tokenizer, PretrainedTransformerTokenizer): 172 | # We add a space in front of the text in order to achieve consistent tokenization with 173 | # certain tokenizers, e.g. the BPE tokenizer used by RoBERTa, GPT and others. 174 | # See: https://github.com/huggingface/transformers/issues/1196 175 | text = f" {text.lstrip()}" 176 | tokenization_func = self._tokenizer.tokenizer.tokenize 177 | # A call to the `tokenize` method of the AllenNLP tokenizer causes 178 | # subsequent calls to the underlying HuggingFace Tokenizer (if `use_fast`) 179 | # to truncate text. Reset the truncation each time here. 180 | # Note this only appears to happen for transformers<3.1 181 | if self._tokenizer.tokenizer.is_fast: 182 | self._tokenizer.tokenizer._tokenizer.no_truncation() 183 | else: 184 | tokenization_func = None 185 | # Choose the anchor/positives at random. 186 | anchor_spans, positive_spans = sample_anchor_positive_pairs( 187 | text=text, 188 | num_anchors=self._num_anchors, 189 | num_positives=self._num_positives, 190 | max_span_len=self._max_span_len, 191 | min_span_len=self._min_span_len, 192 | sampling_strategy=self._sampling_strategy, 193 | tokenizer=tokenization_func, 194 | ) 195 | 196 | anchors: List[Field] = [] 197 | for span in anchor_spans: 198 | # Sampled spans have already been tokenized and joined by whitespace. 199 | # We need to convert them back to a string to use the AllenNLP tokenizer. 200 | # It would be simpler to use convert_tokens_to_string, but we can't guarantee 201 | # this method is implemented for all HuggingFace Tokenizers. 202 | anchor_text = self._tokenizer.tokenizer.decode( 203 | self._tokenizer.tokenizer.convert_tokens_to_ids(span.split()) 204 | ) 205 | tokens = self._tokenizer.tokenize(anchor_text) 206 | anchors.append(TextField(tokens, self._token_indexers)) 207 | fields["anchors"] = ListField(anchors) 208 | positives: List[Field] = [] 209 | for span in positive_spans: 210 | positive_text = self._tokenizer.tokenizer.decode( 211 | self._tokenizer.tokenizer.convert_tokens_to_ids(span.split()) 212 | ) 213 | tokens = self._tokenizer.tokenize(positive_text) 214 | positives.append(TextField(tokens, self._token_indexers)) 215 | fields["positives"] = ListField(positives) 216 | else: 217 | tokens = self._tokenizer.tokenize(text) 218 | fields["anchors"] = TextField(tokens, self._token_indexers) 219 | return Instance(fields) 220 | -------------------------------------------------------------------------------- /declutr/modules/token_embedders/pretrained_transformer_embedder_mlm.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple, Union 2 | 3 | import torch 4 | from allennlp.data.tokenizers import PretrainedTransformerTokenizer 5 | from allennlp.modules.scalar_mix import ScalarMix 6 | from allennlp.modules.token_embedders import PretrainedTransformerEmbedder 7 | from allennlp.modules.token_embedders.token_embedder import TokenEmbedder 8 | from overrides import overrides 9 | from transformers import AutoConfig, AutoModelForMaskedLM 10 | 11 | 12 | @TokenEmbedder.register("pretrained_transformer_mlm") 13 | class PretrainedTransformerEmbedderMLM(PretrainedTransformerEmbedder): 14 | """ 15 | This is a wrapper around `PretrainedTransformerEmbedder` that allows us to train against
a 16 | masked language modelling objective while we are embedding text. 17 | 18 | Registered as a `TokenEmbedder` with name "pretrained_transformer_mlm". 19 | 20 | # Parameters 21 | 22 | model_name : `str` 23 | The name of the `transformers` model to use. Should be the same as the corresponding 24 | `PretrainedTransformerIndexer`. 25 | max_length : `int`, optional (default = `None`) 26 | If positive, folds input token IDs into multiple segments of this length, pass them 27 | through the transformer model independently, and concatenate the final representations. 28 | Should be set to the same value as the `max_length` option on the 29 | `PretrainedTransformerIndexer`. 30 | sub_module: `str`, optional (default = `None`) 31 | The name of a submodule of the transformer to be used as the embedder. Some transformers naturally act 32 | as embedders such as BERT. However, other models consist of encoder and decoder, in which case we just 33 | want to use the encoder. 34 | train_parameters: `bool`, optional (default = `True`) 35 | If this is `True`, the transformer weights get updated during training. 36 | last_layer_only: `bool`, optional (default = `True`) 37 | When `True` (the default), only the final layer of the pretrained transformer is taken 38 | for the embeddings. But if set to `False`, a scalar mix of all of the layers 39 | is used. 40 | gradient_checkpointing: `bool`, optional (default = `None`) 41 | Enable or disable gradient checkpointing. 42 | tokenizer_kwargs: `Dict[str, Any]`, optional (default = `None`) 43 | Dictionary with 44 | [additional arguments](https://github.com/huggingface/transformers/blob/155c782a2ccd103cf63ad48a2becd7c76a7d2115/transformers/tokenization_utils.py#L691) 45 | for `AutoTokenizer.from_pretrained`. 46 | transformer_kwargs: `Dict[str, Any]`, optional (default = `None`) 47 | Dictionary with 48 | [additional arguments](https://github.com/huggingface/transformers/blob/155c782a2ccd103cf63ad48a2becd7c76a7d2115/transformers/modeling_utils.py#L253) 49 | for `AutoModel.from_pretrained`. 50 | masked_language_modeling: `bool`, optional (default = `True`) 51 | If this is `True` and `masked_lm_labels is not None` in the call to `forward`, the model 52 | will be trained against a masked language modelling objective and the resulting loss will 53 | be returned along with the output tensor. 54 | """ # noqa: E501 55 | 56 | def __init__( 57 | self, 58 | model_name: str, 59 | *, 60 | max_length: int = None, 61 | sub_module: str = None, 62 | train_parameters: bool = True, 63 | last_layer_only: bool = True, 64 | override_weights_file: Optional[str] = None, 65 | override_weights_strip_prefix: Optional[str] = None, 66 | gradient_checkpointing: Optional[bool] = None, 67 | tokenizer_kwargs: Optional[Dict[str, Any]] = None, 68 | transformer_kwargs: Optional[Dict[str, Any]] = None, 69 | masked_language_modeling: bool = True, 70 | ) -> None: 71 | TokenEmbedder.__init__(self) # Call the base class constructor 72 | tokenizer = PretrainedTransformerTokenizer(model_name, tokenizer_kwargs=tokenizer_kwargs) 73 | self.masked_language_modeling = masked_language_modeling 74 | 75 | if self.masked_language_modeling: 76 | self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True) 77 | # We only need access to the HF tokenizer if we are masked language modeling 78 | self.tokenizer = tokenizer.tokenizer 79 | # The only differences when masked language modeling are: 80 | # 1) `output_hidden_states` must be True to get access to token embeddings. 
81 | # 2) We need to use `AutoModelForMaskedLM` to get the correct model 82 | self.transformer_model = AutoModelForMaskedLM.from_pretrained( 83 | model_name, config=self.config, **(transformer_kwargs or {}) 84 | ) 85 | # Eveything after the if statement (including the else) is copied directly from: 86 | # https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/pretrained_transformer_embedder.py 87 | else: 88 | from allennlp.common import cached_transformers 89 | 90 | self.transformer_model = cached_transformers.get( 91 | model_name, True, override_weights_file, override_weights_strip_prefix 92 | ) 93 | self.config = self.transformer_model.config 94 | 95 | if gradient_checkpointing is not None: 96 | self.transformer_model.config.update({"gradient_checkpointing": gradient_checkpointing}) 97 | 98 | if sub_module: 99 | assert hasattr(self.transformer_model, sub_module) 100 | self.transformer_model = getattr(self.transformer_model, sub_module) 101 | self._max_length = max_length 102 | 103 | # I'm not sure if this works for all models; open an issue on github if you find a case 104 | # where it doesn't work. 105 | self.output_dim = self.config.hidden_size 106 | 107 | self._scalar_mix: Optional[ScalarMix] = None 108 | if not last_layer_only: 109 | self._scalar_mix = ScalarMix(self.config.num_hidden_layers) 110 | self.config.output_hidden_states = True 111 | 112 | self._num_added_start_tokens = len(tokenizer.single_sequence_start_tokens) 113 | self._num_added_end_tokens = len(tokenizer.single_sequence_end_tokens) 114 | self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens 115 | 116 | if not train_parameters: 117 | for param in self.transformer_model.parameters(): 118 | param.requires_grad = False 119 | 120 | @overrides 121 | def forward( 122 | self, 123 | token_ids: torch.LongTensor, 124 | mask: torch.BoolTensor, 125 | type_ids: Optional[torch.LongTensor] = None, 126 | segment_concat_mask: Optional[torch.BoolTensor] = None, 127 | masked_lm_labels: Optional[torch.LongTensor] = None, 128 | ) -> Union[Tuple[torch.FloatTensor, torch.Tensor], torch.Tensor]: # type: ignore 129 | """ 130 | # Parameters 131 | 132 | token_ids: `torch.LongTensor` 133 | Shape: `[batch_size, num_wordpieces if max_length is None else num_segment_concat_wordpieces]`. 134 | num_segment_concat_wordpieces is num_wordpieces plus special tokens inserted in the 135 | middle, e.g. the length of: "[CLS] A B C [SEP] [CLS] D E F [SEP]" (see indexer logic). 136 | mask: `torch.BoolTensor` 137 | Shape: [batch_size, num_wordpieces]. 138 | type_ids: `Optional[torch.LongTensor]` 139 | Shape: `[batch_size, num_wordpieces if max_length is None else num_segment_concat_wordpieces]`. 140 | segment_concat_mask: `Optional[torch.BoolTensor]` 141 | Shape: `[batch_size, num_segment_concat_wordpieces]`. 142 | masked_lm_labels: `Optional[torch.LongTensor]` 143 | Shape: `[batch_size, num_wordpieces]`. 144 | 145 | # Returns: 146 | 147 | If `self.masked_language_modeling`, returns a `Tuple` of the masked language modeling loss 148 | and a `torch.Tensor` of shape: `[batch_size, num_wordpieces, embedding_size]`. Otherwise, 149 | returns only the `torch.Tensor` of shape: `[batch_size, num_wordpieces, embedding_size]`. 150 | """ 151 | # Some of the huggingface transformers don't support type ids at all and crash when you supply 152 | # them. For others, you can supply a tensor of zeros, and if you don't, they act as if you did. 
153 | # There is no practical difference to the caller, so here we pretend that one case is the same 154 | # as another case. 155 | if type_ids is not None: 156 | max_type_id = type_ids.max() 157 | if max_type_id == 0: 158 | type_ids = None 159 | else: 160 | if max_type_id >= self._number_of_token_type_embeddings(): 161 | raise ValueError("Found type ids too large for the chosen transformer model.") 162 | assert token_ids.shape == type_ids.shape 163 | 164 | fold_long_sequences = self._max_length is not None and token_ids.size(1) > self._max_length 165 | if fold_long_sequences: 166 | batch_size, num_segment_concat_wordpieces = token_ids.size() 167 | token_ids, segment_concat_mask, type_ids = self._fold_long_sequences( 168 | token_ids, segment_concat_mask, type_ids 169 | ) 170 | 171 | transformer_mask = segment_concat_mask if self._max_length is not None else mask 172 | # Shape: [batch_size, num_wordpieces, embedding_size], 173 | # or if self._max_length is not None: 174 | # [batch_size * num_segments, self._max_length, embedding_size] 175 | 176 | # We call this with kwargs because some of the huggingface models don't have the 177 | # token_type_ids parameter and fail even when it's given as None. 178 | # Also, as of transformers v2.5.1, they are taking FloatTensor masks. 179 | parameters = {"input_ids": token_ids, "attention_mask": transformer_mask.float()} # type: ignore 180 | if type_ids is not None: 181 | parameters["token_type_ids"] = type_ids 182 | if masked_lm_labels is not None and self.masked_language_modeling: 183 | parameters["labels"] = masked_lm_labels 184 | 185 | masked_lm_loss = None 186 | transformer_output = self.transformer_model(**parameters) 187 | 188 | if self.config.output_hidden_states: 189 | # Even if masked_language_modeling is True, we may not be masked language modeling on 190 | # the current batch. Check if masked language modeling labels are present in the input. 
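# A note on the indexing below: this code assumes the tuple outputs returned by older versions
# of the transformers library. Under that assumption, when "labels" are passed to an
# AutoModelForMaskedLM, the first element of the output is the masked LM loss, and because
# output_hidden_states is enabled, the last element is the tuple of hidden states (embedding
# layer at index 0, final transformer layer at index -1).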
191 | if "labels" in parameters: 192 | masked_lm_loss = transformer_output[0] 193 | 194 | if self._scalar_mix: 195 | embeddings = self._scalar_mix(transformer_output[-1][1:]) 196 | else: 197 | embeddings = transformer_output[-1][-1] 198 | else: 199 | embeddings = transformer_output[0] 200 | 201 | if fold_long_sequences: 202 | embeddings = self._unfold_long_sequences( 203 | embeddings, segment_concat_mask, batch_size, num_segment_concat_wordpieces 204 | ) 205 | 206 | return masked_lm_loss, embeddings 207 | -------------------------------------------------------------------------------- /declutr/model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | import torch 4 | import torch.distributed as dist 5 | from allennlp.common import util 6 | from allennlp.data import TextFieldTensors, Vocabulary 7 | from allennlp.models.model import Model 8 | from allennlp.modules import FeedForward, Seq2VecEncoder, TextFieldEmbedder 9 | from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder 10 | from allennlp.nn import InitializerApplicator 11 | from allennlp.nn.util import get_text_field_mask 12 | 13 | from declutr.common.masked_lm_utils import mask_tokens 14 | from declutr.common.model_utils import all_gather_anchor_positive_pairs, unpack_batch 15 | from declutr.losses import PyTorchMetricLearningLoss 16 | from declutr.miners import PyTorchMetricLearningMiner 17 | 18 | 19 | @Model.register("declutr") 20 | class DeCLUTR(Model): 21 | """ 22 | This `Model` implements a text encoder trained against a contrastive, self-supervised objective. 23 | After embedding the text with the `text_field_embedder`, the resulting sequence of embeddings 24 | is pooled using a `Seq2VecEncoder` and then optionally passed to a `FeedForward` layer, which 25 | projects the embeddings to a certain size. 26 | 27 | Registered as a `Model` with name "declutr". 28 | 29 | # Parameters 30 | 31 | vocab : `Vocabulary` 32 | text_field_embedder : `TextFieldEmbedder` 33 | Used to embed the input text. 34 | seq2vec_encoder : `Seq2VecEncoder`, optional, (default = `None`) 35 | Seq2Vec encoder layer. This encoder operates directly on the output of the 36 | `text_field_embedder` and pools it into a single vector per instance. 37 | If `None`, defaults to `BagOfEmbeddingsEncoder` with `averaged=True`. 38 | feedforward : `FeedForward`, optional, (default = `None`). 39 | An optional feedforward layer to apply after the seq2vec_encoder. 40 | loss : `PyTorchMetricLearningLoss`, optional (default = `None`). 41 | An optional metric learning loss function. Will be combined with the masked language 42 | modeling objective if 43 | `text_field_embedder.token_embedders["tokens"].masked_language_modeling` is True. Must be 44 | provided if `text_field_embedder.token_embedders["tokens"].masked_language_modeling` is 45 | False. See https://kevinmusgrave.github.io/pytorch-metric-learning/losses/ for a list of 46 | available loss functions. 47 | miner: `PyTorchMetricLearningMiner`, optional (default = `None`). 48 | An optional mining function which will mine hard negatives from each batch before computing 49 | the loss. See https://kevinmusgrave.github.io/pytorch-metric-learning/miners/ for a list 50 | of available mining functions. 51 | initializer : `InitializerApplicator`, optional (default=`InitializerApplicator()`) 52 | If provided, will be used to initialize the model parameters.
53 | """ 54 | 55 | def __init__( 56 | self, 57 | vocab: Vocabulary, 58 | text_field_embedder: TextFieldEmbedder, 59 | seq2vec_encoder: Optional[Seq2VecEncoder] = None, 60 | feedforward: Optional[FeedForward] = None, 61 | miner: Optional[PyTorchMetricLearningMiner] = None, 62 | loss: Optional[PyTorchMetricLearningLoss] = None, 63 | scale_fix: bool = True, 64 | initializer: InitializerApplicator = InitializerApplicator(), 65 | **kwargs, 66 | ) -> None: 67 | 68 | super().__init__(vocab, **kwargs) 69 | self._text_field_embedder = text_field_embedder 70 | # Prevents the user from having to specify the tokenizer / masked language modeling 71 | # objective. In the future it would be great to come up with something more elegant. 72 | token_embedder = self._text_field_embedder._token_embedders["tokens"] 73 | self._masked_language_modeling = token_embedder.masked_language_modeling 74 | if self._masked_language_modeling: 75 | self._tokenizer = token_embedder.tokenizer 76 | 77 | # Default to mean BOW pooler. This performs well and so it serves as a sensible default. 78 | self._seq2vec_encoder = seq2vec_encoder or BagOfEmbeddingsEncoder( 79 | text_field_embedder.get_output_dim(), averaged=True 80 | ) 81 | self._feedforward = feedforward 82 | 83 | self._miner = miner 84 | self._loss = loss 85 | if self._loss is None and not self._masked_language_modeling: 86 | raise ValueError( 87 | ( 88 | "No loss function provided. You must provide a contrastive loss (DeCLUTR.loss)" 89 | " and/or specify `masked_language_modeling=True` in the config when training." 90 | ) 91 | ) 92 | # There was a small bug in the original implementation that caused gradients derived from 93 | # the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during 94 | # training. This has been fixed. To reproduce results from the paper, set `model.scale_fix` 95 | # to `False` in your config. Note that this will have no effect if you are not using 96 | # distributed training with more than 1 GPU. 97 | self._scale_fix = scale_fix 98 | initializer(self) 99 | 100 | def forward( # type: ignore 101 | self, anchors: TextFieldTensors, positives: TextFieldTensors = None 102 | ) -> Dict[str, torch.Tensor]: 103 | 104 | """ 105 | # Parameters 106 | 107 | tokens : TextFieldTensors 108 | From a `TextField` 109 | 110 | # Returns 111 | 112 | An output dictionary consisting of: 113 | 114 | embeddings : torch.FloatTensor 115 | A tensor of shape `(batch_size, self._seq2vec_encoder.get_output_dim())`, which is the 116 | representation for the given `tokens` output by the encoder. The encoder is composed of: 117 | `self._text_field_embedder`, and `self._seq2vec_encoder`, in that order. 118 | projections : torch.FloatTensor 119 | A tensor of shape `(batch_size, self._feedforward.get_output_dim())`, which is the 120 | non-linear projection of the learned representation for the given `anchor_tokens` output 121 | by the projection head. This field will only be included if `self._feedforward` is not 122 | `None`. 123 | loss : torch.FloatTensor, optional 124 | A scalar loss to be optimized. 125 | """ 126 | output_dict: Dict[str, torch.Tensor] = {} 127 | 128 | # If multiple anchors were sampled, we need to unpack them. 129 | anchors = unpack_batch(anchors) 130 | # Mask anchor input ids and get labels required for MLM. 131 | if self.training and self._masked_language_modeling: 132 | anchors = mask_tokens(anchors, self._tokenizer) 133 | # This is the textual representation learned by a model and used for downstream tasks. 
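# Rough shape sketch (assuming unpack_batch folds the anchor dimension into the batch
# dimension): anchors arrive here as (batch_size * num_anchors, num_tokens), so
# embedded_anchors below has shape (batch_size * num_anchors, embedding_dim).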
134 | masked_lm_loss, embedded_anchors = self._forward_internal(anchors, output_dict) 135 | 136 | # If positives are supplied by DataLoader and we are training, compute a contrastive loss. 137 | if self.training: 138 | output_dict["loss"] = 0 139 | # TODO: We should throw a ValueError if no postives provided but loss is not None. 140 | if self._loss is not None: 141 | # Like the anchors, if we sampled multiple positives, we need to unpack them. 142 | positives = unpack_batch(positives) 143 | # Positives are represented by their mean embedding a la 144 | # https://arxiv.org/abs/1902.09229. 145 | _, embedded_positives = self._forward_internal(positives) 146 | # Shape: (num_anchors, num_positives_per_anchor, embedding_dim) 147 | embedded_positives = torch.reshape( 148 | embedded_positives, 149 | (embedded_anchors.size(0), -1, embedded_anchors.size(-1)), 150 | ) 151 | # Shape: (num_anchors, embedding_dim) 152 | embedded_positives = torch.mean(embedded_positives, dim=1) 153 | 154 | # If we are training on multiple GPUs using DistributedDataParallel, then a naive 155 | # application would result in 2 * (batch_size/n_gpus - 1) number of negatives per 156 | # GPU. To avoid this, we need to gather the anchors/positives from each replica on 157 | # every other replica in order to generate the correct number of negatives, 158 | # i.e. 2 * (batch_size - 1), before computing the contrastive loss. 159 | embedded_anchors, embedded_positives = all_gather_anchor_positive_pairs( 160 | embedded_anchors, embedded_positives 161 | ) 162 | # Get embeddings into the format that the PyTorch Metric Learning library expects 163 | # before computing the loss (with an optional mining step). 164 | embeddings, labels = self._loss.get_embeddings_and_labels( 165 | embedded_anchors, embedded_positives 166 | ) 167 | indices_tuple = self._miner(embeddings, labels) if self._miner is not None else None 168 | contrastive_loss = self._loss(embeddings, labels, indices_tuple) 169 | # Loss needs to be scaled by world size when using DistributedDataParallel 170 | # See: https://amsword.medium.com/gradient-backpropagation-with-torch-distributed-all-gather-9f3941a381f8 171 | if util.is_distributed() and self._scale_fix: 172 | contrastive_loss *= dist.get_world_size() 173 | output_dict["loss"] += contrastive_loss 174 | # Loss may be derived from contrastive objective, MLM objective or both. 175 | if masked_lm_loss is not None: 176 | output_dict["loss"] += masked_lm_loss 177 | 178 | return output_dict 179 | 180 | def _forward_internal( 181 | self, 182 | tokens: TextFieldTensors, 183 | output_dict: Optional[Dict[str, torch.Tensor]] = None, 184 | ) -> torch.Tensor: 185 | 186 | masked_lm_loss, embedded_text = self._text_field_embedder(tokens) 187 | mask = get_text_field_mask(tokens).float() 188 | 189 | embedded_text = self._seq2vec_encoder(embedded_text, mask=mask) 190 | # Don't hold on to embeddings or projections during training. 191 | if output_dict is not None and not self.training: 192 | output_dict["embeddings"] = embedded_text.clone().detach() 193 | 194 | # Representations produced by a non-linear projection can be used for training with a 195 | # contrastive loss. Previous works in computer vision have found this projection head to 196 | # improve the quality of the learned embeddings (see: https://arxiv.org/abs/2002.05709). 197 | # When embedding text with a trained model, we want the representation produced by the 198 | # encoder network. 
We therefore call these vectors "projections" to distinguish them from 199 | # the "embeddings". 200 | if self._feedforward is not None: 201 | embedded_text = self._feedforward(embedded_text) 202 | if output_dict is not None and not self.training: 203 | output_dict["projections"] = embedded_text.clone().detach() 204 | 205 | return masked_lm_loss, embedded_text 206 | 207 | default_predictor = "declutr" 208 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /tests/fixtures/data/openwebtext/valid.txt: -------------------------------------------------------------------------------- 1 | Labour and the Greens say they're committed to running budget surpluses, paying debt and keeping spending down. The two parties will present their 'Budget Responsibility Rules' to business leaders, economists, NGOs and academics on Friday morning. 
"People want to know the kinds of principles that we're going to manage the books by," Labour leader Andrew Little told The AM Show. The two parties' economic policies have differed in the past, and while they'll still have separate policies, the rules are a "framework" for working together in government, said Labour finance spokesman Grant Robertson. Among the rules are: running surpluses (barring major economic shocks or natural disasters) keep Crown spending to about 30 percent of GDP a progressive, fairer tax system reduce net debt to 20 percent of GDP in five years increase investment in superannuation, climate change and infrastructure. "We're committed to running surpluses over an economic cycle," says Mr Little. "We're going to be responsible with the government budget. There's stuff we've already committed to - we've got to fix housing, we've got to fix health, we've got to fix education." Crown spending currently is at 30 percent of GDP, according to Treasury figures. The 2016 Budget predicted it would drop to about 28 percent by 2020, under National. It peaked in 2011, in the wake of the global financial crisis, at around 34 percent. The last time Labour was in government, Finance Minister Michael Cullen ran up a series of significant surpluses, but came under fire for not cutting taxes. No tax increases are planned if Labour does win the upcoming election. "Every commitment we've made we can fund out of existing tax revenue," said Mr Little. The Greens went into the last election promising a new top tax rate of 40 percent on income over $140,000. Co-leader James Shaw wouldn't say if that would be the case this year, saying the party's focus wouldn't be on income tax. "A lot of people don't know this, but at the last election we actually promised a tax cut for 97 percent of New Zealanders," he told The AM Show. "That was going to be funded out of a tax on pollution that causes climate change." A capital gains tax would likely be included in the Greens' economic policy, but not Labour's. "It won't be exactly the same as what we've said in the past," said Mr Shaw. Coalition negotiations As for who'll take what job should the left bloc win the election, Mr Shaw says it's wide open. "The largest party in government is guaranteed the Prime Minister and Minister of Finance. Everything else depends on what happens on election night." 2 | DOWNEY, Calif. When Mario Guerra strolls through the streets of downtown Downey, he cant help but play the role of seasoned salesman for the city in southeastern Los Angeles County he adopted more than 35 years ago. Guerra, a Cuban-American immigrant who served eight years on the city council and two terms as mayor, sings the praises of Portos Bakery, a Cuban sandwich and pastry shop that he helped lure to Downey. He shows off the vibrant murals and sleekly designed street sculpture that he commissioned as a city leader to spruce up the downtown area and imbue it with a sense of culture and character. Subsequently, an acquaintance stops Guerra, known about town for his role as a Catholic deacon as well as consummate problem-solver, to ask if Guerra would perform his wedding at the towns annual Dia de los Muertes festival. Guerra, a Republican in a city that is 70 percent Hispanic and leans Democratic, attributes his success as a GOP politician to a laser-like focus on finding solutions, building bridges and approaching every problem with a kind of neighborly compassion. 
Its how Guerra says he came within 5 percentage points of his opponent in his race for a state Senate seat last fall in a district where Democrats have a 24-point voter registration advantage I govern in that 60 to 70 percent where I feel that we can all agree and say, Lets just make things better, he said. Were not going to agree on everything but we can agree on this stuff that we can fix, and its in that space that you can get things done. But to Guerras dismay, its a markedly different philosophy than the one that seems to have taken hold within the Republican Party at the national level in the 2016 presidential cycle, particularly with the rise of Donald Trump. Guerra has watched with a mixture of bewilderment and exasperation as the business mogul has ridden to the top of the polls in part by spewing barbed invective against illegal immigration, and branding Latino immigrants as criminals and rapists. In preparation of going on Spanish-language television to talk about the summer of Trump, Guerra brushed up to make sure he had one specific word in his arsenal payaso clown. For Californians, the narrative playing out at the national level has an air of dj vu. Twenty years ago, a similar wave of anti-immigrant sentiment washed over the Golden State, and voters responded by passing a ballot initiative that blocked undocumented immigrants from receiving a litany of critical state services, including public education and health care. 3 | Let's just get something out of the way: the Phillies are not a good baseball team. Ryan Howard is listed as the number one starter at first base on the depth chart, and it's not 2009. Freddy Galvis and Cesar Hernandez compose the middle infield; Cedric Hunter and Peter Bourjos are the starting corner outfielders. Those are five players that shouldn't be starters, starting, but this is the reality that the Phillies currently inhabit. Fortunately for Philadelphia, and their fans, they have already have the pieces in place to help them make a quantum leap. Through a series of trades, and good drafts, the Phillies are in a position to put a competitive baseball team on the field as early as 2017. Homegrown players Philadelphia has two players that have come through their own farm system and that could be ready as soon as this year: J.P. Crawford and Andrew Knapp. 2015 PA BB% K% ISO wOBA wRC+ Crawford (AA) 405 12.1% 11.1% .142 .348 121 Andrew Knapp (AA) 241 9.1% 17.8% .271 .465 200 Both players spent the majority of their 2015 seasons in AA, and they were fantastic. Knapp, a catcher, showcased more power and a better offensive line overall, but in no way should that diminish Crawford's campaign. FanGraphs rates his future value at 60 (out of 80), and he's expected to be the next shortstop prospect to join the surfeit of budding MLB stars at that position. He's starting the year at AA, but with a current wOBA of .421 and a wRC+ of 164 though 43 at-bats, he's likely going to join Knapp at AAA relatively soon. The Phillies don't want to start anyone's service clock sooner than they have to, but Knapp seems like a safe bet to see time at the major league level this year. Carlos Ruiz (likely) won't be playing for Philadelphia beyond this year, and Cameron Rupp (with his career wOBA of .282 and wRC+ of 75) isn't going to block Knapp from reaching the big leagues. The catcher's spot is up for grabs on the big league club, and Knapp has a chance to entrench himself as the starter. There's also Aaron Nola, who's already at the big league level. 
In the first 91.2 innings of his career, Nola's posted a K/9 of 8.35 and a BB/9 of 1.87, along with an ERA of 3.53, an FIP of 3.81, and an fWAR of 1.3. In two starts this year, he's striking out a significantly higher percentage of batters than he did in 2015, and has yet to walk anybody. He's under team control through the 2021 season, and barring an injury, Nola should be one of the Phillies' front line starters for years to come. Philadelphia also has Aaron Altherr (currently on the DL) and Roman Quinn moving up through their system. 4 | Is it possible to completely eliminate scent? Share this article Every day a new scent control product seems to hit the market. How do you make sense of the marketing tidal wave that hits you every time you turn on the T.V. or open a hunting magazine? Do scent control products actually work? Is it possible to completely eliminate odor? Maybe. In this article, I will uncover the answers to those questions and determine the best way to remain undetected in the field. How animals smell Animals have special membranes in their noses that pick up scent signatures in the air. In fact, deer and elk have more receptors in their noses than even dogs. Deer and elk can also use an additional organ in their mouth, called a vomeronasal organ, to detect smells. This allows them to detect multiple scents simultaneously and makes it possible for them to detect smells that are incredibly far away. This means that the proverbial deck is stacked against a hunter almost immediately because animals can smell you before you ever see them. Types of scent It will be easiest to understand scent control if we first understand scent in general. There are three types of scents that comprise the smells that an animal will pick up: natural, unnatural/foreign and odor. Natural Natural smells are those smells that naturally occur in its given environment. The smell of pine trees, the smell of a deer's coat, etc.; however, natural smells are not just those smells that are found in nature. Human beings also have a natural smell even though we do not actually smell it. This is because our brains recognize it as our own scent and the way we are programmed, it becomes useless information that does not register for us. But that does not mean that it is not there. I could shower with scentless soap and scentless shampoo and I would still have a natural scent. Natural scents cannot be removed; they can only be masked. Unnatural/foreign Unnatural or foreign smells are those that are not naturally occurring. Scented detergents might be mountain fresh, but in the woods in the fall that smell is unnatural. Coffee is an unnatural smell as well as vehicle exhaust. Unnatural scents can be removed with little effort. Simply by being aware of these unnatural or foreign smells is enough to make changes that can drastically reduce or eliminate them. Odor Odor is a specific type of scent because it is caused by the growth of bacteria in an environment. Sweat does not smell like body odor until it stays in a dark, wet area of your body (i.e., your armpit) for an extended period of time and bacteria begins to grow. 5 | Steam Hammer is the first hardcore sandbox-style RPG set in a dark and mysterious steampunk world. Experience the intensity as you try to survive on the mysterious Acribo Islands. Steam Hammer features: a classic Victorian steampunk setting with wondrous mechanisms, machinery, weapons, armor, clothing, andof coursesteam and smoke. 
an open-class system that frees you from arbitrary constraints. Engineer, scientist, farmer, gunsmith, stormtrooper, sharpshooter, and more can all be combined and switched depending on your skill set. a huge open world for you to explore, travel, and terraform. Go where you will and master the land. Craft your Glory. Craft your Victory. Craft your Steam Hammer! Updates: Update #1 Thank you! Update #2 Update #3 Update #4 Two Weeks Left Update #5 Final week Update #6 We are very grateful to all our backers who spread the word about our campaign, and now we can offer you something extra in return as a token of our appreciation. Now with KickBooster you can share our campaign with your friends and you'll get 11% of every dollar you help raise. Click here for more info. The Victorian Empire was at the height of its power. It dominated its neighbors and basked in glory. The Victorians made remarkable technological achievements in steam-powered machinery. Imperial airships flew over the many lands and provinces under its control. The foundation of the Empires power was the celebrium trees growing on the Acribo Islands. Its precious sap fueled its technological wonders, but the empire had a dark secret. Harvesting the sap of the trees drove the workers mad. Day after day, the Acribian laborers slaved away in nightmarish conditions, all for the sake of harvesting the cursed tree. A storm was brewing that could not be seen from above, and one day, everything changed. The ancient gods of the Acribo Islands returned and took revenge on the Victorians exploiting their people. Devastating cataclysms struck the land. Having suffered long under the yoke of the Empire, the Acribians rebelled in a war that nearly drowned the country in blood, and they declared their independence. The world was never the same. Time passed. The crisis was over, but the Empires greatness was gone. So was its industry, and the lives of so many of its citizens. It lost its access to the celebrium trees. But now, Victoria is healing its wounds and it is time to take action. All that remains of its air force is mustering and the Imperial airships are headed West. This is where your story begins. Bring the Empire back to its former glory! You must win whatever it takes. 6 | -------------------------------------------------------------------------------- /tests/fixtures/data/openwebtext/train.txt: -------------------------------------------------------------------------------- 1 | Just two months into the baseball season, the Cubs are exceeding fans' expectations and creating a national buzz with one of the best records in the National League. But, quietly, progress also is being made off the field as team Chairman Tom Ricketts methodically moves to gain further control of the neighborhood streets just beyond the walls of Wrigley Field and win his battle with the rooftop businesses that help define the Wrigley vibe. His latest coup came earlier this month, records show, when an entity controlled by the Ricketts family bought three more rooftop buildings on Sheffield Avenue, bringing to six the number the family now owns. The Ricketts family paid Sheffield Finance an undisclosed price for the buildings at 3637 N. Sheffield which was torn down and rebuilt just for the rooftop business 3617 N. Sheffield and 3619 N. Sheffield, Cook County property records show. The Ricketts family will assume no debt on the buildings. A sale was expected after a federal judge last month dismissed a foreclosure lawsuit against the businesses. 
In November, Fifth Third Bank sued the rooftop operations and their owners, alleging that the businesses owed more than $18 million on mortgages and missed payments. Sheffield Finance later bought a portion of the debt and replaced the bank as plaintiffs in the case. Sheffield Finance is an entity owned by Jerry Lasky and Murray Peretz, partners in Spectrum Real Estate, a Chicago commercial real estate business. "I always felt these rooftops were an extension of Wrigley Field and they belong with the Ricketts family," said Lasky, who called himself a diehard Cubs fan. "It was a natural fit." The Rickettses have held a financial interest in a seventh rooftop operation, Down the Line Rooftop, since 2010. Earlier this year, George Loukas, who helped start the rooftop craze years ago, sold two buildings while James Lourgos and his partners sold another. Just three rooftop businesses on Sheffield aren't owned by the Ricketts family now: Murphy's Rooftop, at the corner of Waveland and Sheffield and above Murphy's Bleachers, and Skybox on Sheffield and Lakeview Baseball Club, which have sued the team in federal court. That suit is pending. According to records, Ricketts tried to buy all of the rooftop club properties shortly after acquiring the Cubs. Today, sports team owners search for new sources of revenue, and there are more changes on the horizon for the Cubs: Future plans include an open-air plaza, a nearby hotel and street fairs similar to the ones the Boston Red Sox host. A Cubs spokesman said Thursday that the newly acquired rooftops will be managed like the three bought in January through an agreement with Loukas, who owns popular bars in Wrigleyville and still has one rooftop business. Lasky said he admired that the team is investing more than $575 million into renovating the 101-year-old stadium and neighborhood, and praised the team's operation of Wrigley. He said he approached team executives there weren't other potential buyers a few weeks ago, completing the deal with Cubs Chairman Tom Ricketts and President of Business Operations Crane Kenney. Ricketts has said little about the team's plans for the rooftops other than that he plans to keep them in operation. In a statement, the Cubs said, "The Ricketts family has said in the past they are interested in reasonable opportunities to purchase rooftop property and are willing to pay a fair price. ... The rooftop situation has been a political and legal morass for more than a decade, and the Ricketts family will remain interested in opportunities which make sound business sense." The team's $375 million overhaul of the stadium continues. The 3,990-square-foot left field video board was ready for the Cubs' home opener, and the left-field bleachers opened earlier this month. 2 | Im surrounded by big spenders at family gatherings. My siblings, their spouses/partners, my uncles and cousins all seem to spend at least every dime they make. The thing is, these arent poor people struggling to get by. The poorest among them brings in probably around $65k/year, and the others are well into the six figures. So they all have two or three refrigerators. One for the kitchen, one for freezing meats and vegetables in the basement and one in the garage just full of beer and soda. They all have cars no more than 2 years old. They have houses with unused rooms filled with unused furniture. When Im with people one on one Ill often bring up the topic of personal finance and investing. So I know that none of these people have any savings what-so-ever. 
Theyre barely into their 30s so they think they dont need to plan for retirement yet. I think they used to all just think I didnt earn much income and was a struggling student or something. So they would rag on me and tell me where I can get a good deal on a much shinier car to replace my 10 year old sedan. Theyre offering 0% interest for the next six weeks, you should go! It would only be like $300/month, even you could swing that. Then, over the past six months or so, word has gotten out that I paid cash for a house. And that Im fixing to do it again a few times over in the next year. Yet I still drive the old sedan around. Theyre realizing their apparent financial superiority has been merely that; apparent. Now, when Im in ear shot, Ill hear things like, I play hard, but I work hard! or, Whats the point of earning it if youre not going to spend it??? and You only live once! I think they are feeling a bit guilty about their behavior and my mere presence is bringing it out. They know that TV they bought over a year ago that theyre still making payments on has lost its novelty. They know they should be putting some money away, at least for a typical retirement when theyre 67. They know they ought to be saving something. The fact that no one else is doing it though makes it easier for them to slide as well. Theyll all be on that sinking ship together at least. I think, if they were honest, their platitudes would sound more like: I know I should save some money for a rainy day, but damn that cars shiny! Or, Ive already resigned myself to working for the next 40 years, I may as well buy some crap that at least makes me happy for a few weeks. Or, Were actually in a contest to see who can spend the most on their daily transportation, I just got a little closer to the winners circle. Ive said my piece many times over the years. When someone mentions theyre thinking about getting a new car, I explain the vast cost savings in getting something at least slightly used that gets good mileage. And if not, I explain the advantages of saving up and paying cash rather than paying all those finance charges. Ive suggested to my brother when he was buying his house that, as a single guy, he didnt really need 4 bedrooms and to consider the cost of heating all those empty rooms through a New England winter. Ive recommended to everyone, without much success, that they at least make use of tax-advantaged retirement accounts. After a while of that I just started to get eye-rolls. Or anticipatory glances when someone brought up some financial topic. Realizing Im just blowing into the wind, now I just say, You guys know what Im going to say. You know it makes sense. But its your money; your future, do what you want with it. My parents are no better. They make a good income and they do a good job of spending it. They make 3 to 5 Caribbean trips every year. I dont think theyve ever not had a car payment. The two of them live in a 6 bedroom house. They order exotic meats through the mail. My dad gambles. They carry way too much insurance. 3 | Colorado cannabis is better than anything Amsterdam's got, and the medicine here is on par with, if not better than, what is coming out of California. While we already knew that here in Colorado, it was still cool to hear those words come out of High Times editor Danny Danko's mouth last night at the 2011 High Times Cannabis Cup award ceremony. I don't think anyone really knew what to expect from the Cannabis Cup. 
Even dispensary owners I spoke with beforehand had only an inkling. Were people really going to be able to light up? Would staffers be able to hand out meds to patients? What coalesced, though, turned out to be easily one of the most amazing cannabis events in Colorado short of passing Amendment 20 nearly twelve years ago. Continue Reading See a photo slide show from the Medical Cannabis Cup Inside Exdo was a miniature version of the massive KushCon II from last December, with dispensaries and bong shops setting up booths. The big difference was that the majority of the dispensaries this weekend were displaying real cannabis on their tables -- something KushCon frowned upon. Half of the large hall was given over to speakers, including talks from Danko on hash making and cultivation tips. All of that was cool, and it definitely had a more relaxed and patient-driven vibe than the corporate-feeling KushCon. But what really made the cup worthwhile was going on outside and down the alley. Marijuana Deals Near You In a warehouse not connected to Exdo, hundreds of medical marijuana patients lit up and created the largest hot box I have ever been a part of. Massive pillows of ganja smoke were billowing over the head of the security guard checking to make sure our wristbands were all in order. Inside was exactly what a cannabis convention should look like. Thick air, hazy eyes and bong load after bong load of smoke being blown around the huge room. Some dispensaries were giving out herb, others were playing it more sly and only displaying their ganja while puffing with patients a few steps away from their booths. At one booth, there was an at least five-foot glass bong being packed up for patients, while across the way, another group was filling up equally as tall Volcano bags and offering a hit to anyone who would walk by. The Cannasseur dispensary had one of the most creative booths, offering a plywood simulation of a first-class private jet lounge and having their budtenders dress like flight attendants. Cannasseur also had some the most delicious samples of herb, with the scantily-clad stewardesses handing out bong rips of Kurple Fantasy from sick 4.0 Glass micro tubes. The Clinic also had a unique setup, letting patients play on a homemade The Price Is Right-like Klinko board for coupons and specials. A lot of booths had oil rigs, so it was fun to walk around and try different waxes and budders -- though some booths neglectfully weren't wiping down pipes with sanitary wipes, and the thought of catching some crap from one of the hundreds of other puffers kept me away on occasion. Still, having that many people together all for cannabis, and to have our community recognized by High Times, was exciting. See a photo slide show from the Medical Cannabis Cup One strange thing, even for a guy in the media, was all of the media. It seems like around every corner, someone being followed by their pet documentary film crew. People seemed to love hamming it up for the cameras, especially when the crew from G4's Attack of the Show would walk by a booth of puffing patients. I also met a few people from a crew in town filming for National Geographic, as well as another independent documentary film. I know it's legal here, and we should have no shame in what we are doing -- but as attorney Warren Edson appropriately asked after the event: "You know those cameras were on, right? 4 | Diesel engines are starting to make a return in the United States but they have been massively popular for decades in Europe. 
Its not just economy cars, either: AMG has built a diesel engine, oil-burning Audis have won the grueling 24 Hours of Le Mans on several occasions and BMW offers a 5-Series with a triple-turbocharged 3.0-liter straight-six diesel worthy of a M badge on the trunk lid. 40 years ago, Peugeot and Mercedes-Benz were among the very few manufacturers that offered a diesel in a passenger car and the thought of a compression ignition engine mounted in the engine bay of a sports car was a daring one that only Mercedes was willing to dabble in with the C111-III, a 230-horsepower experimental sports car that was never given the green light for production. The oil crisis that rocked the 1970s convinced BMWs top brass to take a close look at the diesel engine as a good compromise between power and fuel economy. A team of engineers tasked with studying oil-burning engines was formed at the firms Munich, Germany, headquarters in 1975. Engineers chose to use the M20 straight-six gasoline-burning engine as the starting point for the new diesel. The engines basic structure and belt-driven overhead cam setup were retained, but it featured purpose-designed valves, pistons and crankshaft and, importantly, an exhaust gas-driven turbocharger. BMW also worked with outside suppliers to design a system called Instant Start that shortened the glow time. With a displacement of 2,443 cubic centimeters, the new M21 engine was manufactured in Steyr, Austria, on a production line operated jointly by BMW and Magna-Steyr. The first regular-production diesel-powered BMW, the 524td, made its public debut at the 1983 Frankfurt Motor Show. With 115 horsepower and 154 lb-ft. of torque under the hood, it sprinted from zero to 62 mph (100 km/h) in 12.9 seconds and reached a top speed of 111 mph (180 km/h). At the time, BMW proudly called the car the fastest diesel-burning sedan in the world, though Mercedes turbocharged w123 300D was not far behind. Fuel economy was rated at an impressive 7.1 liters per 100 kilometers (33 mpg U.S., 39 mpg U.K.) in a mixed European cycle. Diesel engines rose to prominence in Europe during the early 1980s and manufacturers who didnt offer at least one oil-burning model often lost sales to competitors. In hindsight, the 524td came at exactly the right moment for BMW and it quickly became one of the most popular variants of the E28 5-Series in Germany. Lincoln burns oil Eager to keep up with rival Cadillac, who offered the Eldorado coupe and the Seville sedan with an Oldsmobile-sourced V8 diesel, Fords Lincoln division equipped its Continental sedan and Mark VII coupe with BMWs 2.4-liter diesel in 1984 but public demand was almost non-existent and the model was axed a year later after a handful of examples were built. Democratizing the diesel The 524tds popularity in Germany and abroad convinced BMW to widen its diesel offering. Launched in 1985, the 324d (E30) was powered by a naturally-aspirated variant of the 2.4-liter that churned out 82 horsepower and 113 lb-ft. of torque. It hit 60 mph from a stop in 16.1 seconds and returned 6.9 liters per 100 kilometers (34 mpg U.S., 40 mpg U.K.) in a mixed European cycle. Like in the 524td, power was sent to the rear wheels via a standard five-speed manual transmission or an optional four-speed automatic. The 324d was a hit in Europe but buyers clamored for more power so BMW quickly offered the turbodiesel 2.4 in the E30, creating the 324td. 
Conversely, the naturally-aspirated mill was installed the e28 and the 524d was popular in heavily-taxed markets like Italy, Spain and France. The original BMW diesel was replaced by a brand new unit presented at the 1991 Frankfurt Motor Show. 5 | New Jersey Democratic Sen. Cory Booker came under attack after his Wednesday night vote against allowing the importation of cheaper drugs from Canada into the United States. The amendment to the budget resolution bill would have encouraged the importing of cheaper pharmaceutical products into the U.S. to lower prescription drug prices. Those exorbitant price tags, which are set by pharmaceutical companies, are putting a financial crunch on families, according to a Consumer Reports survey. Spending on drugs is also taking a huge bite out of not only families' pocketbooks, but also government coffers which could end up coming back to bite taxpayers again. As the outrage grew online, Booker responded to questions about why he joined Republicans and a dozen Democrats in opposing the amendment sponsored by Sen. Amy Klobuchar (D-Minn.) and Sen. Bernie Sanders (I-Vt.). "Any plan to allow the importation of prescription medications should also include consumer protections that ensure foreign protections that ensure foreign drugs meet American safety standards," Booker said in a statement to Jezebel. Back in December, though, Booker voted to weaken federal safety standards that regulate whether a medication can be sold in the U.S. in the name of broader consumer access to drugs, but experimental ones, not necessarily cheaper ones. The 21st Century Cures Act, which Booker vocally supported, passed with resounding bipartisan support and was signed into law last month. The law promised government investments in cancer and Alzheimer's research, allocated funds to fight the opioid epidemic and contained a host of other measures intended to facilitate the modernization of the health care industry. Among those steps was one to roll back the notorious gauntlet of Food and Drug Administration regulations in order to expedite the arrival of experimental medicine and medical equipment to market with the idea of allowing Americans easier access to cutting-edge treatments for what ails them. Despite the overwhelming support in Congress, critics were vocal about flaws in the bill notably the roll back of FDA regulations that would benefit pharmaceutical and medical tech companies. "Big pharma has its hand out for a bunch of special giveaways and favors that are packed together in something called the 21st Century Cures bill," Sen. Elizabeth Warren said during the debate over the law. "When American voters say Congress is owned by big companies, this bill is exactly what they are talking about." Charles Krupa/AP Sen. Elizabeth Warren was a leading critic of the rollbacks on regulations on experimental drugs in the 21st Century Cures Act. "A greater threat" The 21st Century Cures Act was "terrible for drug quality," Peter Maybarduk, who directs progressive watchdog Public Citizen's access to medicines group, said in a phone call Friday. Pulitzer Prize-winning journalist Michael Hiltzik wrote in a column for the Los Angeles Times: "Remarkably, nothing in the measure would address the main problem the public sees with the drug industry excessive prices." Booker's office defended the seeming contradiction between the votes. 
There's a "big difference between adjusting FDA's requirements for medical products [and] experimental medications and a situation where you could have no FDA review of drugs at all," Jeff Giertz, a spokesperson for Booker, said in a phone call Friday. "If the amendment had some more specifics on what it would have spelled out, in terms of a review process, that would have been something he supported," Giertz said. Critics of the the 21st Century Cures Act, though, contended the inconsistencies remain rife. "Twenty-first Century Cures was a greater threat to drug safety and efficacy than the import amendment," Maybarduk said, adding of Booker's objections to the amendment: "It could be legitimate drug quality concerns. But that argument is also used as a fig-leaf when an elected rep doesn't want to break with pharma." 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeCLUTR: Deep Contrastive Learning for Unsupervised Textual Representations 2 | 3 | ![build](https://github.com/JohnGiorgi/declutr/workflows/build/badge.svg?branch=master) 4 | [![codecov](https://codecov.io/gh/JohnGiorgi/DeCLUTR/branch/master/graph/badge.svg)](https://codecov.io/gh/JohnGiorgi/DeCLUTR) 5 | [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) 6 | ![GitHub](https://img.shields.io/github/license/JohnGiorgi/DeCLUTR?color=blue) 7 | 8 | The corresponding code for our paper: [DeCLUTR: Deep Contrastive Learning for Unsupervised Textual Representations](https://aclanthology.org/2021.acl-long.72/). Results on [SentEval](https://github.com/facebookresearch/SentEval) are presented below (as averaged scores on the downstream and probing task test sets), along with existing state-of-the-art methods. 9 | 10 | | Model | Requires labelled data? | Parameters | Embed. dim. | Downstream (-SNLI) | Probing | Δ | 11 | |------------------------------------------------------------------------------------------------------------|:-----------------------:|:----------:|:-----------:|:------------------:|:---------:|:-----:| 12 | | [InferSent V2](https://github.com/facebookresearch/InferSent) | Yes | 38M | 4096 | 76.00 | 72.58 | -3.10 | 13 | | [Universal Sentence Encoder](https://tfhub.dev/google/universal-sentence-encoder-large/5) | Yes | 147M | 512 | 78.89 | 66.70 | -0.21 | 14 | | [Sentence Transformers](https://github.com/UKPLab/sentence-transformers) ("roberta-base-nli-mean-tokens") | Yes | 125M | 768 | 77.19 | 63.22 | -1.91 | 15 | | Transformer-small ([DistilRoBERTa-base](https://huggingface.co/distilroberta-base)) | No | 82M | 768 | 72.58 | 74.57 | -6.52 | 16 | | Transformer-base ([RoBERTa-base](https://huggingface.co/roberta-base)) | No | 125M | 768 | 72.70 | 74.19 | -6.40 | 17 | | DeCLUTR-small ([DistilRoBERTa-base](https://huggingface.co/distilroberta-base)) | No | 82M | 768 | 77.50 | __74.71__ | -1.60 | 18 | | DeCLUTR-base ([RoBERTa-base](https://huggingface.co/roberta-base)) | No | 125M | 768 | __79.10__ | 74.65 | -- | 19 | 20 | > Transformer-* is the same underlying architecture and pretrained weights as DeCLUTR-* _before_ continued pretraining with our contrastive objective. Transformer-* and DeCLUTR-* use mean pooling on their token-level embeddings to produce a fixed-length sentence representation. 
Downstream scores are computed without considering performance on SNLI (denoted "Downstream (-SNLI)") as InferSent, USE and Sentence Transformers all train on SNLI. Δ: difference to DeCLUTR-base downstream score. 21 | 22 | ## Table of contents 23 | 24 | - [Notebooks](#notebooks) 25 | - [Installation](#installation) 26 | - [Usage](#usage) 27 | - [Training](#training) 28 | - [Embedding](#embedding) 29 | - [Evaluating with SentEval](#evaluating-with-senteval) 30 | - [Reproducing results](#reproducing-results) 31 | - [Citing](#citing) 32 | 33 | ## Notebooks 34 | 35 | The easiest way to get started is to follow along with one of our [notebooks](notebooks): 36 | 37 | - Training your own model [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnGiorgi/DeCLUTR/blob/master/notebooks/training.ipynb) 38 | - Embedding text with a pretrained model [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnGiorgi/DeCLUTR/blob/master/notebooks/embedding.ipynb) 39 | - Evaluating a model with [SentEval](https://github.com/facebookresearch/SentEval) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnGiorgi/DeCLUTR/blob/master/notebooks/evaluating.ipynb) 40 | 41 | ## Installation 42 | 43 | This repository requires Python 3.6.1 or later. 44 | 45 | ### Setting up a virtual environment 46 | 47 | Before installing, you should create and activate a Python virtual environment. See [here](https://github.com/allenai/allennlp#installing-via-pip) for detailed instructions. 48 | 49 | ### Installing the library and dependencies 50 | 51 | If you _don't_ plan on modifying the source code, install from `git` using `pip` 52 | 53 | ```bash 54 | pip install git+https://github.com/JohnGiorgi/DeCLUTR.git 55 | ``` 56 | 57 | Otherwise, clone the repository locally and then install 58 | 59 | ```bash 60 | git clone https://github.com/JohnGiorgi/DeCLUTR.git 61 | cd DeCLUTR 62 | pip install --editable . 63 | ``` 64 | 65 | #### Gotchas 66 | 67 | - If you plan on training your own model, you should also install [PyTorch](https://pytorch.org/) with [CUDA](https://developer.nvidia.com/cuda-zone) support by following the instructions for your system [here](https://pytorch.org/get-started/locally/). 68 | 69 | ## Usage 70 | 71 | ### Preparing a dataset 72 | 73 | A dataset is simply a file containing one item of text (a document, a scientific paper, etc.) per line. For demonstration purposes, we have provided a script that will download the [WikiText-103](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/) dataset and apply our minimal preprocessing 74 | 75 | ```bash 76 | python scripts/preprocess_wikitext_103.py path/to/output/wikitext-103/train.txt --min-length 2048 77 | ``` 78 | 79 | > See [scripts/preprocess_openwebtext.py](scripts/preprocess_openwebtext.py) for a script that can be used to recreate the (much larger) dataset used in our paper. 80 | 81 | You can specify the train set path in the [configs](training_config) under `"train_data_path"`. 82 | 83 | #### Gotchas 84 | 85 | - A training dataset should contain documents with a minimum of `num_anchors * max_span_len * 2` whitespace tokens. This is required to sample spans according to our sampling procedure. 
See the [dataset reader](declutr/dataset_reader.py) and/or [our paper](https://aclanthology.org/2021.acl-long.72/) for more details on these hyperparameters. 86 | 87 | ### Training 88 | 89 | To train the model, use the [`allennlp train`](https://docs.allennlp.org/master/api/commands/train/) command with our [`declutr.jsonnet`](training_config/declutr.jsonnet) config. For example, to train DeCLUTR-small, run the following 90 | 91 | ```bash 92 | # This can be (almost) any model from https://huggingface.co/ that supports masked language modelling. 93 | TRANSFORMER_MODEL="distilroberta-base" 94 | 95 | allennlp train "training_config/declutr.jsonnet" \ 96 | --serialization-dir "output" \ 97 | --overrides "{'train_data_path': 'path/to/your/dataset/train.txt'}" \ 98 | --include-package "declutr" 99 | ``` 100 | 101 | The `--overrides` flag allows you to override any field in the config with a JSON-formatted string, but you can equivalently update the config itself if you prefer. During training, models, vocabulary, configuration, and log files will be saved to the directory provided by `--serialization-dir`. This can be changed to any directory you like. 102 | 103 | #### Gotchas 104 | 105 | - There was a small bug in the original implementation that caused gradients derived from the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during training. This has been fixed. To reproduce results from the paper, set `model.scale_fix` to `False` in your config. Note that this will have no effect if you are not using distributed training with more than 1 GPU. 106 | 107 | #### Exporting a trained model to HuggingFace Transformers 108 | 109 | We have provided a simple script to export a trained model so that it can be loaded with [Hugging Face Transformers](https://github.com/huggingface/transformers) 110 | 111 | ```bash 112 | wget -nc https://raw.githubusercontent.com/JohnGiorgi/DeCLUTR/master/scripts/save_pretrained_hf.py 113 | python save_pretrained_hf.py --archive-file "output" --save-directory "output_transformers" 114 | ``` 115 | 116 | The model, saved to `--save-directory`, can then be loaded using the Hugging Face Transformers library (see [Embedding](#hugging-face-transformers) for more details) 117 | 118 | ```python 119 | from transformers import AutoModel, AutoTokenizer 120 | 121 | tokenizer = AutoTokenizer.from_pretrained("output_transformers") 122 | model = AutoModel.from_pretrained("output_transformers") 123 | ``` 124 | 125 | > If you would like to upload your model to the Hugging Face model repository, follow the instructions [here](https://huggingface.co/transformers/model_sharing.html). 126 | 127 | #### Multi-GPU training 128 | 129 | To train on more than one GPU, provide a list of CUDA devices in your call to `allennlp train`. For example, to train with four CUDA devices with IDs `0, 1, 2, 3` 130 | 131 | ```bash 132 | --overrides "{'distributed.cuda_devices': [0, 1, 2, 3]}" 133 | ``` 134 | 135 | #### Training with mixed-precision 136 | 137 | If your GPU supports it, [mixed-precision](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) will be used automatically during training and inference. 139 | 139 | ### Embedding 140 | 141 | You can embed text with a trained model in one of four ways: 142 | 143 | 1. [Sentence Transformers](#sentencetransformers): load our pretrained models with the [SentenceTransformers](https://www.sbert.net/) library (_recommended_). 144 | 2. 
[Hugging Face Transformers](#hugging-face-transformers): load our pretrained models with the [Hugging Face Transformers](https://github.com/huggingface/transformers) library. 145 | 3. [From this repo](#from-this-repo): import and initialize an object from this repo which can be used to embed sentences/paragraphs. 146 | 4. [Bulk embed](#bulk-embed-a-file): embed all text in a given text file with a simple command-line interface. 147 | 148 | The following pre-trained models are available: 149 | 150 | - [johngiorgi/declutr-small](https://huggingface.co/johngiorgi/declutr-small) 151 | - [johngiorgi/declutr-base](https://huggingface.co/johngiorgi/declutr-base) 152 | - [johngiorgi/declutr-sci-base](https://huggingface.co/johngiorgi/declutr-sci-base) 153 | 154 | #### SentenceTransformers 155 | 156 | Our pretrained models are hosted with Hugging Face Transformers, so they can easily be loaded in SentenceTransformers. Just make sure to [install the SentenceTransformers library](https://www.sbert.net/docs/installation.html) first. Here is a simple example 157 | 158 | ```python 159 | from sentence_transformers import SentenceTransformer 160 | 161 | # Load the model 162 | model = SentenceTransformer("johngiorgi/declutr-small") 163 | 164 | # Prepare some text to embed 165 | texts = [ 166 | "A smiling costumed woman is holding an umbrella.", 167 | "A happy woman in a fairy costume holds an umbrella.", 168 | ] 169 | 170 | # Embed the text 171 | embeddings = model.encode(texts) 172 | ``` 173 | 174 | These embeddings can then be used, for example, to compute the semantic similarity between some number of sentences or paragraphs 175 | 176 | ```python 177 | from scipy.spatial.distance import cosine 178 | 179 | semantic_sim = 1 - cosine(embeddings[0], embeddings[1]) 180 | ``` 181 | 182 | #### Hugging Face Transformers 183 | 184 | Alternatively, you can use the models straight from Hugging Face Transformers. This just requires a few extra steps. Here is a simple example 185 | 186 | ```python 187 | import torch 188 | from transformers import AutoModel, AutoTokenizer 189 | 190 | # Load the model 191 | tokenizer = AutoTokenizer.from_pretrained("johngiorgi/declutr-small") 192 | model = AutoModel.from_pretrained("johngiorgi/declutr-small") 193 | 194 | # Prepare some text to embed 195 | texts = [ 196 | "A smiling costumed woman is holding an umbrella.", 197 | "A happy woman in a fairy costume holds an umbrella.", 198 | ] 199 | inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt") 200 | 201 | # Embed the text 202 | with torch.no_grad(): 203 | sequence_output = model(**inputs)[0] 204 | 205 | # Mean pool the token-level embeddings to get sentence-level embeddings 206 | embeddings = torch.sum( 207 | sequence_output * inputs["attention_mask"].unsqueeze(-1), dim=1 208 | ) / torch.clamp(torch.sum(inputs["attention_mask"], dim=1, keepdims=True), min=1e-9) 209 | ``` 210 | 211 | #### From this repo 212 | 213 | To use the model directly from this repo, import `Encoder` and pass it some text (it accepts both strings and lists of strings) 214 | 215 | ```python 216 | from declutr import Encoder 217 | 218 | # This can be a path on disk to a model you have trained yourself OR 219 | # the name of one of our pretrained models. 220 | pretrained_model_or_path = "declutr-small" 221 | 222 | encoder = Encoder(pretrained_model_or_path) 223 | embeddings = encoder([ 224 | "A smiling costumed woman is holding an umbrella.", 225 | "A happy woman in a fairy costume holds an umbrella." 
226 | ]) 227 | ``` 228 | 229 | See the list of available `PRETRAINED_MODELS` in [declutr/encoder.py](declutr/encoder.py) 230 | 231 | ```bash 232 | python -c "from declutr.encoder import PRETRAINED_MODELS ; print(list(PRETRAINED_MODELS.keys()))" 233 | ``` 234 | 235 | #### Bulk embed a file 236 | 237 | To embed all text in a given file with a trained model, run the following command 238 | 239 | ```bash 240 | allennlp predict "output" "path/to/input.txt" \ 241 | --output-file "output/embeddings.jsonl" \ 242 | --batch-size 32 \ 243 | --cuda-device 0 \ 244 | --use-dataset-reader \ 245 | --overrides "{'dataset_reader.num_anchors': null}" \ 246 | --include-package "declutr" 247 | ``` 248 | 249 | This will: 250 | 251 | 1. Load the model serialized to `"output"` with the "best" weights (i.e. the ones that achieved the lowest loss during training). 252 | 2. Use that model to embed the text in the provided input file (`"path/to/input.txt"`). 253 | 3. Save the embeddings to disk as a [JSON lines](http://jsonlines.org/) file (`"output/embeddings.jsonl"`). 254 | 255 | The text embeddings are stored in the field `"embeddings"` in `"output/embeddings.jsonl"`. 256 | 257 | ### Evaluating with SentEval 258 | 259 | [SentEval](https://github.com/facebookresearch/SentEval) is a library for evaluating the quality of sentence embeddings. We provide a script to evaluate our model against SentEval, as well as a [notebook](https://colab.research.google.com/github/JohnGiorgi/DeCLUTR/blob/master/notebooks/evaluating.ipynb) that documents the process of evaluating a trained model. Broadly, the steps are the following: 260 | 261 | First, clone the SentEval repository and download the transfer task datasets (you only need to do this once) 262 | 263 | ```bash 264 | # Clone our fork which has several bug fixes merged 265 | git clone https://github.com/JohnGiorgi/SentEval.git 266 | cd SentEval/data/downstream/ 267 | ./get_transfer_data.bash 268 | cd ../../../ 269 | ``` 270 | 271 | > See the [SentEval](https://github.com/facebookresearch/SentEval) repository for full details. 272 | 273 | Then you can run our [script](scripts/run_senteval.py) to evaluate a trained model against SentEval 274 | 275 | ```bash 276 | python scripts/run_senteval.py allennlp "SentEval" "output" \ 277 | --output-filepath "output/senteval_results.json" \ 278 | --cuda-device 0 \ 279 | --include-package "declutr" 280 | ``` 281 | 282 | The results will be saved to `"output/senteval_results.json"`. This can be changed to any path you like. 283 | 284 | > Pass the flag `--prototyping-config` to get a proxy of the results while dramatically reducing computation time. 285 | 286 | For a list of commands, run 287 | 288 | ```bash 289 | python scripts/run_senteval.py --help 290 | ``` 291 | 292 | For help with a specific command, e.g. `allennlp`, run 293 | 294 | ```bash 295 | python scripts/run_senteval.py allennlp --help 296 | ``` 297 | 298 | ### Reproducing results 299 | 300 | To reproduce results from the paper, first follow the instructions to set up SentEval in [Evaluating with SentEval](#evaluating-with-senteval). Then, run 301 | 302 | ```bash 303 | python scripts/run_senteval.py transformers "SentEval" "johngiorgi/declutr-base" \ 304 | --output-filepath "senteval_results.json" \ 305 | --cuda-device 0 \ 306 | --mean-pool 307 | ``` 308 | 309 | `"johngiorgi/declutr-base"` can be replaced with (almost) any model on the [HuggingFace model hub](https://huggingface.co/models). Evaluation takes approximately 10-12 hours on an NVIDIA Tesla V100 GPU. 
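Once the run finishes, the scores can be inspected programmatically. The snippet below is a minimal sketch (not part of the repository); it assumes the output file is a JSON object mapping SentEval task names to their metrics, so check the file written by `scripts/run_senteval.py` for the exact structure:

```python
import json

# Load the SentEval results written by scripts/run_senteval.py.
# NOTE: this assumes a JSON object mapping task names to metric dicts;
# adjust the keys below to match the actual file contents.
with open("senteval_results.json", "r") as f:
    results = json.load(f)

# Print a one-line summary per task.
for task, metrics in results.items():
    print(f"{task}: {metrics}")
```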
310 | 311 | ## Citing 312 | 313 | If you use DeCLUTR in your work, please consider citing our paper 314 | 315 | ``` 316 | @inproceedings{giorgi-etal-2021-declutr, 317 | title = "{D}e{CLUTR}: Deep Contrastive Learning for Unsupervised Textual Representations", 318 | author = "Giorgi, John and 319 | Nitski, Osvald and 320 | Wang, Bo and 321 | Bader, Gary", 322 | booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)", 323 | month = aug, 324 | year = "2021", 325 | address = "Online", 326 | publisher = "Association for Computational Linguistics", 327 | url = "https://aclanthology.org/2021.acl-long.72", 328 | doi = "10.18653/v1/2021.acl-long.72", 329 | pages = "879--895", 330 | abstract = "Sentence embeddings are an important component of many natural language processing (NLP) systems. Like word embeddings, sentence embeddings are typically learned on large text corpora and then transferred to various downstream tasks, such as clustering and retrieval. Unlike word embeddings, the highest performing solutions for learning sentence embeddings require labelled data, limiting their usefulness to languages and domains where labelled data is abundant. In this paper, we present DeCLUTR: Deep Contrastive Learning for Unsupervised Textual Representations. Inspired by recent advances in deep metric learning (DML), we carefully design a self-supervised objective for learning universal sentence embeddings that does not require labelled training data. When used to extend the pretraining of transformer-based language models, our approach closes the performance gap between unsupervised and supervised pretraining for universal sentence encoders. Importantly, our experiments suggest that the quality of the learned embeddings scale with both the number of trainable parameters and the amount of unlabelled training data. Our code and pretrained models are publicly available and can be easily adapted to new domains or used to embed unseen text.", 331 | } 332 | ``` 333 | --------------------------------------------------------------------------------