├── larimar_base
├── pytorch_transformers
│   ├── tests
│   │   ├── __init__.py
│   │   ├── fixtures
│   │   │   ├── input.txt
│   │   │   ├── test_sentencepiece.model
│   │   │   └── sample_text.txt
│   │   ├── conftest.py
│   │   ├── tokenization_dilbert_test.py
│   │   ├── tokenization_auto_test.py
│   │   ├── tokenization_utils_test.py
│   │   ├── configuration_common_test.py
│   │   ├── tokenization_openai_test.py
│   │   ├── tokenization_transfo_xl_test.py
│   │   ├── tokenization_gpt2_test.py
│   │   ├── tokenization_xlm_test.py
│   │   ├── modeling_auto_test.py
│   │   ├── tokenization_roberta_test.py
│   │   ├── tokenization_xlnet_test.py
│   │   ├── tokenization_bert_test.py
│   │   └── optimization_test.py
│   ├── utils
│   │   ├── constants.py
│   │   ├── dummy_keras_nlp_objects.py
│   │   ├── dummy_sentencepiece_and_tokenizers_objects.py
│   │   ├── dummy_tensorflow_text_objects.py
│   │   ├── dummy_detectron2_objects.py
│   │   ├── dummy_music_objects.py
│   │   ├── dummy_speech_objects.py
│   │   ├── dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects.py
│   │   ├── bitsandbytes.py
│   │   ├── model_parallel_utils.py
│   │   ├── versions.py
│   │   ├── peft_utils.py
│   │   ├── hp_naming.py
│   │   ├── sentencepiece_model_pb2_new.py
│   │   └── dummy_sentencepiece_objects.py
│   ├── configuration_roberta.py
│   ├── dependency_versions_check.py
│   ├── model_parallel_utils.py
│   ├── tokenization_distilbert.py
│   ├── convert_tf_checkpoint_to_pytorch.py
│   ├── convert_xlm_checkpoint_to_pytorch.py
│   ├── convert_gpt2_checkpoint_to_pytorch.py
│   ├── convert_openai_checkpoint_to_pytorch.py
│   ├── dependency_versions_table.py
│   ├── configuration_distilbert.py
│   ├── tokenization_roberta.py
│   ├── convert_xlnet_checkpoint_to_pytorch.py
│   ├── __init__.py
│   ├── integrations
│   │   └── __init__.py
│   ├── convert_pytorch_checkpoint_to_tf.py
│   ├── configuration_openai.py
│   ├── convert_transfo_xl_checkpoint_to_pytorch.py
│   ├── configuration_bert.py
│   └── generation
│   │   └── stopping_criteria.py
├── modules
│   ├── encoders
│   │   ├── __init__.py
│   │   ├── encoder.py
│   │   ├── enc_lstm.py
│   │   └── gaussian_encoder.py
│   ├── __init__.py
│   ├── decoders
│   │   └── decoder.py
│   ├── utils.py
│   └── spacefusion.py
├── configs
│   ├── ds_config.json
│   ├── default_accelerate_config.yaml
│   └── config_train_larimar.yaml
├── eval.sh
├── eval_rephrase.sh
├── main_pl.py
├── train_larimar.sh
├── lightning_data.py
└── ddp.py
├── models
└── model_locations.json
├── larimar_architecture.png
├── data
├── wikipedia_data_locations.json
└── counterfactual_data_locations.json
├── requirements.txt
└── README.md
/larimar_base/pytorch_transformers/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/larimar_base/modules/encoders/__init__.py:
--------------------------------------------------------------------------------
1 | from .enc_lstm import *
--------------------------------------------------------------------------------
/models/model_locations.json:
--------------------------------------------------------------------------------
1 | {
2 |     "larimar-1.3b-c3": "./larimar-1.3b-c3.ckpt"
3 | }
4 |
--------------------------------------------------------------------------------
/larimar_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IBM/larimar/HEAD/larimar_architecture.png
--------------------------------------------------------------------------------
/larimar_base/pytorch_transformers/tests/fixtures/input.txt:
-------------------------------------------------------------------------------- 1 | Who was Jim Henson ? ||| Jim Henson was a puppeteer 2 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tests/fixtures/test_sentencepiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/larimar/HEAD/larimar_base/pytorch_transformers/tests/fixtures/test_sentencepiece.model -------------------------------------------------------------------------------- /data/wikipedia_data_locations.json: -------------------------------------------------------------------------------- 1 | { 2 | "wikipedia-64": "./wikipedia/blocksize_64", 3 | "wikipedia-128": "./wikipedia/blocksize_128", 4 | "wikipedia-256": "./wikipedia/blocksize_256" 5 | } 6 | -------------------------------------------------------------------------------- /larimar_base/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .encoders import * 2 | from .decoders import * 3 | from .vae import * 4 | from .utils import * 5 | from .spacefusion import * 6 | from .cara import * 7 | from .arae import * 8 | from .mem_vae import * 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sentence-transformers==3.3.1 2 | lightning 3 | deepspeed 4 | nltk 5 | boto3 6 | sacremoses 7 | tensorboard 8 | jupyterlab 9 | scipy 10 | scikit-learn 11 | jsonargparse[signatures] 12 | spacy 13 | pandas 14 | 15 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/utils/constants.py: -------------------------------------------------------------------------------- 1 | IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] 2 | IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225] 3 | IMAGENET_STANDARD_MEAN = [0.5, 0.5, 0.5] 4 | IMAGENET_STANDARD_STD = [0.5, 0.5, 0.5] 5 | OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] 6 | OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] 7 | -------------------------------------------------------------------------------- /data/counterfactual_data_locations.json: -------------------------------------------------------------------------------- 1 | { 2 | "attribute_snippets": "https://rome.baulab.info/data/dsets/attribute_snippets.json", 3 | "counterfactual": "https://rome.baulab.info/data/dsets/counterfact.json", 4 | "idf": "https://rome.baulab.info/data/dsets/idf.npy", 5 | "tfidf_vocab": "https://rome.baulab.info/data/dsets/tfidf_vocab.json" 6 | } -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/utils/dummy_keras_nlp_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 
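# DummyObject placeholders keep these names importable even when the optional backend is absent:
# instantiating TFGPT2Tokenizer without `keras_nlp` installed raises an informative ImportError via requires_backends.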
2 | from ..utils import DummyObject, requires_backends 3 | 4 | 5 | class TFGPT2Tokenizer(metaclass=DummyObject): 6 | _backends = ["keras_nlp"] 7 | 8 | def __init__(self, *args, **kwargs): 9 | requires_backends(self, ["keras_nlp"]) 10 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/utils/dummy_sentencepiece_and_tokenizers_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 2 | from ..utils import DummyObject, requires_backends 3 | 4 | 5 | SLOW_TO_FAST_CONVERTERS = None 6 | 7 | 8 | def convert_slow_tokenizer(*args, **kwargs): 9 | requires_backends(convert_slow_tokenizer, ["sentencepiece", "tokenizers"]) 10 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/utils/dummy_tensorflow_text_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 2 | from ..utils import DummyObject, requires_backends 3 | 4 | 5 | class TFBertTokenizer(metaclass=DummyObject): 6 | _backends = ["tensorflow_text"] 7 | 8 | def __init__(self, *args, **kwargs): 9 | requires_backends(self, ["tensorflow_text"]) 10 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/utils/dummy_detectron2_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 2 | from ..utils import requires_backends 3 | 4 | 5 | LAYOUTLM_V2_PRETRAINED_MODEL_ARCHIVE_LIST = None 6 | 7 | 8 | class LayoutLMv2Model: 9 | def __init__(self, *args, **kwargs): 10 | requires_backends(self, ["detectron2"]) 11 | 12 | @classmethod 13 | def from_pretrained(cls, *args, **kwargs): 14 | requires_backends(cls, ["detectron2"]) 15 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/utils/dummy_music_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 2 | from ..utils import DummyObject, requires_backends 3 | 4 | 5 | class Pop2PianoFeatureExtractor(metaclass=DummyObject): 6 | _backends = ["music"] 7 | 8 | def __init__(self, *args, **kwargs): 9 | requires_backends(self, ["music"]) 10 | 11 | 12 | class Pop2PianoTokenizer(metaclass=DummyObject): 13 | _backends = ["music"] 14 | 15 | def __init__(self, *args, **kwargs): 16 | requires_backends(self, ["music"]) 17 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/utils/dummy_speech_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 
2 | from ..utils import DummyObject, requires_backends 3 | 4 | 5 | class ASTFeatureExtractor(metaclass=DummyObject): 6 | _backends = ["speech"] 7 | 8 | def __init__(self, *args, **kwargs): 9 | requires_backends(self, ["speech"]) 10 | 11 | 12 | class Speech2TextFeatureExtractor(metaclass=DummyObject): 13 | _backends = ["speech"] 14 | 15 | def __init__(self, *args, **kwargs): 16 | requires_backends(self, ["speech"]) 17 | -------------------------------------------------------------------------------- /larimar_base/configs/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 2, 3 | "gradient_accumulation_steps": 4, 4 | "fp16": { 5 | "enabled": false, 6 | "min_loss_scale": 0.5, 7 | "fp16_scale_tolerance": 0.25, 8 | "opt_level": "O2" 9 | }, 10 | "bf16": { "enabled": true }, 11 | "zero_optimization": { 12 | "stage": 2, 13 | "offload_param": { 14 | "device": "cpu" 15 | }, 16 | "offload_optimizer": { 17 | "device": "cpu" 18 | }, 19 | "allgather_partitions": true, 20 | "allgather_bucket_size": 5e8, 21 | "contiguous_gradients": true 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # content of conftest.py 2 | 3 | import pytest 4 | 5 | 6 | def pytest_addoption(parser): 7 | parser.addoption( 8 | "--runslow", action="store_true", default=False, help="run slow tests" 9 | ) 10 | 11 | 12 | def pytest_collection_modifyitems(config, items): 13 | if config.getoption("--runslow"): 14 | # --runslow given in cli: do not skip slow tests 15 | return 16 | skip_slow = pytest.mark.skip(reason="need --runslow option to run") 17 | for item in items: 18 | if "slow" in item.keywords: 19 | item.add_marker(skip_slow) 20 | -------------------------------------------------------------------------------- /larimar_base/configs/default_accelerate_config.yaml: -------------------------------------------------------------------------------- 1 | command_file: null 2 | commands: null 3 | compute_environment: LOCAL_MACHINE 4 | deepspeed_config: 5 | deepspeed_config_file: configs/ds_config.json 6 | zero3_init_flag: false 7 | distributed_type: DEEPSPEED 8 | downcast_bf16: 'yes' 9 | dynamo_backend: 'NO' 10 | fsdp_config: {} 11 | gpu_ids: null 12 | machine_rank: 0 13 | main_process_ip: null 14 | main_process_port: null 15 | main_training_function: main 16 | megatron_lm_config: {} 17 | num_machines: 1 18 | num_processes: 1 19 | rdzv_backend: static 20 | same_network: true 21 | tpu_name: null 22 | tpu_zone: null 23 | use_cpu: false 24 | -------------------------------------------------------------------------------- /larimar_base/eval.sh: -------------------------------------------------------------------------------- 1 | mode=pyrite 2 | dataset=counterfact 3 | cache_dir=../cache 4 | checkpoint=../models/larimar-1.3b-c3.ckpt 5 | data_dir=../data/counterfact 6 | res_dir_name=../eval/results 7 | num_eval_cases=2000 8 | scope_detect_threshold=0.3 9 | 10 | # scope detection 11 | python counterfact_eval.py \ 12 | --mode ${mode} \ 13 | --dataset ${dataset} \ 14 | --cache_dir ${cache_dir} \ 15 | --checkpoint ${checkpoint} \ 16 | --data_dir ${data_dir} \ 17 | --res_dir_name ${res_dir_name} \ 18 | --num_eval_cases ${num_eval_cases} \ 19 | --scope_detect_threshold ${scope_detect_threshold} 20 | 21 | # no scope 22 | python counterfact_eval.py \ 23 | --mode ${mode} \ 24 | 
--dataset ${dataset} \ 25 | --cache_dir ${cache_dir} \ 26 | --checkpoint ${checkpoint} \ 27 | --data_dir ${data_dir} \ 28 | --res_dir_name ${res_dir_name} \ 29 | --num_eval_cases ${num_eval_cases} \ 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/utils/dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 2 | from ..utils import DummyObject, requires_backends 3 | 4 | 5 | class Pop2PianoFeatureExtractor(metaclass=DummyObject): 6 | _backends = ["essentia", "librosa", "pretty_midi", "scipy", "torch"] 7 | 8 | def __init__(self, *args, **kwargs): 9 | requires_backends(self, ["essentia", "librosa", "pretty_midi", "scipy", "torch"]) 10 | 11 | 12 | class Pop2PianoTokenizer(metaclass=DummyObject): 13 | _backends = ["essentia", "librosa", "pretty_midi", "scipy", "torch"] 14 | 15 | def __init__(self, *args, **kwargs): 16 | requires_backends(self, ["essentia", "librosa", "pretty_midi", "scipy", "torch"]) 17 | 18 | 19 | class Pop2PianoProcessor(metaclass=DummyObject): 20 | _backends = ["essentia", "librosa", "pretty_midi", "scipy", "torch"] 21 | 22 | def __init__(self, *args, **kwargs): 23 | requires_backends(self, ["essentia", "librosa", "pretty_midi", "scipy", "torch"]) 24 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/utils/bitsandbytes.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import warnings 15 | 16 | 17 | warnings.warn( 18 | "transformers.utils.bitsandbytes module is deprecated and will be removed in a future version. 
Please import bitsandbytes modules directly from transformers.integrations", 19 | FutureWarning, 20 | ) 21 | 22 | from ..integrations import ( # noqa 23 | get_keys_to_not_convert, 24 | replace_8bit_linear, 25 | replace_with_bnb_linear, 26 | set_module_8bit_tensor_to_device, 27 | set_module_quantized_tensor_to_device, 28 | ) 29 | -------------------------------------------------------------------------------- /larimar_base/eval_rephrase.sh: -------------------------------------------------------------------------------- 1 | mode=pyrite 2 | dataset=counterfact 3 | cache_dir=../cache 4 | checkpoint=../models/larimar-1.3b-c3.ckpt 5 | data_dir=../data/counterfact 6 | res_dir_name=../eval/results 7 | num_eval_cases=2000 8 | scope_detect_threshold=0.3 9 | 10 | # scope detection 11 | for num_rephrases in 0 1 2 12 | do 13 | python counterfact_eval_rephrase.py \ 14 | --mode ${mode} \ 15 | --dataset ${dataset} \ 16 | --cache_dir ${cache_dir} \ 17 | --checkpoint ${checkpoint} \ 18 | --data_dir ${data_dir} \ 19 | --res_dir_name ${res_dir_name} \ 20 | --num_eval_cases ${num_eval_cases} \ 21 | --num_rephrases ${num_rephrases} \ 22 | --remove_distraction \ 23 | --scope_detect_threshold ${scope_detect_threshold} 24 | done 25 | 26 | 27 | 28 | # no scope 29 | for num_rephrases in 0 1 2 30 | do 31 | python counterfact_eval_rephrase.py \ 32 | --mode ${mode} \ 33 | --dataset ${dataset} \ 34 | --cache_dir ${cache_dir} \ 35 | --checkpoint ${checkpoint} \ 36 | --data_dir ${data_dir} \ 37 | --res_dir_name ${res_dir_name} \ 38 | --num_eval_cases ${num_eval_cases} \ 39 | --num_rephrases ${num_rephrases} \ 40 | --remove_distraction 41 | done 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/configuration_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
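# RoBERTa keeps BERT's architecture, so RobertaConfig below subclasses BertConfig and overrides only the map of pretrained config archive URLs.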
16 | """ RoBERTa configuration """ 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import logging 22 | 23 | from .configuration_bert import BertConfig 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 28 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 29 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 30 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 31 | } 32 | 33 | 34 | class RobertaConfig(BertConfig): 35 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | -------------------------------------------------------------------------------- /larimar_base/modules/encoders/encoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | 5 | from ..utils import log_sum_exp 6 | 7 | class EncoderBase(nn.Module): 8 | """docstring for EncoderBase""" 9 | def __init__(self): 10 | super(EncoderBase, self).__init__() 11 | 12 | def forward(self, x): 13 | """ 14 | Args: 15 | x: (batch_size, *) 16 | Returns: the tensors required to parameterize a distribution. 17 | E.g. for Gaussian encoder it returns the mean and variance tensors 18 | """ 19 | 20 | raise NotImplementedError 21 | 22 | def sample(self, input, nsamples): 23 | """sampling from the encoder 24 | Returns: Tensor1 25 | Tensor1: the tensor latent z with shape [batch, nsamples, nz] 26 | """ 27 | 28 | raise NotImplementedError 29 | 30 | def encode(self, input, nsamples): 31 | """perform the encoding and compute the KL term 32 | Returns: Tensor1, Tensor2 33 | Tensor1: the tensor latent z with shape [batch, nsamples, nz] 34 | Tensor2: the tenor of KL for each x with shape [batch] 35 | """ 36 | 37 | raise NotImplementedError 38 | 39 | 40 | def eval_inference_dist(self, x, z, param=None): 41 | """this function computes log q(z | x) 42 | Args: 43 | z: tensor 44 | different z points that will be evaluated, with 45 | shape [batch, nsamples, nz] 46 | Returns: Tensor1 47 | Tensor1: log q(z|x) with shape [batch, nsamples] 48 | """ 49 | 50 | raise NotImplementedError 51 | 52 | def calc_mi(self, x): 53 | """Approximate the mutual information between x and z 54 | I(x, z) = E_xE_{q(z|x)}log(q(z|x)) - E_xE_{q(z|x)}log(q(z)) 55 | Returns: Float 56 | """ 57 | 58 | raise NotImplementedError -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tests/tokenization_dilbert_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | from io import open 20 | 21 | from pytorch_transformers.tokenization_distilbert import (DistilBertTokenizer) 22 | 23 | from .tokenization_tests_commons import CommonTestCases 24 | from .tokenization_bert_test import BertTokenizationTest 25 | 26 | class DistilBertTokenizationTest(BertTokenizationTest): 27 | 28 | tokenizer_class = DistilBertTokenizer 29 | 30 | def get_tokenizer(self, **kwargs): 31 | return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs) 32 | 33 | def test_sequence_builders(self): 34 | tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") 35 | 36 | text = tokenizer.encode("sequence builders") 37 | text_2 = tokenizer.encode("multi-sequence build") 38 | 39 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 40 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 41 | 42 | assert encoded_sentence == [101] + text + [102] 43 | assert encoded_pair == [101] + text + [102] + text_2 + [102] 44 | 45 | if __name__ == '__main__': 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tests/tokenization_auto_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
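# Smoke test for AutoTokenizer dispatch: a bert-* checkpoint name must yield a BertTokenizer, a gpt2-* name a GPT2Tokenizer.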
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import shutil 21 | import pytest 22 | import logging 23 | 24 | from pytorch_transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer 25 | from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP 26 | from pytorch_transformers.modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP 27 | 28 | 29 | class AutoTokenizerTest(unittest.TestCase): 30 | def test_tokenizer_from_pretrained(self): 31 | logging.basicConfig(level=logging.INFO) 32 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 33 | tokenizer = AutoTokenizer.from_pretrained(model_name) 34 | self.assertIsNotNone(tokenizer) 35 | self.assertIsInstance(tokenizer, BertTokenizer) 36 | self.assertGreater(len(tokenizer), 0) 37 | 38 | for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 39 | tokenizer = AutoTokenizer.from_pretrained(model_name) 40 | self.assertIsNotNone(tokenizer) 41 | self.assertIsInstance(tokenizer, GPT2Tokenizer) 42 | self.assertGreater(len(tokenizer), 0) 43 | 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tests/tokenization_utils_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 HuggingFace Inc.. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import six 21 | 22 | from pytorch_transformers import PreTrainedTokenizer 23 | from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer 24 | 25 | class TokenizerUtilsTest(unittest.TestCase): 26 | def check_tokenizer_from_pretrained(self, tokenizer_class): 27 | s3_models = list(tokenizer_class.max_model_input_sizes.keys()) 28 | for model_name in s3_models[:1]: 29 | tokenizer = tokenizer_class.from_pretrained(model_name) 30 | self.assertIsNotNone(tokenizer) 31 | self.assertIsInstance(tokenizer, tokenizer_class) 32 | self.assertIsInstance(tokenizer, PreTrainedTokenizer) 33 | 34 | for special_tok in tokenizer.all_special_tokens: 35 | if six.PY2: 36 | self.assertIsInstance(special_tok, unicode) 37 | else: 38 | self.assertIsInstance(special_tok, str) 39 | special_tok_id = tokenizer.convert_tokens_to_ids(special_tok) 40 | self.assertIsInstance(special_tok_id, int) 41 | 42 | def test_pretrained_tokenizers(self): 43 | self.check_tokenizer_from_pretrained(GPT2Tokenizer) 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Larimar 2 | This repo contains a reference implementation of the paper 3 | [Larimar: Large Language Models with Episodic Memory Control](https://research.ibm.com/publications/larimar-large-language-models-with-episodic-memory-control). 4 | 5 |

6 | ![Larimar architecture](larimar_architecture.png)
7 |

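The memory behaves like a linear key-value store over the latent space: an episode of encoded sentences is written into a memory matrix as a least-squares solution, and queries are read back through inferred addressing weights. The toy below is an editor's sketch of that idea, not the implementation in `larimar_base/modules`; only `latent_size: 768` and `memory_size: 512` come from `configs/config_train_larimar.yaml`, while the random addressing keys and single-shot read are illustrative simplifications.

```
import torch

latent_size, memory_size = 768, 512  # from configs/config_train_larimar.yaml

def write_episode(Z: torch.Tensor) -> torch.Tensor:
    """Write episode encodings Z (n_episodes, latent_size) into memory.

    Toy scheme: draw addressing keys W, then store the least-squares
    solution M of W @ M = Z using the Moore-Penrose pseudoinverse.
    """
    W = torch.randn(Z.shape[0], memory_size)  # hypothetical addressing keys
    return torch.linalg.pinv(W) @ Z           # M: (memory_size, latent_size)

def read(z_query: torch.Tensor, M: torch.Tensor) -> torch.Tensor:
    """Read: infer addressing weights for the query, then decode w @ M."""
    w = z_query @ torch.linalg.pinv(M)        # least-squares addressing weights
    return w @ M

Z = torch.randn(16, latent_size)              # one episode of 16 encodings
M = write_episode(Z)
print(torch.allclose(read(Z[0], M), Z[0], atol=1e-3))  # reads back ~exactly
```

Writing by pseudoinverse is what makes memory updates cheap in this family of models: editing is a least-squares solve at inference time, not gradient-based retraining.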
8 |
9 |
10 | ## Install
11 |
12 | ```
13 | conda create --prefix envs/larimar python=3.10 -y
14 | conda activate envs/larimar
15 | pip install -r requirements.txt
16 | python -m nltk.downloader punkt_tab
17 | python -m spacy download en_core_web_sm
18 | ```
19 |
20 | ## Use
21 | Single-fact editing demo notebook: `larimar_base/single_fact_editing_demo.ipynb`.
22 |
23 | It assumes a trained `larimar-1.3b` model checkpoint is available as `../models/larimar-1.3b-c3.ckpt`; see the training instructions below.
24 |
25 |
26 | ## Train
27 | To train a `larimar-1.3b` model, first download and extract under `../data` [this dataset tarball](https://ibm.box.com/shared/static/d90td7ycpv3u9jw4i1mecv1mt24heq3t.gz), then:
28 |
29 | ```
30 | cd larimar_base/
31 | bash train_larimar.sh
32 | ```
33 | This trains the model with configuration C3 as in the paper. Adjust to your environment by editing the related entries in `train_larimar.sh` and `configs/config_train_larimar.yaml` before launching.
34 |
35 |
36 | ## Evaluate
37 | Choose the Larimar model to evaluate in `eval.sh` and `eval_rephrase.sh`, then run:
38 |
39 | ```
40 | cd larimar_base/
41 | bash eval.sh
42 | bash eval_rephrase.sh
43 | ```
44 |
45 |
46 | ## Citation
47 | ```
48 | @misc{das2024larimarlargelanguagemodels,
49 |       title={Larimar: Large Language Models with Episodic Memory Control},
50 |       author={Payel Das and Subhajit Chaudhury and Elliot Nelson and Igor Melnyk and Sarath Swaminathan and Sihui Dai and Aurélie Lozano and Georgios Kollias and Vijil Chenthamarakshan and Jiří Navrátil and Soham Dan and Pin-Yu Chen},
51 |       year={2024},
52 |       eprint={2403.11901},
53 |       archivePrefix={arXiv},
54 |       primaryClass={cs.LG},
55 |       url={https://arxiv.org/abs/2403.11901},
56 | }
57 | ```
58 |
--------------------------------------------------------------------------------
/larimar_base/modules/decoders/decoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
5 | class DecoderBase(nn.Module):
6 |     """docstring for Decoder"""
7 |     def __init__(self):
8 |         super(DecoderBase, self).__init__()
9 |
10 |
11 |     def freeze(self):
12 |         for param in self.parameters():
13 |             param.requires_grad = False
14 |
15 |     def decode(self, x, z):
16 |         """
17 |         Args:
18 |             x: (batch_size, seq_len)
19 |             z: (batch_size, n_sample, nz)
20 |         Returns: Tensor1
21 |             Tensor1: the output logits with size (batch_size * n_sample, seq_len, vocab_size)
22 |         """
23 |
24 |         raise NotImplementedError
25 |
26 |     def reconstruct_error(self, x, z):
27 |         """reconstruction loss
28 |         Args:
29 |             x: (batch_size, *)
30 |             z: (batch_size, n_sample, nz)
31 |         Returns:
32 |             loss: (batch_size, n_sample). Loss
33 |             across different sentences and z
34 |         """
35 |
36 |         raise NotImplementedError
37 |
38 |     def beam_search_decode(self, z, K):
39 |         """beam search decoding
40 |         Args:
41 |             z: (batch_size, nz)
42 |             K: the beam size
43 |         Returns: List1
44 |             List1: the decoded word sentence list
45 |         """
46 |
47 |         raise NotImplementedError
48 |
49 |     def sample_decode(self, z):
50 |         """sampling from z
51 |         Args:
52 |             z: (batch_size, nz)
53 |         Returns: List1
54 |             List1: the decoded word sentence list
55 |         """
56 |
57 |         raise NotImplementedError
58 |
59 |     def greedy_decode(self, z):
60 |         """greedy decoding from z
61 |         Args:
62 |             z: (batch_size, nz)
63 |         Returns: List1
64 |             List1: the decoded word sentence list
65 |         """
66 |
67 |         raise NotImplementedError
68 |
69 |     def log_probability(self, x, z):
70 |         """
71 |         Args:
72 |             x: (batch_size, *)
73 |             z: (batch_size, n_sample, nz)
74 |         Returns:
75 |             log_p: (batch_size, n_sample).
76 |                 log_p(x|z) across different x and z
77 |         """
78 |
79 |         raise NotImplementedError
--------------------------------------------------------------------------------
/larimar_base/pytorch_transformers/dependency_versions_check.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
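# Import-time guard: each package in pkgs_to_check_at_runtime is version-checked against the pins in dependency_versions_table; tokenizers and accelerate are only checked when actually installed.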
14 | 15 | from .dependency_versions_table import deps 16 | from .utils.versions import require_version, require_version_core 17 | 18 | 19 | # define which module versions we always want to check at run time 20 | # (usually the ones defined in `install_requires` in setup.py) 21 | # 22 | # order specific notes: 23 | # - tqdm must be checked before tokenizers 24 | 25 | pkgs_to_check_at_runtime = [ 26 | "python", 27 | "tqdm", 28 | "regex", 29 | "requests", 30 | "packaging", 31 | "filelock", 32 | "numpy", 33 | "tokenizers", 34 | "huggingface-hub", 35 | "safetensors", 36 | "accelerate", 37 | "pyyaml", 38 | ] 39 | 40 | for pkg in pkgs_to_check_at_runtime: 41 | if pkg in deps: 42 | if pkg == "tokenizers": 43 | # must be loaded here, or else tqdm check may fail 44 | from .utils import is_tokenizers_available 45 | 46 | if not is_tokenizers_available(): 47 | continue # not required, check version only if installed 48 | elif pkg == "accelerate": 49 | # must be loaded here, or else tqdm check may fail 50 | from .utils import is_accelerate_available 51 | 52 | # Maybe switch to is_torch_available in the future here so that Accelerate is hard dep of 53 | # Transformers with PyTorch 54 | if not is_accelerate_available(): 55 | continue # not required, check version only if installed 56 | 57 | require_version_core(deps[pkg]) 58 | else: 59 | raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py") 60 | 61 | 62 | def dep_version_check(pkg, hint=None): 63 | require_version(deps[pkg], hint) 64 | -------------------------------------------------------------------------------- /larimar_base/configs/config_train_larimar.yaml: -------------------------------------------------------------------------------- 1 | ## MODEL 2 | model: 3 | # Encoder 4 | encoder_model_type: bert 5 | encoder_model_name_or_path: bert-base-cased 6 | cache_dir: '../cache' 7 | load_pretrained: false 8 | 9 | # Decoder 10 | decoder_model_type: gpt2 11 | decoder_model_name_or_path: gpt2 12 | 13 | # Auto-encoder 14 | latent_size: 768 15 | do_lower_case: false 16 | block_size: 64 17 | 18 | # Memory 19 | memory_size: 512 20 | direct_writing: true 21 | ordering: false 22 | pseudoinverse_approx_step: 15 23 | episode_sizes: [16] 24 | observation_noise_std: 0.000001 25 | identity: true 26 | w_logvar_setting: 3 27 | deterministic_w: false 28 | 29 | # Training 30 | learning_rate: 5e-5 31 | adam_epsilon: 1e-8 32 | warmup_steps: 0 33 | weight_decay: 0.0 34 | mlm: false 35 | mlm_probability: 0.15 36 | dim_target_kl: 0 37 | length_weighted_loss: false 38 | rec_strength: 1.0 39 | ae_strength: 1.0 40 | l2_strength: 0 41 | decode_rec_strength: 0.0 42 | beta: 0.5 43 | use_beta_schedule: true 44 | ratio_increase: 0.25 45 | ratio_zero: 0.5 46 | fb_mode: 1 47 | optimizer: adamw # or fusedadam or deepspeed 48 | 49 | # Evaluation 50 | bleu: false 51 | ae_only: true 52 | ae_read_write: true 53 | num_samples: 100 54 | read_iters: 1 55 | perturb: "" 56 | 57 | # Sampling 58 | temperature: 1 59 | top_k: 0 60 | top_p: 1 61 | 62 | ## DATA 63 | data: 64 | train_data_file: '../data/wikipedia/train.txt' 65 | eval_data_file: '../data/wikipedia/test.txt' 66 | num_data_workers: 4 67 | train_batch_size: 64 68 | eval_batch_size: 64 69 | max_seq_length: 512 70 | batches_per_bucket: 100 71 | use_labels: 0 72 | dataset: 'Wikipedia' 73 | use_philly: false # action='store_true' 74 | 75 | trainer: 76 | max_epochs: 4 77 | limit_val_batches: 0 78 | reload_dataloaders_every_n_epochs: 1 # to ensure reshuffling of data buckets 79 | limit_val_batches: 0 # don't 
run eval during training 80 | num_sanity_val_steps : 0 # don't run eval at the beginning 81 | default_root_dir: '../train/larimar/checkpoints/bert-base-cased-gpt2-wiki' 82 | callbacks: 83 | class_path: 'lightning.pytorch.callbacks.ModelCheckpoint' 84 | init_args: 85 | # every_n_epochs: 1 86 | # save_top_k: 3 87 | monitor: train/LOSS 88 | logger: 89 | class_path: 'lightning.pytorch.loggers.TensorBoardLogger' 90 | init_args: 91 | save_dir: '../train/larimar' 92 | name: '' 93 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/model_parallel_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from math import ceil 17 | 18 | 19 | def assert_device_map(device_map, num_blocks): 20 | blocks = list(range(0, num_blocks)) 21 | 22 | device_map_blocks = [item for sublist in list(device_map.values()) for item in sublist] 23 | 24 | # Duplicate check 25 | duplicate_blocks = [] 26 | for i in device_map_blocks: 27 | if device_map_blocks.count(i) > 1 and i not in duplicate_blocks: 28 | duplicate_blocks.append(i) 29 | # Missing blocks 30 | missing_blocks = [i for i in blocks if i not in device_map_blocks] 31 | extra_blocks = [i for i in device_map_blocks if i not in blocks] 32 | 33 | if len(duplicate_blocks) != 0: 34 | raise ValueError( 35 | "Duplicate attention blocks specified in device_map. Attention blocks must be specified to one device." 36 | " These attention blocks were specified more than once: " + str(duplicate_blocks) 37 | ) 38 | if len(missing_blocks) != 0: 39 | raise ValueError( 40 | "There are attention blocks for this model that are not specified in the device_map. Add these attention " 41 | "blocks to a device on the device_map: " + str(missing_blocks) 42 | ) 43 | if len(extra_blocks) != 0: 44 | raise ValueError( 45 | "The device_map contains more attention blocks than this model has. Remove these from the device_map:" 46 | + str(extra_blocks) 47 | ) 48 | 49 | 50 | def get_device_map(n_layers, devices): 51 | """Returns a dictionary of layers distributed evenly across all devices.""" 52 | layers = list(range(n_layers)) 53 | n_blocks = int(ceil(n_layers / len(devices))) 54 | layers_list = [layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks)] 55 | 56 | return dict(zip(devices, layers_list)) 57 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/utils/model_parallel_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from math import ceil 17 | 18 | 19 | def assert_device_map(device_map, num_blocks): 20 | blocks = list(range(0, num_blocks)) 21 | 22 | device_map_blocks = [item for sublist in list(device_map.values()) for item in sublist] 23 | 24 | # Duplicate check 25 | duplicate_blocks = [] 26 | for i in device_map_blocks: 27 | if device_map_blocks.count(i) > 1 and i not in duplicate_blocks: 28 | duplicate_blocks.append(i) 29 | # Missing blocks 30 | missing_blocks = [i for i in blocks if i not in device_map_blocks] 31 | extra_blocks = [i for i in device_map_blocks if i not in blocks] 32 | 33 | if len(duplicate_blocks) != 0: 34 | raise ValueError( 35 | "Duplicate attention blocks specified in device_map. Attention blocks must be specified to one device." 36 | " These attention blocks were specified more than once: " + str(duplicate_blocks) 37 | ) 38 | if len(missing_blocks) != 0: 39 | raise ValueError( 40 | "There are attention blocks for this model that are not specified in the device_map. Add these attention " 41 | "blocks to a device on the device_map: " + str(missing_blocks) 42 | ) 43 | if len(extra_blocks) != 0: 44 | raise ValueError( 45 | "The device_map contains more attention blocks than this model has. Remove these from the device_map:" 46 | + str(extra_blocks) 47 | ) 48 | 49 | 50 | def get_device_map(n_layers, devices): 51 | """Returns a dictionary of layers distributed evenly across all devices.""" 52 | layers = list(range(n_layers)) 53 | n_blocks = int(ceil(n_layers / len(devices))) 54 | layers_list = [layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks)] 55 | 56 | return dict(zip(devices, layers_list)) 57 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tests/configuration_common_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 HuggingFace Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import copy 20 | import os 21 | import shutil 22 | import json 23 | import random 24 | import uuid 25 | 26 | import unittest 27 | import logging 28 | 29 | 30 | class ConfigTester(object): 31 | def __init__(self, parent, config_class=None, **kwargs): 32 | self.parent = parent 33 | self.config_class = config_class 34 | self.inputs_dict = kwargs 35 | 36 | def create_and_test_config_common_properties(self): 37 | config = self.config_class(**self.inputs_dict) 38 | self.parent.assertTrue(hasattr(config, 'vocab_size')) 39 | self.parent.assertTrue(hasattr(config, 'hidden_size')) 40 | self.parent.assertTrue(hasattr(config, 'num_attention_heads')) 41 | self.parent.assertTrue(hasattr(config, 'num_hidden_layers')) 42 | 43 | def create_and_test_config_to_json_string(self): 44 | config = self.config_class(**self.inputs_dict) 45 | obj = json.loads(config.to_json_string()) 46 | for key, value in self.inputs_dict.items(): 47 | self.parent.assertEqual(obj[key], value) 48 | 49 | def create_and_test_config_to_json_file(self): 50 | config_first = self.config_class(**self.inputs_dict) 51 | json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json") 52 | config_first.to_json_file(json_file_path) 53 | config_second = self.config_class.from_json_file(json_file_path) 54 | os.remove(json_file_path) 55 | self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) 56 | 57 | def run_common_tests(self): 58 | self.create_and_test_config_common_properties() 59 | self.create_and_test_config_to_json_string() 60 | self.create_and_test_config_to_json_file() 61 | 62 | if __name__ == "__main__": 63 | unittest.main() -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
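# DistilBERT reuses BERT's WordPiece vocabulary, so DistilBertTokenizer is a thin BertTokenizer subclass that only redefines the pretrained vocab URLs and max input sizes.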
15 | """Tokenization classes for DistilBERT.""" 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import collections 20 | import logging 21 | import os 22 | import unicodedata 23 | from io import open 24 | 25 | from .tokenization_bert import BertTokenizer 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} 30 | 31 | PRETRAINED_VOCAB_FILES_MAP = { 32 | 'vocab_file': 33 | { 34 | 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 35 | 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 36 | } 37 | } 38 | 39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 40 | 'distilbert-base-uncased': 512, 41 | 'distilbert-base-uncased-distilled-squad': 512, 42 | } 43 | 44 | 45 | class DistilBertTokenizer(BertTokenizer): 46 | r""" 47 | Constructs a DistilBertTokenizer. 48 | :class:`~pytorch_transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece 49 | 50 | Args: 51 | vocab_file: Path to a one-wordpiece-per-line vocabulary file 52 | do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False 53 | do_basic_tokenize: Whether to do basic tokenization before wordpiece. 54 | max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the 55 | minimum of this value (if specified) and the underlying BERT model's sequence length. 56 | never_split: List of tokens which will never be split during tokenization. Only has an effect when 57 | do_wordpiece_only=False 58 | """ 59 | 60 | vocab_files_names = VOCAB_FILES_NAMES 61 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 62 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 63 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import argparse 22 | import torch 23 | 24 | from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 25 | 26 | import logging 27 | logging.basicConfig(level=logging.INFO) 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | ## Required parameters 46 | parser.add_argument("--tf_checkpoint_path", 47 | default = None, 48 | type = str, 49 | required = True, 50 | help = "Path to the TensorFlow checkpoint path.") 51 | parser.add_argument("--bert_config_file", 52 | default = None, 53 | type = str, 54 | required = True, 55 | help = "The config json file corresponding to the pre-trained BERT model. \n" 56 | "This specifies the model architecture.") 57 | parser.add_argument("--pytorch_dump_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the output PyTorch model.") 62 | args = parser.parse_args() 63 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 64 | args.bert_config_file, 65 | args.pytorch_dump_path) 66 | -------------------------------------------------------------------------------- /larimar_base/main_pl.py: -------------------------------------------------------------------------------- 1 | import lightning 2 | from lightning.pytorch.cli import LightningCLI 3 | from lightning_model import MemNetLight 4 | from lightning_data import DataModule 5 | import os 6 | import subprocess 7 | 8 | 9 | def fix_infiniband(): 10 | ibv = subprocess.run('ibv_devinfo', stdout=subprocess.PIPE, stderr=subprocess.PIPE) 11 | lines = ibv.stdout.decode('utf-8').split('\n') 12 | exclude = '' 13 | for line in lines: 14 | if 'hca_id:' in line: 15 | name = line.split(':')[1].strip() 16 | if '\tport:' in line: 17 | port = line.split(':')[1].strip() 18 | if 'link_layer:' in line and 'Ethernet' in line: 19 | exclude = exclude + f'{name}:{port},' 20 | 21 | if exclude: 22 | exclude = '^' + exclude[:-1] 23 | print(exclude) 24 | os.environ['NCCL_IB_HCA'] = exclude 25 | 26 | 27 | def set_env(master_port): 28 | LSB_MCPU_HOSTS = os.environ["LSB_MCPU_HOSTS"].split(' ') # Parses Node list set by LSF, in format hostname proceeded by number of cores requested 29 | HOST_LIST = LSB_MCPU_HOSTS[::2] # Strips the cores per node items in the list 30 | os.environ["MASTER_ADDR"] = HOST_LIST[0] # Sets the MasterNode to thefirst node on the list of hosts 31 | os.environ["MASTER_PORT"] = master_port 32 | os.environ["NODE_RANK"] = str(HOST_LIST.index(os.environ["HOSTNAME"])) # Uses the list index for node rank, master node rank must be 0 33 | os.environ["NCCL_SOCKET_IFNAME"] = 'ib,bond' #"^docker0,lo" # avoids using docker of loopback interface 34 | os.environ["NCCL_DEBUG"] = "INFO" # sets NCCL debug to info, during distributed training, bugs in code show up as nccl errors 35 | 
os.environ["NCCL_IB_CUDA_SUPPORT"] = '1' # Force use of infiniband 36 | 37 | 38 | class MyLightningCLI(LightningCLI): 39 | def add_arguments_to_parser(self, parser): 40 | parser.link_arguments("model.block_size", "data.block_size") 41 | parser.link_arguments("model.perturb", "data.perturb") 42 | parser.link_arguments("model.encoder_model_type", "data.encoder_model_type") 43 | parser.link_arguments("model.encoder_model_name_or_path", "data.encoder_model_name_or_path") 44 | parser.link_arguments("model.decoder_model_type", "data.decoder_model_type") 45 | parser.link_arguments("model.decoder_model_name_or_path", "data.decoder_model_name_or_path") 46 | parser.link_arguments("model.cache_dir", "data.cache_dir") 47 | parser.link_arguments("model.do_lower_case", "data.do_lower_case") 48 | 49 | 50 | def cli_main(): 51 | MyLightningCLI(model_class=MemNetLight, datamodule_class=DataModule, save_config_kwargs={"overwrite": True}) 52 | #lightning.Trainer 53 | #lightning.pytorch.callbacks.ModelCheckpoint 54 | 55 | 56 | if __name__ == "__main__": 57 | fix_infiniband() 58 | set_env('53108') 59 | cli_main() 60 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tests/tokenization_openai_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | import json 20 | 21 | from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES 22 | 23 | from .tokenization_tests_commons import CommonTestCases 24 | 25 | 26 | class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester): 27 | 28 | tokenizer_class = OpenAIGPTTokenizer 29 | 30 | def setUp(self): 31 | super(OpenAIGPTTokenizationTest, self).setUp() 32 | 33 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt
34 |         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
35 |                  "w</w>", "r</w>", "t</w>",
36 |                  "lo", "low", "er</w>",
37 |                  "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
38 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
39 |         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
40 |
41 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
42 |         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
43 |         with open(self.vocab_file, "w") as fp:
44 |             fp.write(json.dumps(vocab_tokens))
45 |         with open(self.merges_file, "w") as fp:
46 |             fp.write("\n".join(merges))
47 |
48 |     def get_tokenizer(self, **kwargs):
49 |         return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs)
50 |
51 |     def get_input_output_texts(self):
52 |         input_text = u"lower newer"
53 |         output_text = u"lower newer"
54 |         return input_text, output_text
55 |
56 |
57 |     def test_full_tokenizer(self):
58 |         tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file)
59 |
60 |         text = "lower"
61 |         bpe_tokens = ["low", "er</w>"]
62 |         tokens = tokenizer.tokenize(text)
63 |         self.assertListEqual(tokens, bpe_tokens)
64 |
65 |         input_tokens = tokens + ["<unk>"]
66 |         input_bpe_tokens = [14, 15, 20]
67 |         self.assertListEqual(
68 |             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
69 |
70 |
71 | if __name__ == '__main__':
72 |     unittest.main()
73 |
--------------------------------------------------------------------------------
/larimar_base/pytorch_transformers/tests/tokenization_transfo_xl_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
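# TransfoXLTokenizer is word-level rather than subword: the tests below cover vocab lookups with <unk> fallback and the lower_case option.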
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 |
17 | import os
18 | import unittest
19 | from io import open
20 |
21 | from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
22 |
23 | from .tokenization_tests_commons import CommonTestCases
24 |
25 | class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
26 |
27 |     tokenizer_class = TransfoXLTokenizer
28 |
29 |     def setUp(self):
30 |         super(TransfoXLTokenizationTest, self).setUp()
31 |
32 |         vocab_tokens = [
33 |             "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
34 |             "running", ",", "low", "l",
35 |         ]
36 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
37 |         with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
38 |             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
39 |
40 |     def get_tokenizer(self, **kwargs):
41 |         kwargs['lower_case'] = True
42 |         return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
43 |
44 |     def get_input_output_texts(self):
45 |         input_text = u"<unk> UNwanted , running"
46 |         output_text = u"<unk> unwanted, running"
47 |         return input_text, output_text
48 |
49 |     def test_full_tokenizer(self):
50 |         tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True)
51 |
52 |         tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
53 |         self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
54 |
55 |         self.assertListEqual(
56 |             tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
57 |
58 |     def test_full_tokenizer_lower(self):
59 |         tokenizer = TransfoXLTokenizer(lower_case=True)
60 |
61 |         self.assertListEqual(
62 |             tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "),
63 |             ["hello", "!", "how", "are", "you", "?"])
64 |
65 |     def test_full_tokenizer_no_lower(self):
66 |         tokenizer = TransfoXLTokenizer(lower_case=False)
67 |
68 |         self.assertListEqual(
69 |             tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "),
70 |             ["HeLLo", "!", "how", "Are", "yoU", "?"])
71 |
72 |
73 | if __name__ == '__main__':
74 |     unittest.main()
75 |
--------------------------------------------------------------------------------
/larimar_base/pytorch_transformers/tests/tokenization_gpt2_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 |
17 | import os
18 | import unittest
19 | import json
20 | from io import open
21 |
22 | from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
23 |
24 | from .tokenization_tests_commons import CommonTestCases
25 |
26 | class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
27 |
28 |     tokenizer_class = GPT2Tokenizer
29 |
30 |     def setUp(self):
31 |         super(GPT2TokenizationTest, self).setUp()
32 |
33 |         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
2015 and https://github.com/rsennrich/subword-nmt 34 | vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", 35 | "\u0120", "\u0120l", "\u0120n", 36 | "\u0120lo", "\u0120low", "er", 37 | "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"] 38 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 39 | merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] 40 | self.special_tokens_map = {"unk_token": "<unk>"} 41 | 42 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 43 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) 44 | with open(self.vocab_file, "w", encoding="utf-8") as fp: 45 | fp.write(json.dumps(vocab_tokens) + "\n") 46 | with open(self.merges_file, "w", encoding="utf-8") as fp: 47 | fp.write("\n".join(merges)) 48 | 49 | def get_tokenizer(self, **kwargs): 50 | kwargs.update(self.special_tokens_map) 51 | return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) 52 | 53 | def get_input_output_texts(self): 54 | input_text = u"lower newer" 55 | output_text = u" lower newer" 56 | return input_text, output_text 57 | 58 | def test_full_tokenizer(self): 59 | tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) 60 | text = "lower newer" 61 | bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] 62 | tokens = tokenizer.tokenize(text) 63 | self.assertListEqual(tokens, bpe_tokens) 64 | 65 | input_tokens = tokens + [tokenizer.unk_token] 66 | input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] 67 | self.assertListEqual( 68 | tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 69 | 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
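# Example invocation (illustrative paths; the input is a checkpoint saved by the original facebookresearch/XLM code):
#   python convert_xlm_checkpoint_to_pytorch.py \
#       --xlm_checkpoint_path ./mlm_en_2048.pth \
#       --pytorch_dump_folder_path ./xlm-pytorch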
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import json 21 | from io import open 22 | 23 | import torch 24 | import numpy 25 | 26 | from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME 27 | from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): 33 | # Load checkpoint 34 | chkpt = torch.load(xlm_checkpoint_path, map_location='cpu') 35 | 36 | model = chkpt['model'] 37 | 38 | config = chkpt['params'] 39 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) 40 | 41 | vocab = chkpt['dico_word2id'] 42 | vocab = dict((s + '' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items()) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['vocab_file'] 48 | 49 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 50 | torch.save(model, pytorch_weights_dump_path) 51 | 52 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 53 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 54 | f.write(json.dumps(config, indent=2) + "\n") 55 | 56 | print("Save vocab file to {}".format(pytorch_config_dump_path)) 57 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: 58 | f.write(json.dumps(vocab, indent=2) + "\n") 59 | 60 | 61 | if __name__ == "__main__": 62 | parser = argparse.ArgumentParser() 63 | ## Required parameters 64 | parser.add_argument("--xlm_checkpoint_path", 65 | default = None, 66 | type = str, 67 | required = True, 68 | help = "Path the official PyTorch dump.") 69 | parser.add_argument("--pytorch_dump_folder_path", 70 | default = None, 71 | type = str, 72 | required = True, 73 | help = "Path to the output PyTorch model.") 74 | args = parser.parse_args() 75 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, 25 | GPT2Config, 26 | GPT2Model, 27 | load_tf_weights_in_gpt2) 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | 33 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 34 | # Construct model 35 | if gpt2_config_file == "": 36 | config = GPT2Config() 37 | else: 38 | config = GPT2Config.from_json_file(gpt2_config_file) 39 | model = GPT2Model(config) 40 | 41 | # Load weights from numpy 42 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 48 | torch.save(model.state_dict(), pytorch_weights_dump_path) 49 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 50 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 51 | f.write(config.to_json_string()) 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | ## Required parameters 57 | parser.add_argument("--gpt2_checkpoint_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the TensorFlow checkpoint path.") 62 | parser.add_argument("--pytorch_dump_folder_path", 63 | default = None, 64 | type = str, 65 | required = True, 66 | help = "Path to the output PyTorch model.") 67 | parser.add_argument("--gpt2_config_file", 68 | default = "", 69 | type = str, 70 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 71 | "This specifies the model architecture.") 72 | args = parser.parse_args() 73 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, 74 | args.gpt2_config_file, 75 | args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, 25 | OpenAIGPTConfig, 26 | OpenAIGPTModel, 27 | load_tf_weights_in_openai_gpt) 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | 33 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 34 | # Construct model 35 | if openai_config_file == "": 36 | config = OpenAIGPTConfig() 37 | else: 38 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 39 | model = OpenAIGPTModel(config) 40 | 41 | # Load weights from numpy 42 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 48 | torch.save(model.state_dict(), pytorch_weights_dump_path) 49 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 50 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 51 | f.write(config.to_json_string()) 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | ## Required parameters 57 | parser.add_argument("--openai_checkpoint_folder_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the TensorFlow checkpoint path.") 62 | parser.add_argument("--pytorch_dump_folder_path", 63 | default = None, 64 | type = str, 65 | required = True, 66 | help = "Path to the output PyTorch model.") 67 | parser.add_argument("--openai_config_file", 68 | default = "", 69 | type = str, 70 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 71 | "This specifies the model architecture.") 72 | args = parser.parse_args() 73 | convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, 74 | args.openai_config_file, 75 | args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tests/tokenization_xlm_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | import json 20 | 21 | from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES 22 | 23 | from .tokenization_tests_commons import CommonTestCases 24 | 25 | class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): 26 | 27 | tokenizer_class = XLMTokenizer 28 | 29 | def setUp(self): 30 | super(XLMTokenizationTest, self).setUp() 31 | 32 | # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt 33 | vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", 34 | "w</w>", "r</w>", "t</w>", 35 | "lo", "low", "er</w>", 36 | "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"] 37 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 38 | merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""] 39 | 40 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 41 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) 42 | with open(self.vocab_file, "w") as fp: 43 | fp.write(json.dumps(vocab_tokens)) 44 | with open(self.merges_file, "w") as fp: 45 | fp.write("\n".join(merges)) 46 | 47 | def get_tokenizer(self, **kwargs): 48 | return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs) 49 | 50 | def get_input_output_texts(self): 51 | input_text = u"lower newer" 52 | output_text = u"lower newer" 53 | return input_text, output_text 54 | 55 | def test_full_tokenizer(self): 56 | """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """ 57 | tokenizer = XLMTokenizer(self.vocab_file, self.merges_file) 58 | 59 | text = "lower" 60 | bpe_tokens = ["low", "er</w>"] 61 | tokens = tokenizer.tokenize(text) 62 | self.assertListEqual(tokens, bpe_tokens) 63 | 64 | input_tokens = tokens + ["<unk>"] 65 | input_bpe_tokens = [14, 15, 20] 66 | self.assertListEqual( 67 | tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 68 | 69 | def test_sequence_builders(self): 70 | tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048") 71 | 72 | text = tokenizer.encode("sequence builders") 73 | text_2 = tokenizer.encode("multi-sequence build") 74 | 75 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 76 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 77 | 78 | assert encoded_sentence == [1] + text + [1] 79 | assert encoded_pair == [1] + text + [1] + text_2 + [1] 80 | 81 | if __name__ == '__main__': 82 | unittest.main() 83 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/dependency_versions_table.py: -------------------------------------------------------------------------------- 1 | # THIS FILE HAS BEEN AUTOGENERATED. To update: 2 | # 1. modify the `_deps` dict in setup.py 3 | # 2.
run `make deps_table_update` 4 | deps = { 5 | "Pillow": "Pillow<10.0.0", 6 | "accelerate": "accelerate>=0.20.3", 7 | "av": "av==9.2.0", 8 | "beautifulsoup4": "beautifulsoup4", 9 | "black": "black~=23.1", 10 | "codecarbon": "codecarbon==1.2.0", 11 | "cookiecutter": "cookiecutter==1.7.3", 12 | "dataclasses": "dataclasses", 13 | "datasets": "datasets!=2.5.0", 14 | "decord": "decord==0.6.0", 15 | "deepspeed": "deepspeed>=0.9.3", 16 | "diffusers": "diffusers", 17 | "dill": "dill<0.3.5", 18 | "evaluate": "evaluate>=0.2.0", 19 | "fairscale": "fairscale>0.3", 20 | "faiss-cpu": "faiss-cpu", 21 | "fastapi": "fastapi", 22 | "filelock": "filelock", 23 | "flax": "flax>=0.4.1,<=0.7.0", 24 | "ftfy": "ftfy", 25 | "fugashi": "fugashi>=1.0", 26 | "GitPython": "GitPython<3.1.19", 27 | "hf-doc-builder": "hf-doc-builder>=0.3.0", 28 | "huggingface-hub": "huggingface-hub>=0.16.4,<1.0", 29 | "importlib_metadata": "importlib_metadata", 30 | "ipadic": "ipadic>=1.0.0,<2.0", 31 | "isort": "isort>=5.5.4", 32 | "jax": "jax>=0.4.1,<=0.4.13", 33 | "jaxlib": "jaxlib>=0.4.1,<=0.4.13", 34 | "jieba": "jieba", 35 | "kenlm": "kenlm", 36 | "keras-nlp": "keras-nlp>=0.3.1", 37 | "librosa": "librosa", 38 | "nltk": "nltk", 39 | "natten": "natten>=0.14.6", 40 | "numpy": "numpy>=1.17", 41 | "onnxconverter-common": "onnxconverter-common", 42 | "onnxruntime-tools": "onnxruntime-tools>=1.4.2", 43 | "onnxruntime": "onnxruntime>=1.4.0", 44 | "opencv-python": "opencv-python", 45 | "optuna": "optuna", 46 | "optax": "optax>=0.0.8,<=0.1.4", 47 | "packaging": "packaging>=20.0", 48 | "parameterized": "parameterized", 49 | "phonemizer": "phonemizer", 50 | "protobuf": "protobuf", 51 | "psutil": "psutil", 52 | "pyyaml": "pyyaml>=5.1", 53 | "pydantic": "pydantic<2", 54 | "pytest": "pytest>=7.2.0", 55 | "pytest-timeout": "pytest-timeout", 56 | "pytest-xdist": "pytest-xdist", 57 | "python": "python>=3.8.0", 58 | "ray[tune]": "ray[tune]", 59 | "regex": "regex!=2019.12.17", 60 | "requests": "requests", 61 | "rhoknp": "rhoknp>=1.1.0,<1.3.1", 62 | "rjieba": "rjieba", 63 | "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", 64 | "ruff": "ruff>=0.0.241,<=0.0.259", 65 | "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", 66 | "sacremoses": "sacremoses", 67 | "safetensors": "safetensors>=0.3.1", 68 | "sagemaker": "sagemaker>=2.31.0", 69 | "scikit-learn": "scikit-learn", 70 | "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92", 71 | "sigopt": "sigopt", 72 | "starlette": "starlette", 73 | "sudachipy": "sudachipy>=0.6.6", 74 | "sudachidict_core": "sudachidict_core>=20220729", 75 | "tensorflow-cpu": "tensorflow-cpu>=2.6,<2.15", 76 | "tensorflow": "tensorflow>=2.6,<2.15", 77 | "tensorflow-text": "tensorflow-text<2.15", 78 | "tf2onnx": "tf2onnx", 79 | "timeout-decorator": "timeout-decorator", 80 | "timm": "timm", 81 | "tokenizers": "tokenizers>=0.21,<0.22", 82 | # "tokenizers": "tokenizers>=0.13,<0.15", 83 | # "tokenizers": "tokenizers>=0.14,<0.15", 84 | "torch": "torch>=1.10,!=1.12.0", 85 | "torchaudio": "torchaudio", 86 | "torchvision": "torchvision", 87 | "pyctcdecode": "pyctcdecode>=0.4.0", 88 | "tqdm": "tqdm>=4.27", 89 | "unidic": "unidic>=1.0.2", 90 | "unidic_lite": "unidic_lite>=1.0.7", 91 | "urllib3": "urllib3<2.0.0", 92 | "uvicorn": "uvicorn", 93 | } 94 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/configuration_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc.
team, The Google AI Language Team and Facebook, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ DistilBERT model configuration """ 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 29 | 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", 30 | 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" 31 | } 32 | 33 | 34 | class DistilBertConfig(PretrainedConfig): 35 | pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | 37 | def __init__(self, 38 | vocab_size_or_config_json_file=30522, 39 | max_position_embeddings=512, 40 | sinusoidal_pos_embds=True, 41 | n_layers=6, 42 | n_heads=12, 43 | dim=768, 44 | hidden_dim=4*768, 45 | dropout=0.1, 46 | attention_dropout=0.1, 47 | activation='gelu', 48 | initializer_range=0.02, 49 | tie_weights_=True, 50 | qa_dropout=0.1, 51 | seq_classif_dropout=0.2, 52 | **kwargs): 53 | super(DistilBertConfig, self).__init__(**kwargs) 54 | 55 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 56 | and isinstance(vocab_size_or_config_json_file, unicode)): 57 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 58 | json_config = json.loads(reader.read()) 59 | for key, value in json_config.items(): 60 | self.__dict__[key] = value 61 | elif isinstance(vocab_size_or_config_json_file, int): 62 | self.vocab_size = vocab_size_or_config_json_file 63 | self.max_position_embeddings = max_position_embeddings 64 | self.sinusoidal_pos_embds = sinusoidal_pos_embds 65 | self.n_layers = n_layers 66 | self.n_heads = n_heads 67 | self.dim = dim 68 | self.hidden_dim = hidden_dim 69 | self.dropout = dropout 70 | self.attention_dropout = attention_dropout 71 | self.activation = activation 72 | self.initializer_range = initializer_range 73 | self.tie_weights_ = tie_weights_ 74 | self.qa_dropout = qa_dropout 75 | self.seq_classif_dropout = seq_classif_dropout 76 | else: 77 | raise ValueError("First argument must be either a vocabulary size (int)" 78 | " or the path to a pretrained model config file (str)") 79 | @property 80 | def hidden_size(self): 81 | return self.dim 82 | 83 | @property 84 | def num_attention_heads(self): 85 | return self.n_heads 86 | 87 | @property 88 | def num_hidden_layers(self): 89 | return self.n_layers 90 | -------------------------------------------------------------------------------- /larimar_base/train_larimar.sh: -------------------------------------------------------------------------------- 1 | 2 | 
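# Launches Larimar training through main_pl.py (Lightning CLI); every flag passed
# below overrides the corresponding entry in configs/config_train_larimar.yaml.
# Assumes the Wikipedia block data has been prepared under ../data/wikipedia/blocksize_64
# (see data/wikipedia_data_locations.json).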
#################################################################################################### 3 | # config yaml file 4 | #################################################################################################### 5 | config_file=configs/config_train_larimar.yaml 6 | 7 | 8 | #################################################################################################### 9 | # model_* 10 | #################################################################################################### 11 | model_encoder_model_name_or_path="bert-large-cased" 12 | model_decoder_model_name_or_path="gpt2-large" 13 | model_decode_rec_strength=1.0 14 | model_optimizer=adamw 15 | model_learning_rate=5e-5 16 | model_observation_noise_std=0.000001 17 | model_beta=0.5 18 | 19 | episode_length=16 20 | model_episode_sizes=[${episode_length}] 21 | 22 | 23 | 24 | #################################################################################################### 25 | # trainer_* 26 | #################################################################################################### 27 | trainer_devices=8 28 | trainer_max_epochs=5 29 | trainer_precision=32-true 30 | trainer_strategy=ddp 31 | trainer_callbacks_init_args_every_n_train_steps=20000 32 | trainer_callbacks_init_args_save_top_k=3 33 | 34 | 35 | 36 | #################################################################################################### 37 | # data_* 38 | #################################################################################################### 39 | data_train_batch_size=16 40 | data_num_chunks=false 41 | 42 | 43 | 44 | #################################################################################################### 45 | # directories and files 46 | #################################################################################################### 47 | 48 | # cache directory 49 | model_cache_dir=../cache 50 | 51 | # trained model directory 52 | decoder_name_save=$(echo "gpt2" | sed 's/\//-/g') 53 | loss_type=decoder_loss 54 | top_larimar_model_dir=../train/larimar/checkpoints 55 | larimar_model_description=${model_encoder_model_name_or_path}-${decoder_name_save}-large-wiki-ep-${episode_length}_${loss_type}_${model_observation_noise_std} 56 | trainer_default_root_dir=${top_larimar_model_dir}/${larimar_model_description} 57 | trainer_logger_init_args_save_dir=${trainer_default_root_dir} 58 | 59 | 60 | # training data directory and files 61 | block_size=64 62 | top_training_data_dir=../data 63 | training_data_dir=${top_training_data_dir}/wikipedia/blocksize_${block_size} 64 | data_train_data_file=${training_data_dir}/train.txt 65 | data_eval_data_file=${training_data_dir}/test.txt 66 | 67 | 68 | 69 | 70 | 71 | #################################################################################################### 72 | # train 73 | #################################################################################################### 74 | python main_pl.py fit \ 75 | --config ${config_file} \ 76 | --model.cache_dir=${model_cache_dir} \ 77 | --model.encoder_model_name_or_path ${model_encoder_model_name_or_path} \ 78 | --model.decoder_model_name_or_path ${model_decoder_model_name_or_path} \ 79 | --model.optimizer ${model_optimizer} \ 80 | --model.learning_rate ${model_learning_rate} \ 81 | --model.episode_sizes ${model_episode_sizes} \ 82 | --model.decode_rec_strength ${model_decode_rec_strength} \ 83 | --model.observation_noise_std ${model_observation_noise_std} \ 84 | --model.beta ${model_beta} \ 85 | --trainer.devices 
${trainer_devices} \ 86 | --trainer.max_epochs ${trainer_max_epochs} \ 87 | --trainer.precision ${trainer_precision} \ 88 | --trainer.strategy ${trainer_strategy} \ 89 | --trainer.default_root_dir ${trainer_default_root_dir} \ 90 | --trainer.logger.init_args.save_dir ${trainer_logger_init_args_save_dir} \ 91 | --trainer.callbacks.init_args.every_n_train_steps ${trainer_callbacks_init_args_every_n_train_steps} \ 92 | --trainer.callbacks.init_args.save_top_k ${trainer_callbacks_init_args_save_top_k} \ 93 | --data.train_batch_size ${data_train_batch_size} \ 94 | --data.train_data_file ${data_train_data_file} \ 95 | --data.eval_data_file ${data_eval_data_file} \ 96 | --data.num_chunks ${data_num_chunks} 97 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tests/modeling_auto_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import shutil 21 | import pytest 22 | import logging 23 | 24 | from pytorch_transformers import (AutoConfig, BertConfig, 25 | AutoModel, BertModel, 26 | AutoModelWithLMHead, BertForMaskedLM, 27 | AutoModelForSequenceClassification, BertForSequenceClassification, 28 | AutoModelForQuestionAnswering, BertForQuestionAnswering) 29 | from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP 30 | 31 | from .modeling_common_test import (CommonTestCases, ids_tensor) 32 | from .configuration_common_test import ConfigTester 33 | 34 | 35 | class AutoModelTest(unittest.TestCase): 36 | def test_model_from_pretrained(self): 37 | logging.basicConfig(level=logging.INFO) 38 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 39 | config = AutoConfig.from_pretrained(model_name) 40 | self.assertIsNotNone(config) 41 | self.assertIsInstance(config, BertConfig) 42 | 43 | model = AutoModel.from_pretrained(model_name) 44 | model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True) 45 | self.assertIsNotNone(model) 46 | self.assertIsInstance(model, BertModel) 47 | for value in loading_info.values(): 48 | self.assertEqual(len(value), 0) 49 | 50 | def test_lmhead_model_from_pretrained(self): 51 | logging.basicConfig(level=logging.INFO) 52 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 53 | config = AutoConfig.from_pretrained(model_name) 54 | self.assertIsNotNone(config) 55 | self.assertIsInstance(config, BertConfig) 56 | 57 | model = AutoModelWithLMHead.from_pretrained(model_name) 58 | model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True) 59 | self.assertIsNotNone(model) 60 | self.assertIsInstance(model, BertForMaskedLM) 61 | 62 | def 
test_sequence_classification_model_from_pretrained(self): 63 | logging.basicConfig(level=logging.INFO) 64 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 65 | config = AutoConfig.from_pretrained(model_name) 66 | self.assertIsNotNone(config) 67 | self.assertIsInstance(config, BertConfig) 68 | 69 | model = AutoModelForSequenceClassification.from_pretrained(model_name) 70 | model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True) 71 | self.assertIsNotNone(model) 72 | self.assertIsInstance(model, BertForSequenceClassification) 73 | 74 | def test_question_answering_model_from_pretrained(self): 75 | logging.basicConfig(level=logging.INFO) 76 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 77 | config = AutoConfig.from_pretrained(model_name) 78 | self.assertIsNotNone(config) 79 | self.assertIsInstance(config, BertConfig) 80 | 81 | model = AutoModelForQuestionAnswering.from_pretrained(model_name) 82 | model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True) 83 | self.assertIsNotNone(model) 84 | self.assertIsInstance(model, BertForQuestionAnswering) 85 | 86 | 87 | if __name__ == "__main__": 88 | unittest.main() 89 | -------------------------------------------------------------------------------- /larimar_base/lightning_data.py: -------------------------------------------------------------------------------- 1 | import lightning as pl 2 | from utils import BucketingDataLoaderPL 3 | from lightning_model import prepare_enc_dec_tokenizer 4 | 5 | 6 | class DataModule(pl.LightningDataModule): 7 | 8 | def __init__(self, 9 | train_data_file, 10 | train_batch_size, 11 | eval_data_file, 12 | eval_batch_size, 13 | max_seq_length, 14 | perturb, 15 | use_labels, 16 | dataset, 17 | use_philly, 18 | num_data_workers, 19 | batches_per_bucket, 20 | block_size, 21 | encoder_model_type, 22 | encoder_model_name_or_path, 23 | decoder_model_type, 24 | decoder_model_name_or_path, 25 | cache_dir, 26 | do_lower_case, 27 | num_chunks): 28 | 29 | super().__init__() 30 | 31 | self.train_data_file = train_data_file 32 | self.train_batch_size = train_batch_size 33 | self.max_seq_length = max_seq_length 34 | self.eval_data_file = eval_data_file 35 | self.eval_batch_size = eval_batch_size 36 | self.perturb = perturb 37 | self.use_labels = use_labels 38 | self.dataset = dataset 39 | self.use_philly = use_philly 40 | self.block_size = block_size 41 | self.num_data_workers = num_data_workers 42 | self.batches_per_bucket = batches_per_bucket 43 | self.num_chunks = num_chunks 44 | 45 | tokenizer_encoder, tokenizer_decoder = prepare_enc_dec_tokenizer(encoder_model_type, 46 | encoder_model_name_or_path, 47 | decoder_model_type, 48 | decoder_model_name_or_path, 49 | cache_dir, 50 | do_lower_case, 51 | block_size) 52 | 53 | self.tokenizer = [tokenizer_encoder, tokenizer_decoder] 54 | 55 | def setup(self, stage=None): 56 | 57 | if stage == 'fit': 58 | self.traindl = BucketingDataLoaderPL(self.train_data_file, 59 | self.train_batch_size, 60 | self.max_seq_length, 61 | self.tokenizer, 62 | self.block_size, 63 | self.use_labels, 64 | self.dataset, 65 | self.use_philly, 66 | self.num_chunks, 67 | self.num_data_workers, 68 | batches_per_bucket=self.batches_per_bucket, 69 | perturb=self.perturb, 70 | shuffle=True) 71 | if stage in ("fit", "validate"): 72 | self.valdl = BucketingDataLoaderPL(self.eval_data_file, 73 | self.eval_batch_size, 74 | self.max_seq_length, 75 | self.tokenizer, 76 | self.block_size, 77 |
self.use_labels, 78 | self.dataset, 79 | self.use_philly, 80 | self.num_chunks, 81 | self.num_data_workers, 82 | batches_per_bucket=self.batches_per_bucket, 83 | perturb=self.perturb, 84 | shuffle=False) 85 | else: 86 | return 87 | 88 | def train_dataloader(self): 89 | return self.traindl.get() 90 | 91 | def val_dataloader(self): 92 | return self.valdl.get() 93 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tests/fixtures/sample_text.txt: -------------------------------------------------------------------------------- 1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত 2 | Text should be one-sentence-per-line, with empty lines between documents. 3 | This sample text is public domain and was randomly selected from Project Gutenberg. 4 | 5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. 6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. 7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. 8 | "Cass" Beard had risen early that morning, but not with a view to discovery. 9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets. 10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency. 11 | This was nearly opposite. 12 | Mr. Cassius crossed the highway, and stopped suddenly. 13 | Something glittered in the nearest red pool before him. 14 | Gold, surely! 15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring. 16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass." 17 | Like most of his fellow gold-seekers, Cass was superstitious. 18 | 19 | The fountain of classic wisdom, Hypatia herself. 20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge. 21 | From my youth I felt in me a soul above the matter-entangled herd. 22 | She revealed to me the glorious fact, that I am a spark of Divinity itself. 23 | A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's. 24 | There is a philosophic pleasure in opening one's treasures to the modest young. 25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide; 27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind. 28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now. 29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert; 30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts. 31 | At last they reached the quay at the opposite end of the street; 32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers. 33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him. 34 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tests/tokenization_roberta_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import json 19 | import unittest 20 | from io import open 21 | 22 | from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES 23 | from .tokenization_tests_commons import CommonTestCases 24 | 25 | 26 | class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): 27 | tokenizer_class = RobertaTokenizer 28 | 29 | def setUp(self): 30 | super(RobertaTokenizationTest, self).setUp() 31 | 32 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt 33 | vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", 34 | "\u0120", "\u0120l", "\u0120n", 35 | "\u0120lo", "\u0120low", "er", 36 | "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"] 37 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 38 | merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] 39 | self.special_tokens_map = {"unk_token": "<unk>"} 40 | 41 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 42 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) 43 | with open(self.vocab_file, "w", encoding="utf-8") as fp: 44 | fp.write(json.dumps(vocab_tokens) + "\n") 45 | with open(self.merges_file, "w", encoding="utf-8") as fp: 46 | fp.write("\n".join(merges)) 47 | 48 | def get_tokenizer(self, **kwargs): 49 | kwargs.update(self.special_tokens_map) 50 | return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs) 51 | 52 | def get_input_output_texts(self): 53 | input_text = u"lower newer" 54 | output_text = u" lower newer" 55 | return input_text, output_text 56 | 57 | def test_full_tokenizer(self): 58 | tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) 59 | text = "lower newer" 60 | bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] 61 | tokens = tokenizer.tokenize(text) 62 | self.assertListEqual(tokens, bpe_tokens) 63 | 64 | input_tokens = tokens + [tokenizer.unk_token] 65 | input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] 66 | self.assertListEqual( 67 | tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 68 | 69 | def roberta_dict_integration_testing(self): 70 | tokenizer = self.get_tokenizer() 71 | 72 | self.assertListEqual( 73 | tokenizer.encode('Hello world!'), 74 | [0, 31414, 232, 328, 2] 75 | ) 76 | self.assertListEqual( 77 | tokenizer.encode('Hello world! cécé herlolip 418'), 78 | [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2] 79 | ) 80 | 81 | def test_sequence_builders(self): 82 | tokenizer = RobertaTokenizer.from_pretrained("roberta-base") 83 | 84 | text = tokenizer.encode("sequence builders") 85 | text_2 = tokenizer.encode("multi-sequence build") 86 | 87 | encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True) 88 | encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True) 89 | 90 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 91 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 92 | 93 | assert encoded_sentence == encoded_text_from_decode 94 | assert encoded_pair == encoded_pair_from_decode 95 | 96 | 97 | if __name__ == '__main__': 98 | unittest.main() 99 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tokenization_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for RoBERTa.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | import os 23 | import regex as re 24 | from io import open 25 | 26 | from .tokenization_gpt2 import GPT2Tokenizer 27 | 28 | try: 29 | from functools import lru_cache 30 | except ImportError: 31 | # Just a dummy decorator to get the checks to run on python2 32 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 33 | def lru_cache(): 34 | return lambda func: func 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | VOCAB_FILES_NAMES = { 39 | 'vocab_file': 'vocab.json', 40 | 'merges_file': 'merges.txt', 41 | } 42 | 43 | PRETRAINED_VOCAB_FILES_MAP = { 44 | 'vocab_file': 45 | { 46 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", 47 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", 48 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", 49 | }, 50 | 'merges_file': 51 | { 52 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", 53 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", 54 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt", 55 | }, 56 | } 57 | 58 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 59 | 'roberta-base': 512, 60 | 'roberta-large': 512, 61 | 'roberta-large-mnli': 512, 62 | } 63 | 64 | 65 | class RobertaTokenizer(GPT2Tokenizer): 66 | """ 67 | RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: 68 | - Byte-level Byte-Pair-Encoding 69 | - Requires a space to start the input string => will add a space if there isn't one. 70 | As a consequence, this tokenizer's `encode` and `decode` methods will not preserve 71 | the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) == " Hello"` 72 | """ 73 | vocab_files_names = VOCAB_FILES_NAMES 74 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 75 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 76 | 77 | def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>", 78 | cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs): 79 | super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors, 80 | bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, 81 | sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, 82 | mask_token=mask_token, **kwargs) 83 | 84 | def add_special_tokens_single_sentence(self, token_ids): 85 | """ 86 | Adds special tokens to a sequence for sequence classification tasks.
87 | A RoBERTa sequence has the following format: <s> X </s> 88 | """ 89 | return [self.cls_token_id] + token_ids + [self.sep_token_id] 90 | 91 | def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): 92 | """ 93 | Adds special tokens to a sequence pair for sequence classification tasks. 94 | A RoBERTa sequence pair has the following format: <s> A </s></s> B </s> 95 | """ 96 | sep = [self.sep_token_id] 97 | cls = [self.cls_token_id] 98 | return cls + token_ids_0 + sep + sep + token_ids_1 + sep 99 | -------------------------------------------------------------------------------- /larimar_base/modules/encoders/enc_lstm.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | import math 3 | import torch 4 | import torch.nn as nn 5 | 6 | from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence 7 | from .gaussian_encoder import GaussianEncoderBase 8 | from ..utils import log_sum_exp 9 | 10 | class GaussianLSTMEncoder(GaussianEncoderBase): 11 | """Gaussian LSTM Encoder with constant-length input""" 12 | def __init__(self, args, vocab_size, model_init, emb_init): 13 | super(GaussianLSTMEncoder, self).__init__() 14 | self.ni = args.ni 15 | self.nh = args.enc_nh 16 | self.nz = args.nz 17 | self.args = args 18 | 19 | self.embed = nn.Embedding(vocab_size, args.ni) 20 | 21 | self.lstm = nn.LSTM(input_size=args.ni, 22 | hidden_size=args.enc_nh, 23 | num_layers=1, 24 | batch_first=True, 25 | dropout=0) 26 | 27 | self.linear = nn.Linear(args.enc_nh, 2 * args.nz, bias=False) 28 | 29 | self.reset_parameters(model_init, emb_init) 30 | 31 | def reset_parameters(self, model_init, emb_init): 32 | # for name, param in self.lstm.named_parameters(): 33 | # # self.initializer(param) 34 | # if 'bias' in name: 35 | # nn.init.constant_(param, 0.0) 36 | # # model_init(param) 37 | # elif 'weight' in name: 38 | # model_init(param) 39 | 40 | # model_init(self.linear.weight) 41 | # emb_init(self.embed.weight) 42 | for param in self.parameters(): 43 | model_init(param) 44 | emb_init(self.embed.weight) 45 | 46 | 47 | def forward(self, input): 48 | """ 49 | Args: 50 | x: (batch_size, seq_len) 51 | Returns: Tensor1, Tensor2 52 | Tensor1: the mean tensor, shape (batch, nz) 53 | Tensor2: the logvar tensor, shape (batch, nz) 54 | """ 55 | 56 | # (batch_size, seq_len-1, args.ni) 57 | word_embed = self.embed(input) 58 | 59 | _, (last_state, last_cell) = self.lstm(word_embed) 60 | 61 | mean, logvar = self.linear(last_state).chunk(2, -1) 62 | 63 | # fix variance as a pre-defined value 64 | if self.args.fix_var > 0: 65 | logvar = mean.new_tensor([[[math.log(self.args.fix_var)]]]).expand_as(mean) 66 | 67 | return mean.squeeze(0), logvar.squeeze(0) 68 | 69 | # def eval_inference_mode(self, x): 70 | # """compute the mode points in the inference distribution 71 | # (in Gaussian case) 72 | # Returns: Tensor 73 | # Tensor: the posterior mode points with shape (*, nz) 74 | # """ 75 | 76 | # # (batch_size, nz) 77 | # mu, logvar = self.forward(x) 78 | 79 | 80 | class VarLSTMEncoder(GaussianLSTMEncoder): 81 | """Gaussian LSTM Encoder with variable-length input""" 82 | def __init__(self, args, vocab_size, model_init, emb_init): 83 | super(VarLSTMEncoder, self).__init__(args, vocab_size, model_init, emb_init) 84 | 85 | 86 | def forward(self, input): 87 | """ 88 | Args: 89 | input: tuple which contains x and sents_len 90 | x: (batch_size, seq_len) 91 | sents_len: long tensor of sentence lengths 92 | Returns: Tensor1, Tensor2 93 | Tensor1: the mean tensor, shape (batch, nz) 94 |
Tensor2: the logvar tensor, shape (batch, nz) 95 | """ 96 | 97 | input, sents_len = input 98 | # (batch_size, seq_len, args.ni) 99 | word_embed = self.embed(input) 100 | 101 | packed_embed = pack_padded_sequence(word_embed, sents_len.tolist(), batch_first=True) 102 | 103 | _, (last_state, last_cell) = self.lstm(packed_embed) 104 | 105 | mean, logvar = self.linear(last_state).chunk(2, -1) 106 | 107 | return mean.squeeze(0), logvar.squeeze(0) 108 | 109 | def encode(self, input, nsamples): 110 | """perform the encoding and compute the KL term 111 | Args: 112 | input: tuple which contains x and sents_len 113 | Returns: Tensor1, Tensor2 114 | Tensor1: the tensor latent z with shape [batch, nsamples, nz] 115 | Tensor2: the tensor of KL for each x with shape [batch] 116 | """ 117 | 118 | # (batch_size, nz) 119 | mu, logvar = self.forward(input) 120 | 121 | # (batch, nsamples, nz) 122 | z = self.reparameterize(mu, logvar, nsamples) 123 | 124 | KL = 0.5 * (mu.pow(2) + logvar.exp() - logvar - 1).sum(dim=1) 125 | 126 | return z, KL 127 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
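# Example invocation (illustrative paths; the inputs are the files from the released XLNet TF checkpoint):
#   python convert_xlnet_checkpoint_to_pytorch.py \
#       --tf_checkpoint_path ./xlnet_cased_L-24_H-1024_A-16/xlnet_model.ckpt \
#       --xlnet_config_file ./xlnet_cased_L-24_H-1024_A-16/xlnet_config.json \
#       --pytorch_dump_folder_path ./xlnet-pytorch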
15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import argparse 23 | import torch 24 | 25 | from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, 26 | XLNetConfig, 27 | XLNetLMHeadModel, XLNetForQuestionAnswering, 28 | XLNetForSequenceClassification, 29 | load_tf_weights_in_xlnet) 30 | 31 | GLUE_TASKS_NUM_LABELS = { 32 | "cola": 2, 33 | "mnli": 3, 34 | "mrpc": 2, 35 | "sst-2": 2, 36 | "sts-b": 1, 37 | "qqp": 2, 38 | "qnli": 2, 39 | "rte": 2, 40 | "wnli": 2, 41 | } 42 | 43 | import logging 44 | logging.basicConfig(level=logging.INFO) 45 | 46 | def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None): 47 | # Initialise PyTorch model 48 | config = XLNetConfig.from_json_file(bert_config_file) 49 | 50 | finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" 51 | if finetuning_task in GLUE_TASKS_NUM_LABELS: 52 | print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) 53 | config.finetuning_task = finetuning_task 54 | config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] 55 | model = XLNetForSequenceClassification(config) 56 | elif 'squad' in finetuning_task: 57 | config.finetuning_task = finetuning_task 58 | model = XLNetForQuestionAnswering(config) 59 | else: 60 | model = XLNetLMHeadModel(config) 61 | 62 | # Load weights from tf checkpoint 63 | load_tf_weights_in_xlnet(model, config, tf_checkpoint_path) 64 | 65 | # Save pytorch-model 66 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 67 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 68 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 69 | torch.save(model.state_dict(), pytorch_weights_dump_path) 70 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 71 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 72 | f.write(config.to_json_string()) 73 | 74 | 75 | if __name__ == "__main__": 76 | parser = argparse.ArgumentParser() 77 | ## Required parameters 78 | parser.add_argument("--tf_checkpoint_path", 79 | default = None, 80 | type = str, 81 | required = True, 82 | help = "Path to the TensorFlow checkpoint path.") 83 | parser.add_argument("--xlnet_config_file", 84 | default = None, 85 | type = str, 86 | required = True, 87 | help = "The config json file corresponding to the pre-trained XLNet model. \n" 88 | "This specifies the model architecture.") 89 | parser.add_argument("--pytorch_dump_folder_path", 90 | default = None, 91 | type = str, 92 | required = True, 93 | help = "Path to the folder to store the PyTorch model or dataset/vocab.") 94 | parser.add_argument("--finetuning_task", 95 | default = None, 96 | type = str, 97 | help = "Name of a task on which the XLNet TensorFloaw model was fine-tuned") 98 | args = parser.parse_args() 99 | print(args) 100 | 101 | convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path, 102 | args.xlnet_config_file, 103 | args.pytorch_dump_folder_path, 104 | args.finetuning_task) 105 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/utils/versions.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Utilities for working with package versions 16 | """ 17 | 18 | import importlib.metadata 19 | import operator 20 | import re 21 | import sys 22 | from typing import Optional 23 | 24 | from packaging import version 25 | 26 | 27 | ops = { 28 | "<": operator.lt, 29 | "<=": operator.le, 30 | "==": operator.eq, 31 | "!=": operator.ne, 32 | ">=": operator.ge, 33 | ">": operator.gt, 34 | } 35 | 36 | 37 | def _compare_versions(op, got_ver, want_ver, requirement, pkg, hint): 38 | if got_ver is None or want_ver is None: 39 | raise ValueError( 40 | f"Unable to compare versions for {requirement}: need={want_ver} found={got_ver}. This is unusual. Consider" 41 | f" reinstalling {pkg}." 42 | ) 43 | if not ops[op](version.parse(got_ver), version.parse(want_ver)): 44 | raise ImportError( 45 | f"{requirement} is required for a normal functioning of this module, but found {pkg}=={got_ver}.{hint}" 46 | ) 47 | 48 | 49 | def require_version(requirement: str, hint: Optional[str] = None) -> None: 50 | """ 51 | Perform a runtime check of the dependency versions, using the exact same syntax used by pip. 52 | 53 | The installed module version comes from the *site-packages* dir via *importlib.metadata*. 
54 | 55 | Args: 56 | requirement (`str`): pip style definition, e.g., "tokenizers==0.9.4", "tqdm>=4.27", "numpy" 57 | hint (`str`, *optional*): what suggestion to print in case of requirements not being met 58 | 59 | Example: 60 | 61 | ```python 62 | require_version("pandas>1.1.2") 63 | require_version("numpy>1.18.5", "this is important to have for whatever reason") 64 | ```""" 65 | 66 | hint = f"\n{hint}" if hint is not None else "" 67 | 68 | # non-versioned check 69 | if re.match(r"^[\w_\-\d]+$", requirement): 70 | pkg, op, want_ver = requirement, None, None 71 | else: 72 | match = re.findall(r"^([^!=<>\s]+)([\s!=<>]{1,2}.+)", requirement) 73 | if not match: 74 | raise ValueError( 75 | "requirement needs to be in the pip package format, e.g., package_a==1.23, or package_b>=1.23, but" 76 | f" got {requirement}" 77 | ) 78 | pkg, want_full = match[0] 79 | want_range = want_full.split(",") # there could be multiple requirements 80 | wanted = {} 81 | for w in want_range: 82 | match = re.findall(r"^([\s!=<>]{1,2})(.+)", w) 83 | if not match: 84 | raise ValueError( 85 | "requirement needs to be in the pip package format, e.g., package_a==1.23, or package_b>=1.23," 86 | f" but got {requirement}" 87 | ) 88 | op, want_ver = match[0] 89 | wanted[op] = want_ver 90 | if op not in ops: 91 | raise ValueError(f"{requirement}: need one of {list(ops.keys())}, but got {op}") 92 | 93 | # special case 94 | if pkg == "python": 95 | got_ver = ".".join([str(x) for x in sys.version_info[:3]]) 96 | for op, want_ver in wanted.items(): 97 | _compare_versions(op, got_ver, want_ver, requirement, pkg, hint) 98 | return 99 | 100 | # check if any version is installed 101 | try: 102 | got_ver = importlib.metadata.version(pkg) 103 | except importlib.metadata.PackageNotFoundError: 104 | raise importlib.metadata.PackageNotFoundError( 105 | f"The '{requirement}' distribution was not found and is required by this application.
{hint}" 106 | ) 107 | 108 | # check that the right version is installed if version number or a range was provided 109 | if want_ver is not None: 110 | for op, want_ver in wanted.items(): 111 | _compare_versions(op, got_ver, want_ver, requirement, pkg, hint) 112 | 113 | 114 | def require_version_core(requirement): 115 | """require_version wrapper which emits a core-specific hint on failure""" 116 | hint = "Try: pip install transformers -U or pip install -e '.[dev]' if you're working with git main" 117 | return require_version(requirement, hint) 118 | -------------------------------------------------------------------------------- /larimar_base/modules/encoders/gaussian_encoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | 5 | from .encoder import EncoderBase 6 | from ..utils import log_sum_exp 7 | 8 | class GaussianEncoderBase(EncoderBase): 9 | """Base class for encoders that parameterize a diagonal Gaussian posterior.""" 10 | def __init__(self): 11 | super(GaussianEncoderBase, self).__init__() 12 | 13 | def freeze(self): 14 | for param in self.parameters(): 15 | param.requires_grad = False 16 | 17 | def forward(self, x): 18 | """ 19 | Args: 20 | x: (batch_size, *) 21 | Returns: Tensor1, Tensor2 22 | Tensor1: the mean tensor, shape (batch, nz) 23 | Tensor2: the logvar tensor, shape (batch, nz) 24 | """ 25 | 26 | raise NotImplementedError 27 | 28 | def encode_stats(self, x): 29 | 30 | return self.forward(x) 31 | 32 | def sample(self, input, nsamples): 33 | """sampling from the encoder 34 | Returns: Tensor1 35 | Tensor1: the tensor latent z with shape [batch, nsamples, nz] 36 | """ 37 | 38 | # (batch_size, nz) 39 | mu, logvar = self.forward(input) 40 | 41 | # (batch, nsamples, nz) 42 | z = self.reparameterize(mu, logvar, nsamples) 43 | 44 | return z, (mu, logvar) 45 | 46 | def encode(self, input, nsamples): 47 | """perform the encoding and compute the KL term 48 | Returns: Tensor1, Tensor2 49 | Tensor1: the tensor latent z with shape [batch, nsamples, nz] 50 | Tensor2: the tensor of KL for each x with shape [batch] 51 | """ 52 | 53 | # (batch_size, nz) 54 | mu, logvar = self.forward(input) 55 | 56 | # (batch, nsamples, nz) 57 | z = self.reparameterize(mu, logvar, nsamples) 58 | 59 | KL = 0.5 * (mu.pow(2) + logvar.exp() - logvar - 1).sum(dim=1) 60 | 61 | return z, KL 62 | 63 | def reparameterize(self, mu, logvar, nsamples=1): 64 | """sample from posterior Gaussian family 65 | Args: 66 | mu: Tensor 67 | Mean of Gaussian distribution with shape (batch, nz) 68 | logvar: Tensor 69 | logvar of Gaussian distribution with shape (batch, nz) 70 | Returns: Tensor 71 | Sampled z with shape (batch, nsamples, nz) 72 | """ 73 | batch_size, nz = mu.size() 74 | std = logvar.mul(0.5).exp() 75 | 76 | mu_expd = mu.unsqueeze(1).expand(batch_size, nsamples, nz) 77 | std_expd = std.unsqueeze(1).expand(batch_size, nsamples, nz) 78 | 79 | eps = torch.zeros_like(std_expd).normal_() 80 | 81 | return mu_expd + torch.mul(eps, std_expd) 82 | 83 | def eval_inference_dist(self, x, z, param=None): 84 | """this function computes log q(z | x) 85 | Args: 86 | z: tensor 87 | different z points that will be evaluated, with 88 | shape [batch, nsamples, nz] 89 | Returns: Tensor1 90 | Tensor1: log q(z|x) with shape [batch, nsamples] 91 | """ 92 | 93 | nz = z.size(2) 94 | 95 | if not param: 96 | mu, logvar = self.forward(x) 97 | else: 98 | mu, logvar = param 99 | 100 | # (batch_size, 1, nz) 101 | mu, logvar = mu.unsqueeze(1), logvar.unsqueeze(1) 102 | var = logvar.exp() 103 | 104 | 
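# The computation below evaluates the diagonal-Gaussian density in closed form,
#   log q(z|x) = -0.5 * sum_d (z_d - mu_d)^2 / var_d - 0.5 * (nz * log(2*pi) + sum_d logvar_d),
# independently for each of the nsamples z points per batch element.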
# (batch_size, nsamples, nz) 105 | dev = z - mu 106 | 107 | # (batch_size, nsamples) 108 | log_density = -0.5 * ((dev ** 2) / var).sum(dim=-1) - \ 109 | 0.5 * (nz * math.log(2 * math.pi) + logvar.sum(-1)) 110 | 111 | return log_density 112 | 113 | 114 | 115 | def calc_mi(self, x): 116 | """Approximate the mutual information between x and z 117 | I(x, z) = E_xE_{q(z|x)}log(q(z|x)) - E_xE_{q(z|x)}log(q(z)) 118 | Returns: Float 119 | """ 120 | 121 | # [x_batch, nz] 122 | mu, logvar = self.forward(x) 123 | 124 | x_batch, nz = mu.size() 125 | 126 | # E_{q(z|x)}log(q(z|x)) = -0.5*nz*log(2*\pi) - 0.5*(1+logvar).sum(-1) 127 | neg_entropy = (-0.5 * nz * math.log(2 * math.pi)- 0.5 * (1 + logvar).sum(-1)).mean() 128 | 129 | # [z_batch, 1, nz] 130 | z_samples = self.reparameterize(mu, logvar, 1) 131 | 132 | # [1, x_batch, nz] 133 | mu, logvar = mu.unsqueeze(0), logvar.unsqueeze(0) 134 | var = logvar.exp() 135 | 136 | # (z_batch, x_batch, nz) 137 | dev = z_samples - mu 138 | 139 | # (z_batch, x_batch) 140 | log_density = -0.5 * ((dev ** 2) / var).sum(dim=-1) - \ 141 | 0.5 * (nz * math.log(2 * math.pi) + logvar.sum(-1)) 142 | 143 | # log q(z): aggregate posterior 144 | # [z_batch] 145 | log_qz = log_sum_exp(log_density, dim=1) - math.log(x_batch) 146 | 147 | return (neg_entropy - log_qz.mean(-1)).item() -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.2.0" 2 | # Work around to update TensorFlow's absl.logging threshold which alters the 3 | # default Python logging output behavior when present. 4 | # see: https://github.com/abseil/abseil-py/issues/99 5 | # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 6 | try: 7 | import absl.logging 8 | absl.logging.set_verbosity('info') 9 | absl.logging.set_stderrthreshold('info') 10 | absl.logging._warn_preinit_stderr = False 11 | except: 12 | pass 13 | 14 | # Tokenizer 15 | from .tokenization_utils import (PreTrainedTokenizer) 16 | from .tokenization_auto import AutoTokenizer 17 | from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer 18 | from .tokenization_openai import OpenAIGPTTokenizer 19 | from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) 20 | from .tokenization_gpt2 import GPT2Tokenizer 21 | # from .tokenization_gptj import GPTJTokenizer 22 | from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE 23 | from .tokenization_xlm import XLMTokenizer 24 | from .tokenization_roberta import RobertaTokenizer 25 | from .tokenization_distilbert import DistilBertTokenizer 26 | 27 | # Configurations 28 | from .configuration_utils import PretrainedConfig 29 | from .configuration_auto import AutoConfig 30 | from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP 31 | from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP 32 | from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP 33 | from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP 34 | from .configuration_gptj import GPTJConfig, GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP 35 | from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP 37 | from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 38 
| from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 39 | 40 | # Modeling 41 | from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) 42 | from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, 43 | AutoModelWithLMHead) 44 | 45 | from .modeling_bert import (BertPreTrainedModel, BertModel, BertForLatentConnector, BertForPreTraining,BertForSequenceClassificationLatentConnector, 46 | BertForMaskedLM, BertForNextSentencePrediction, 47 | BertForSequenceClassification, BertForMultipleChoice, 48 | BertForTokenClassification, BertForQuestionAnswering, 49 | load_tf_weights_in_bert)#, BERT_PRETRAINED_MODEL_ARCHIVE_MAP) 50 | from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel, 51 | OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, 52 | load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) 53 | from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, 54 | load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) 55 | from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model, GPT2ForLatentConnector, 56 | GPT2LMHeadModel, GPT2DoubleHeadsModel, 57 | load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) 58 | from .modeling_gptj import (GPTJPreTrainedModel, GPTJModel, GPTJForLatentConnector) 59 | from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, 60 | XLNetForSequenceClassification, XLNetForQuestionAnswering, XLNetForMultipleChoice, 61 | load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) 62 | from .modeling_xlm import (XLMPreTrainedModel , XLMModel, 63 | XLMWithLMHeadModel, XLMForSequenceClassification, 64 | XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP) 65 | from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, 66 | RobertaForMultipleChoice, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) 67 | from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel, 68 | DistilBertForSequenceClassification, DistilBertForQuestionAnswering, 69 | DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) 70 | 71 | # Optimization 72 | from .optimization import (AdamW, WarmupLinearSchedule, ) 73 | 74 | # Files and general utilities 75 | from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, 76 | cached_path, add_start_docstrings, add_end_docstrings, 77 | WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME) 78 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/integrations/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
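The `_import_structure` table below drives a lazy-module pattern: importing `pytorch_transformers.integrations` is cheap, and each heavy backend (wandb, deepspeed, bitsandbytes, ...) is only imported when one of its names is first accessed. A minimal sketch of the idea (simplified; the real `_LazyModule` referenced from `utils` also handles pickling and module specs):

import importlib
import types

class LazyModule(types.ModuleType):
    """Sketch: resolve exported names to their defining submodules on first access."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # invert {submodule: [names]} into {name: submodule}
        self._name_to_module = {
            attr: mod for mod, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, attr):
        # only called for attributes not found the normal way
        if attr not in self._name_to_module:
            raise AttributeError(f"module {self.__name__!r} has no attribute {attr!r}")
        submodule = importlib.import_module("." + self._name_to_module[attr], self.__name__)
        value = getattr(submodule, attr)
        setattr(self, attr, value)  # cache so __getattr__ fires only once per name
        return value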
14 | from typing import TYPE_CHECKING 15 | 16 | from ..utils import _LazyModule 17 | 18 | 19 | _import_structure = { 20 | "bitsandbytes": [ 21 | "get_keys_to_not_convert", 22 | "replace_8bit_linear", 23 | "replace_with_bnb_linear", 24 | "set_module_8bit_tensor_to_device", 25 | "set_module_quantized_tensor_to_device", 26 | ], 27 | "deepspeed": [ 28 | "HfDeepSpeedConfig", 29 | "HfTrainerDeepSpeedConfig", 30 | "deepspeed_config", 31 | "deepspeed_init", 32 | "deepspeed_load_checkpoint", 33 | "deepspeed_optim_sched", 34 | "is_deepspeed_available", 35 | "is_deepspeed_zero3_enabled", 36 | "set_hf_deepspeed_config", 37 | "unset_hf_deepspeed_config", 38 | ], 39 | "integration_utils": [ 40 | "INTEGRATION_TO_CALLBACK", 41 | "AzureMLCallback", 42 | "ClearMLCallback", 43 | "CodeCarbonCallback", 44 | "CometCallback", 45 | "DagsHubCallback", 46 | "FlyteCallback", 47 | "MLflowCallback", 48 | "NeptuneCallback", 49 | "NeptuneMissingConfiguration", 50 | "TensorBoardCallback", 51 | "WandbCallback", 52 | "get_available_reporting_integrations", 53 | "get_reporting_integration_callbacks", 54 | "hp_params", 55 | "is_azureml_available", 56 | "is_clearml_available", 57 | "is_codecarbon_available", 58 | "is_comet_available", 59 | "is_dagshub_available", 60 | "is_fairscale_available", 61 | "is_flyte_deck_standard_available", 62 | "is_flytekit_available", 63 | "is_mlflow_available", 64 | "is_neptune_available", 65 | "is_optuna_available", 66 | "is_ray_available", 67 | "is_ray_tune_available", 68 | "is_sigopt_available", 69 | "is_tensorboard_available", 70 | "is_wandb_available", 71 | "rewrite_logs", 72 | "run_hp_search_optuna", 73 | "run_hp_search_ray", 74 | "run_hp_search_sigopt", 75 | "run_hp_search_wandb", 76 | ], 77 | "peft": ["PeftAdapterMixin"], 78 | } 79 | 80 | if TYPE_CHECKING: 81 | from .bitsandbytes import ( 82 | get_keys_to_not_convert, 83 | replace_8bit_linear, 84 | replace_with_bnb_linear, 85 | set_module_8bit_tensor_to_device, 86 | set_module_quantized_tensor_to_device, 87 | ) 88 | from .deepspeed import ( 89 | HfDeepSpeedConfig, 90 | HfTrainerDeepSpeedConfig, 91 | deepspeed_config, 92 | deepspeed_init, 93 | deepspeed_load_checkpoint, 94 | deepspeed_optim_sched, 95 | is_deepspeed_available, 96 | is_deepspeed_zero3_enabled, 97 | set_hf_deepspeed_config, 98 | unset_hf_deepspeed_config, 99 | ) 100 | from .integration_utils import ( 101 | INTEGRATION_TO_CALLBACK, 102 | AzureMLCallback, 103 | ClearMLCallback, 104 | CodeCarbonCallback, 105 | CometCallback, 106 | DagsHubCallback, 107 | FlyteCallback, 108 | MLflowCallback, 109 | NeptuneCallback, 110 | NeptuneMissingConfiguration, 111 | TensorBoardCallback, 112 | WandbCallback, 113 | get_available_reporting_integrations, 114 | get_reporting_integration_callbacks, 115 | hp_params, 116 | is_azureml_available, 117 | is_clearml_available, 118 | is_codecarbon_available, 119 | is_comet_available, 120 | is_dagshub_available, 121 | is_fairscale_available, 122 | is_flyte_deck_standard_available, 123 | is_flytekit_available, 124 | is_mlflow_available, 125 | is_neptune_available, 126 | is_optuna_available, 127 | is_ray_available, 128 | is_ray_tune_available, 129 | is_sigopt_available, 130 | is_tensorboard_available, 131 | is_wandb_available, 132 | rewrite_logs, 133 | run_hp_search_optuna, 134 | run_hp_search_ray, 135 | run_hp_search_sigopt, 136 | run_hp_search_wandb, 137 | ) 138 | from .peft import PeftAdapterMixin 139 | else: 140 | import sys 141 | 142 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 143 | 
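In use, only the first attribute access pays the import cost; a hedged example against the structure above (assuming the vendored `utils` module actually provides `_LazyModule` as in stock transformers, which this tree may not fully wire up):

from pytorch_transformers import integrations

# importing the package above pulled in none of the backends yet
print(integrations.is_wandb_available())   # first access imports integration_utils
callback_cls = integrations.WandbCallback  # later accesses hit the cached attribute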
-------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" 17 | 18 | import os 19 | import argparse 20 | import torch 21 | import numpy as np 22 | import tensorflow as tf 23 | from pytorch_transformers import BertModel 24 | 25 | 26 | def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str): 27 | 28 | """ 29 | :param model:BertModel Pytorch model instance to be converted 30 | :param ckpt_dir: Tensorflow model directory 31 | :param model_name: model name 32 | :return: 33 | 34 | Currently supported HF models: 35 | Y BertModel 36 | N BertForMaskedLM 37 | N BertForPreTraining 38 | N BertForMultipleChoice 39 | N BertForNextSentencePrediction 40 | N BertForSequenceClassification 41 | N BertForQuestionAnswering 42 | """ 43 | 44 | tensors_to_transpose = ( 45 | "dense.weight", 46 | "attention.self.query", 47 | "attention.self.key", 48 | "attention.self.value" 49 | ) 50 | 51 | var_map = ( 52 | ('layer.', 'layer_'), 53 | ('word_embeddings.weight', 'word_embeddings'), 54 | ('position_embeddings.weight', 'position_embeddings'), 55 | ('token_type_embeddings.weight', 'token_type_embeddings'), 56 | ('.', '/'), 57 | ('LayerNorm/weight', 'LayerNorm/gamma'), 58 | ('LayerNorm/bias', 'LayerNorm/beta'), 59 | ('weight', 'kernel') 60 | ) 61 | 62 | if not os.path.isdir(ckpt_dir): 63 | os.makedirs(ckpt_dir) 64 | 65 | state_dict = model.state_dict() 66 | 67 | def to_tf_var_name(name:str): 68 | for patt, repl in iter(var_map): 69 | name = name.replace(patt, repl) 70 | return 'bert/{}'.format(name) 71 | 72 | def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session): 73 | tf_dtype = tf.dtypes.as_dtype(tensor.dtype) 74 | tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) 75 | session.run(tf.variables_initializer([tf_var])) 76 | session.run(tf_var) 77 | return tf_var 78 | 79 | tf.reset_default_graph() 80 | with tf.Session() as session: 81 | for var_name in state_dict: 82 | tf_name = to_tf_var_name(var_name) 83 | torch_tensor = state_dict[var_name].numpy() 84 | if any([x in var_name for x in tensors_to_transpose]): 85 | torch_tensor = torch_tensor.T 86 | tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) 87 | tf.keras.backend.set_value(tf_var, torch_tensor) 88 | tf_weight = session.run(tf_var) 89 | print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) 90 | 91 | saver = tf.train.Saver(tf.trainable_variables()) 92 | saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) 93 | 94 | 95 | def main(raw_args=None): 96 | parser = 
argparse.ArgumentParser() 97 | parser.add_argument("--model_name", 98 | type=str, 99 | required=True, 100 | help="model name e.g. bert-base-uncased") 101 | parser.add_argument("--cache_dir", 102 | type=str, 103 | default=None, 104 | required=False, 105 | help="Directory containing pytorch model") 106 | parser.add_argument("--pytorch_model_path", 107 | type=str, 108 | required=True, 109 | help="/path/to/.bin") 110 | parser.add_argument("--tf_cache_dir", 111 | type=str, 112 | required=True, 113 | help="Directory in which to save tensorflow model") 114 | args = parser.parse_args(raw_args) 115 | 116 | model = BertModel.from_pretrained( 117 | pretrained_model_name_or_path=args.model_name, 118 | state_dict=torch.load(args.pytorch_model_path), 119 | cache_dir=args.cache_dir 120 | ) 121 | 122 | convert_pytorch_checkpoint_to_tf( 123 | model=model, 124 | ckpt_dir=args.tf_cache_dir, 125 | model_name=args.model_name 126 | ) 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /larimar_base/modules/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")): 5 | """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering 6 | Args: 7 | logits: logits distribution shape (vocabulary size) 8 | top_k >0: keep only top k tokens with highest probability (top-k filtering). 9 | top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). 10 | Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) 11 | """ 12 | assert ( 13 | logits.dim() == 1 14 | ) # batch size 1 for now - could be updated for more but the code would be less clear 15 | top_k = min(top_k, logits.size(-1)) # Safety check 16 | if top_k > 0: 17 | # Remove all tokens with a probability less than the last token of the top-k 18 | indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] 19 | logits[indices_to_remove] = filter_value 20 | 21 | if top_p > 0.0: 22 | sorted_logits, sorted_indices = torch.sort(logits, descending=True) 23 | cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) 24 | 25 | # Remove tokens with cumulative probability above the threshold 26 | sorted_indices_to_remove = cumulative_probs > top_p 27 | # Shift the indices to the right to keep also the first token above the threshold 28 | sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() 29 | sorted_indices_to_remove[..., 0] = 0 30 | 31 | indices_to_remove = sorted_indices[sorted_indices_to_remove] 32 | logits[indices_to_remove] = filter_value 33 | return logits 34 | 35 | def top_k_top_p_filtering_batch(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): 36 | """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering 37 | Args: 38 | logits: logits distribution shape (vocabulary size) 39 | top_k > 0: keep only top k tokens with highest probability (top-k filtering). 40 | top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). 41 | Nucleus filtering is described in Holtzman et al. 
(http://arxiv.org/abs/1904.09751) 42 | From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 43 | """ 44 | # assert logits.dim() == 1 # batch size 1 for now - could be updated for more but the code would be less clear 45 | 46 | top_k = min(top_k, logits.size(-1)) # Safety check 47 | 48 | if top_k > 0: 49 | # Remove all tokens with a probability less than the last token of the top-k 50 | indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] 51 | # logits.masked_fill_(logits < threshold, filter_value) # (B, vocab_size) 52 | logits[indices_to_remove] = filter_value 53 | 54 | if top_p > 0.0: 55 | sorted_logits, sorted_indices = torch.sort(logits, descending=True) # (B, vocab_size) 56 | cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) # (B, vocab_size) 57 | 58 | # Remove tokens with cumulative probability above the threshold 59 | sorted_indices_to_remove = cumulative_probs > top_p 60 | 61 | # Shift the indices to the right to keep also the first token above the threshold 62 | sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() 63 | sorted_indices_to_remove[..., 0] = 0 64 | 65 | # indices_to_remove = sorted_indices[sorted_indices_to_remove] 66 | 67 | # logits.masked_fill_(indices_to_remove, filter_value) 68 | indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove) 69 | logits[indices_to_remove] = filter_value 70 | 71 | return logits 72 | 73 | def safe_log(z): 74 | return torch.log(z + 1e-7) 75 | 76 | def log_sum_exp(value, dim=None, keepdim=False): 77 | """Numerically stable implementation of the operation 78 | value.exp().sum(dim, keepdim).log() 79 | """ 80 | if dim is not None: 81 | m, _ = torch.max(value, dim=dim, keepdim=True) 82 | value0 = value - m 83 | if keepdim is False: 84 | m = m.squeeze(dim) 85 | return m + torch.log(torch.sum(torch.exp(value0), dim=dim, keepdim=keepdim)) 86 | else: 87 | m = torch.max(value) 88 | sum_exp = torch.sum(torch.exp(value - m)) 89 | return m + torch.log(sum_exp) 90 | 91 | 92 | def generate_grid(zmin, zmax, dz, device, ndim=2): 93 | """generate a 1- or 2-dimensional grid 94 | Returns: Tensor, int 95 | Tensor: The grid tensor with shape (k^2, 2), 96 | where k=(zmax - zmin)/dz 97 | int: k 98 | """ 99 | 100 | if ndim == 2: 101 | x = torch.arange(zmin, zmax, dz) 102 | k = x.size(0) 103 | 104 | x1 = x.unsqueeze(1).repeat(1, k).view(-1) 105 | x2 = x.repeat(k) 106 | 107 | return torch.cat((x1.unsqueeze(-1), x2.unsqueeze(-1)), dim=-1).to(device), k 108 | 109 | elif ndim == 1: 110 | return torch.arange(zmin, zmax, dz).unsqueeze(1).to(device) 111 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/utils/peft_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
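Looking back at `modules/utils.py` above, a short sampling sketch using `top_k_top_p_filtering_batch` (the logits here are random stand-ins, and the import path assumes `larimar_base` is on `sys.path`):

import torch
import torch.nn.functional as F

from modules.utils import top_k_top_p_filtering_batch

logits = torch.randn(2, 50257)  # (batch, vocab_size), e.g. a GPT-2 sized vocab
filtered = top_k_top_p_filtering_batch(logits.clone(), top_k=50, top_p=0.95)
probs = F.softmax(filtered, dim=-1)  # filtered entries were set to -inf, i.e. probability 0
next_token = torch.multinomial(probs, num_samples=1)  # (batch, 1)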
14 | import importlib 15 | import os 16 | from typing import Dict, Optional, Union 17 | 18 | from packaging import version 19 | 20 | from .hub import cached_file 21 | from .import_utils import is_peft_available 22 | 23 | 24 | ADAPTER_CONFIG_NAME = "adapter_config.json" 25 | ADAPTER_WEIGHTS_NAME = "adapter_model.bin" 26 | ADAPTER_SAFE_WEIGHTS_NAME = "adapter_model.safetensors" 27 | 28 | 29 | def find_adapter_config_file( 30 | model_id: str, 31 | cache_dir: Optional[Union[str, os.PathLike]] = None, 32 | force_download: bool = False, 33 | resume_download: bool = False, 34 | proxies: Optional[Dict[str, str]] = None, 35 | token: Optional[Union[bool, str]] = None, 36 | revision: Optional[str] = None, 37 | local_files_only: bool = False, 38 | subfolder: str = "", 39 | _commit_hash: Optional[str] = None, 40 | ) -> Optional[str]: 41 | r""" 42 | Checks whether the model stored on the Hub or locally is an adapter model, returning the path to the adapter 43 | config file if it is, and None otherwise. 44 | 45 | Args: 46 | model_id (`str`): 47 | The identifier of the model to look for, can be either a local path or an id to the repository on the Hub. 48 | cache_dir (`str` or `os.PathLike`, *optional*): 49 | Path to a directory in which a downloaded pretrained model configuration should be cached if the standard 50 | cache should not be used. 51 | force_download (`bool`, *optional*, defaults to `False`): 52 | Whether or not to force to (re-)download the configuration files and override the cached versions if they 53 | exist. 54 | resume_download (`bool`, *optional*, defaults to `False`): 55 | Whether or not to delete an incompletely received file. Attempts to resume the download if such a file exists. 56 | proxies (`Dict[str, str]`, *optional*): 57 | A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 58 | 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. 59 | token (`str` or *bool*, *optional*): 60 | The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated 61 | when running `huggingface-cli login` (stored in `~/.huggingface`). 62 | revision (`str`, *optional*, defaults to `"main"`): 63 | The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a 64 | git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any 65 | identifier allowed by git. 66 | 67 | 68 | 69 | To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`. 70 | 71 | 72 | 73 | local_files_only (`bool`, *optional*, defaults to `False`): 74 | If `True`, will only try to load the tokenizer configuration from local files. 75 | subfolder (`str`, *optional*, defaults to `""`): 76 | In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can 77 | specify the folder name here. 
78 | """ 79 | adapter_cached_filename = None 80 | if model_id is None: 81 | return None 82 | elif os.path.isdir(model_id): 83 | list_remote_files = os.listdir(model_id) 84 | if ADAPTER_CONFIG_NAME in list_remote_files: 85 | adapter_cached_filename = os.path.join(model_id, ADAPTER_CONFIG_NAME) 86 | else: 87 | adapter_cached_filename = cached_file( 88 | model_id, 89 | ADAPTER_CONFIG_NAME, 90 | cache_dir=cache_dir, 91 | force_download=force_download, 92 | resume_download=resume_download, 93 | proxies=proxies, 94 | token=token, 95 | revision=revision, 96 | local_files_only=local_files_only, 97 | subfolder=subfolder, 98 | _commit_hash=_commit_hash, 99 | _raise_exceptions_for_missing_entries=False, 100 | _raise_exceptions_for_connection_errors=False, 101 | ) 102 | 103 | return adapter_cached_filename 104 | 105 | 106 | def check_peft_version(min_version: str) -> None: 107 | r""" 108 | Checks if the version of PEFT is compatible. 109 | 110 | Args: 111 | version (`str`): 112 | The version of PEFT to check against. 113 | """ 114 | if not is_peft_available(): 115 | raise ValueError("PEFT is not installed. Please install it with `pip install peft`") 116 | 117 | is_peft_version_compatible = version.parse(importlib.metadata.version("peft")) >= version.parse(min_version) 118 | 119 | if not is_peft_version_compatible: 120 | raise ValueError( 121 | f"The version of PEFT you are using is not compatible, please use a version that is greater" 122 | f" than {min_version}" 123 | ) 124 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tests/tokenization_xlnet_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | 20 | from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE) 21 | 22 | from .tokenization_tests_commons import CommonTestCases 23 | 24 | SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), 25 | 'fixtures/test_sentencepiece.model') 26 | 27 | class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): 28 | 29 | tokenizer_class = XLNetTokenizer 30 | 31 | def setUp(self): 32 | super(XLNetTokenizationTest, self).setUp() 33 | 34 | # We have a SentencePiece fixture for testing 35 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) 36 | tokenizer.save_pretrained(self.tmpdirname) 37 | 38 | def get_tokenizer(self, **kwargs): 39 | return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs) 40 | 41 | def get_input_output_texts(self): 42 | input_text = u"This is a test" 43 | output_text = u"This is a test" 44 | return input_text, output_text 45 | 46 | 47 | def test_full_tokenizer(self): 48 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) 49 | 50 | tokens = tokenizer.tokenize(u'This is a test') 51 | self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est']) 52 | 53 | self.assertListEqual( 54 | tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) 55 | 56 | tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") 57 | self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', 58 | u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', 59 | u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 60 | SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.']) 61 | ids = tokenizer.convert_tokens_to_ids(tokens) 62 | self.assertListEqual( 63 | ids, [8, 21, 84, 55, 24, 19, 7, 0, 64 | 602, 347, 347, 347, 3, 12, 66, 65 | 46, 72, 80, 6, 0, 4]) 66 | 67 | back_tokens = tokenizer.convert_ids_to_tokens(ids) 68 | self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', 69 | u'or', u'n', SPIECE_UNDERLINE + u'in', 70 | SPIECE_UNDERLINE + u'', u'', u'2', u'0', u'0', u'0', u',', 71 | SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 72 | SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', 73 | u'', u'.']) 74 | 75 | def test_tokenizer_lower(self): 76 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True) 77 | tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") 78 | self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'', u'i', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', 79 | u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', 80 | u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 81 | SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.']) 82 | self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), [u"▁he", u"ll", u"o"]) 83 | 84 | def test_tokenizer_no_lower(self): 85 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False) 86 | tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") 87 | self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', 88 | u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', 89 | u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 90 | 
SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.']) 91 | 92 | def test_sequence_builders(self): 93 | tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") 94 | 95 | text = tokenizer.encode("sequence builders") 96 | text_2 = tokenizer.encode("multi-sequence build") 97 | 98 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 99 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 100 | 101 | assert encoded_sentence == text + [4, 3] 102 | assert encoded_pair == text + [4] + text_2 + [4, 3] 103 | 104 | 105 | if __name__ == '__main__': 106 | unittest.main() 107 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/utils/hp_naming.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import copy 16 | import re 17 | 18 | 19 | class TrialShortNamer: 20 | PREFIX = "hp" 21 | DEFAULTS = {} 22 | NAMING_INFO = None 23 | 24 | @classmethod 25 | def set_defaults(cls, prefix, defaults): 26 | cls.PREFIX = prefix 27 | cls.DEFAULTS = defaults 28 | cls.build_naming_info() 29 | 30 | @staticmethod 31 | def shortname_for_word(info, word): 32 | if len(word) == 0: 33 | return "" 34 | short_word = None 35 | if any(char.isdigit() for char in word): 36 | raise Exception(f"Parameters should not contain numbers: '{word}' contains a number") 37 | if word in info["short_word"]: 38 | return info["short_word"][word] 39 | for prefix_len in range(1, len(word) + 1): 40 | prefix = word[:prefix_len] 41 | if prefix in info["reverse_short_word"]: 42 | continue 43 | else: 44 | short_word = prefix 45 | break 46 | 47 | if short_word is None: 48 | # Paranoid fallback 49 | def int_to_alphabetic(integer): 50 | s = "" 51 | while integer != 0: 52 | s = chr(ord("A") + integer % 10) + s 53 | integer //= 10 54 | return s 55 | 56 | i = 0 57 | while True: 58 | sword = word + "#" + int_to_alphabetic(i) 59 | if sword in info["reverse_short_word"]: 60 | continue 61 | else: 62 | short_word = sword 63 | break 64 | 65 | info["short_word"][word] = short_word 66 | info["reverse_short_word"][short_word] = word 67 | return short_word 68 | 69 | @staticmethod 70 | def shortname_for_key(info, param_name): 71 | words = param_name.split("_") 72 | 73 | shortname_parts = [TrialShortNamer.shortname_for_word(info, word) for word in words] 74 | 75 | # We try to create a separatorless short name, but if there is a collision we have to fallback 76 | # to a separated short name 77 | separators = ["", "_"] 78 | 79 | for separator in separators: 80 | shortname = separator.join(shortname_parts) 81 | if shortname not in info["reverse_short_param"]: 82 | info["short_param"][param_name] = shortname 83 | info["reverse_short_param"][shortname] = param_name 84 | return shortname 85 | 86 | return param_name 87 | 88 | @staticmethod 89 | def 
add_new_param_name(info, param_name): 90 | short_name = TrialShortNamer.shortname_for_key(info, param_name) 91 | info["short_param"][param_name] = short_name 92 | info["reverse_short_param"][short_name] = param_name 93 | 94 | @classmethod 95 | def build_naming_info(cls): 96 | if cls.NAMING_INFO is not None: 97 | return 98 | 99 | info = { 100 | "short_word": {}, 101 | "reverse_short_word": {}, 102 | "short_param": {}, 103 | "reverse_short_param": {}, 104 | } 105 | 106 | field_keys = list(cls.DEFAULTS.keys()) 107 | 108 | for k in field_keys: 109 | cls.add_new_param_name(info, k) 110 | 111 | cls.NAMING_INFO = info 112 | 113 | @classmethod 114 | def shortname(cls, params): 115 | cls.build_naming_info() 116 | assert cls.PREFIX is not None 117 | name = [copy.copy(cls.PREFIX)] 118 | 119 | for k, v in params.items(): 120 | if k not in cls.DEFAULTS: 121 | raise Exception(f"You should provide a default value for the param name {k} with value {v}") 122 | if v == cls.DEFAULTS[k]: 123 | # The default value is not added to the name 124 | continue 125 | 126 | key = cls.NAMING_INFO["short_param"][k] 127 | 128 | if isinstance(v, bool): 129 | v = 1 if v else 0 130 | 131 | sep = "" if isinstance(v, (int, float)) else "-" 132 | e = f"{key}{sep}{v}" 133 | name.append(e) 134 | 135 | return "_".join(name) 136 | 137 | @classmethod 138 | def parse_repr(cls, repr): 139 | repr = repr[len(cls.PREFIX) + 1 :] 140 | if repr == "": 141 | values = [] 142 | else: 143 | values = repr.split("_") 144 | 145 | parameters = {} 146 | 147 | for value in values: 148 | if "-" in value: 149 | p_k, p_v = value.split("-") 150 | else: 151 | p_k = re.sub("[0-9.]", "", value) 152 | p_v = float(re.sub("[^0-9.]", "", value)) 153 | 154 | key = cls.NAMING_INFO["reverse_short_param"][p_k] 155 | 156 | parameters[key] = p_v 157 | 158 | for k in cls.DEFAULTS: 159 | if k not in parameters: 160 | parameters[k] = cls.DEFAULTS[k] 161 | 162 | return parameters 163 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/configuration_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ OpenAI GPT configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json" 31 | } 32 | 33 | class OpenAIGPTConfig(PretrainedConfig): 34 | """ 35 | Configuration class to store the configuration of a `OpenAIGPTModel`. 
36 | 37 | Args: 38 | vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `OpenAIGPTModel` or a configuration json file. 39 | n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) 40 | n_positions: Number of positional embeddings. 41 | n_ctx: Size of the causal mask (usually same as n_positions). 42 | n_embd: Dimensionality of the embeddings and hidden states. 43 | n_layer: Number of hidden layers in the Transformer encoder. 44 | n_head: Number of attention heads for each attention layer in 45 | the Transformer encoder. 46 | afn: The non-linear activation function (function or string) in the 47 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 48 | resid_pdrop: The dropout probability for all fully connected 49 | layers in the embeddings, encoder, and pooler. 50 | attn_pdrop: The dropout ratio for the attention 51 | probabilities. 52 | embd_pdrop: The dropout ratio for the embeddings. 53 | layer_norm_epsilon: epsilon to use in the layer norm layers 54 | initializer_range: The stddev of the truncated_normal_initializer for 55 | initializing all weight matrices. 56 | predict_special_tokens: should we predict special tokens (when the model has an LM head) 57 | """ 58 | pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP 59 | 60 | def __init__( 61 | self, 62 | vocab_size_or_config_json_file=40478, 63 | n_positions=512, 64 | n_ctx=512, 65 | n_embd=768, 66 | n_layer=12, 67 | n_head=12, 68 | afn="gelu", 69 | resid_pdrop=0.1, 70 | embd_pdrop=0.1, 71 | attn_pdrop=0.1, 72 | layer_norm_epsilon=1e-5, 73 | initializer_range=0.02, 74 | predict_special_tokens=True, 75 | 76 | num_labels=1, 77 | summary_type='cls_index', 78 | summary_use_proj=True, 79 | summary_activation=None, 80 | summary_proj_to_labels=True, 81 | summary_first_dropout=0.1, 82 | **kwargs 83 | ): 84 | """Constructs OpenAIGPTConfig. 
85 | """ 86 | super(OpenAIGPTConfig, self).__init__(**kwargs) 87 | 88 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 89 | and isinstance(vocab_size_or_config_json_file, unicode)): 90 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 91 | json_config = json.loads(reader.read()) 92 | for key, value in json_config.items(): 93 | self.__dict__[key] = value 94 | elif isinstance(vocab_size_or_config_json_file, int): 95 | self.vocab_size = vocab_size_or_config_json_file 96 | self.n_ctx = n_ctx 97 | self.n_positions = n_positions 98 | self.n_embd = n_embd 99 | self.n_layer = n_layer 100 | self.n_head = n_head 101 | self.afn = afn 102 | self.resid_pdrop = resid_pdrop 103 | self.embd_pdrop = embd_pdrop 104 | self.attn_pdrop = attn_pdrop 105 | self.layer_norm_epsilon = layer_norm_epsilon 106 | self.initializer_range = initializer_range 107 | self.predict_special_tokens = predict_special_tokens 108 | 109 | self.num_labels = num_labels 110 | self.summary_type = summary_type 111 | self.summary_use_proj = summary_use_proj 112 | self.summary_activation = summary_activation 113 | self.summary_first_dropout = summary_first_dropout 114 | self.summary_proj_to_labels = summary_proj_to_labels 115 | else: 116 | raise ValueError( 117 | "First argument must be either a vocabulary size (int)" 118 | "or the path to a pretrained model config file (str)" 119 | ) 120 | 121 | @property 122 | def max_position_embeddings(self): 123 | return self.n_positions 124 | 125 | @property 126 | def hidden_size(self): 127 | return self.n_embd 128 | 129 | @property 130 | def num_attention_heads(self): 131 | return self.n_head 132 | 133 | @property 134 | def num_hidden_layers(self): 135 | return self.n_layer 136 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tests/tokenization_bert_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | from io import open 20 | 21 | from pytorch_transformers.tokenization_bert import (BasicTokenizer, 22 | BertTokenizer, 23 | WordpieceTokenizer, 24 | _is_control, _is_punctuation, 25 | _is_whitespace, VOCAB_FILES_NAMES) 26 | 27 | from .tokenization_tests_commons import CommonTestCases 28 | 29 | class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): 30 | 31 | tokenizer_class = BertTokenizer 32 | 33 | def setUp(self): 34 | super(BertTokenizationTest, self).setUp() 35 | 36 | vocab_tokens = [ 37 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 38 | "##ing", ",", "low", "lowest", 39 | ] 40 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 41 | with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: 42 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 43 | 44 | def get_tokenizer(self, **kwargs): 45 | return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) 46 | 47 | def get_input_output_texts(self): 48 | input_text = u"UNwant\u00E9d,running" 49 | output_text = u"unwanted, running" 50 | return input_text, output_text 51 | 52 | def test_full_tokenizer(self): 53 | tokenizer = self.tokenizer_class(self.vocab_file) 54 | 55 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") 56 | self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) 57 | self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) 58 | 59 | def test_chinese(self): 60 | tokenizer = BasicTokenizer() 61 | 62 | self.assertListEqual( 63 | tokenizer.tokenize(u"ah\u535A\u63A8zz"), 64 | [u"ah", u"\u535A", u"\u63A8", u"zz"]) 65 | 66 | def test_basic_tokenizer_lower(self): 67 | tokenizer = BasicTokenizer(do_lower_case=True) 68 | 69 | self.assertListEqual( 70 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), 71 | ["hello", "!", "how", "are", "you", "?"]) 72 | self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) 73 | 74 | def test_basic_tokenizer_no_lower(self): 75 | tokenizer = BasicTokenizer(do_lower_case=False) 76 | 77 | self.assertListEqual( 78 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? 
"), 79 | ["HeLLo", "!", "how", "Are", "yoU", "?"]) 80 | 81 | def test_wordpiece_tokenizer(self): 82 | vocab_tokens = [ 83 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 84 | "##ing" 85 | ] 86 | 87 | vocab = {} 88 | for (i, token) in enumerate(vocab_tokens): 89 | vocab[token] = i 90 | tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") 91 | 92 | self.assertListEqual(tokenizer.tokenize(""), []) 93 | 94 | self.assertListEqual( 95 | tokenizer.tokenize("unwanted running"), 96 | ["un", "##want", "##ed", "runn", "##ing"]) 97 | 98 | self.assertListEqual( 99 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) 100 | 101 | def test_is_whitespace(self): 102 | self.assertTrue(_is_whitespace(u" ")) 103 | self.assertTrue(_is_whitespace(u"\t")) 104 | self.assertTrue(_is_whitespace(u"\r")) 105 | self.assertTrue(_is_whitespace(u"\n")) 106 | self.assertTrue(_is_whitespace(u"\u00A0")) 107 | 108 | self.assertFalse(_is_whitespace(u"A")) 109 | self.assertFalse(_is_whitespace(u"-")) 110 | 111 | def test_is_control(self): 112 | self.assertTrue(_is_control(u"\u0005")) 113 | 114 | self.assertFalse(_is_control(u"A")) 115 | self.assertFalse(_is_control(u" ")) 116 | self.assertFalse(_is_control(u"\t")) 117 | self.assertFalse(_is_control(u"\r")) 118 | 119 | def test_is_punctuation(self): 120 | self.assertTrue(_is_punctuation(u"-")) 121 | self.assertTrue(_is_punctuation(u"$")) 122 | self.assertTrue(_is_punctuation(u"`")) 123 | self.assertTrue(_is_punctuation(u".")) 124 | 125 | self.assertFalse(_is_punctuation(u"A")) 126 | self.assertFalse(_is_punctuation(u" ")) 127 | 128 | def test_sequence_builders(self): 129 | tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased") 130 | 131 | text = tokenizer.encode("sequence builders") 132 | text_2 = tokenizer.encode("multi-sequence build") 133 | 134 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 135 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 136 | 137 | assert encoded_sentence == [101] + text + [102] 138 | assert encoded_pair == [101] + text + [102] + text_2 + [102] 139 | 140 | if __name__ == '__main__': 141 | unittest.main() 142 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert Transformer XL checkpoint and datasets.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import os 21 | import sys 22 | from io import open 23 | 24 | import torch 25 | 26 | import pytorch_transformers.tokenization_transfo_xl as data_utils 27 | 28 | from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME 29 | from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel, 30 | load_tf_weights_in_transfo_xl) 31 | from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES) 32 | 33 | if sys.version_info[0] == 2: 34 | import cPickle as pickle 35 | else: 36 | import pickle 37 | 38 | import logging 39 | logging.basicConfig(level=logging.INFO) 40 | 41 | # We do this to be able to load python 2 datasets pickles 42 | # See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 43 | data_utils.Vocab = data_utils.TransfoXLTokenizer 44 | data_utils.Corpus = data_utils.TransfoXLCorpus 45 | sys.modules['data_utils'] = data_utils 46 | sys.modules['vocabulary'] = data_utils 47 | 48 | def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path, 49 | transfo_xl_config_file, 50 | pytorch_dump_folder_path, 51 | transfo_xl_dataset_file): 52 | if transfo_xl_dataset_file: 53 | # Convert a pre-processed corpus (see original TensorFlow repo) 54 | with open(transfo_xl_dataset_file, "rb") as fp: 55 | corpus = pickle.load(fp, encoding="latin1") 56 | # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) 57 | pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['pretrained_vocab_file'] 58 | print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) 59 | corpus_vocab_dict = corpus.vocab.__dict__ 60 | torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) 61 | 62 | corpus_dict_no_vocab = corpus.__dict__ 63 | corpus_dict_no_vocab.pop('vocab', None) 64 | pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME 65 | print("Save dataset to {}".format(pytorch_dataset_dump_path)) 66 | torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) 67 | 68 | if tf_checkpoint_path: 69 | # Convert a pre-trained TensorFlow model 70 | config_path = os.path.abspath(transfo_xl_config_file) 71 | tf_path = os.path.abspath(tf_checkpoint_path) 72 | 73 | print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) 74 | # Initialise PyTorch model 75 | if transfo_xl_config_file == "": 76 | config = TransfoXLConfig() 77 | else: 78 | config = TransfoXLConfig.from_json_file(transfo_xl_config_file) 79 | print("Building PyTorch model from configuration: {}".format(str(config))) 80 | model = TransfoXLLMHeadModel(config) 81 | 82 | model = load_tf_weights_in_transfo_xl(model, config, tf_path) 83 | # Save pytorch-model 84 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 85 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 86 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 87 | torch.save(model.state_dict(), pytorch_weights_dump_path) 88 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 89 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 90 | f.write(config.to_json_string()) 91 | 92 | 93 | if __name__ == "__main__": 94 | parser = argparse.ArgumentParser() 95 | 
parser.add_argument("--pytorch_dump_folder_path", 96 | default = None, 97 | type = str, 98 | required = True, 99 | help = "Path to the folder to store the PyTorch model or dataset/vocab.") 100 | parser.add_argument("--tf_checkpoint_path", 101 | default = "", 102 | type = str, 103 | help = "An optional path to a TensorFlow checkpoint path to be converted.") 104 | parser.add_argument("--transfo_xl_config_file", 105 | default = "", 106 | type = str, 107 | help = "An optional config json file corresponding to the pre-trained Transformer XL model. \n" 108 | "This specifies the model architecture.") 109 | parser.add_argument("--transfo_xl_dataset_file", 110 | default = "", 111 | type = str, 112 | help = "An optional dataset file to be converted into a vocabulary.") 113 | args = parser.parse_args() 114 | convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path, 115 | args.transfo_xl_config_file, 116 | args.pytorch_dump_folder_path, 117 | args.transfo_xl_dataset_file) 118 | -------------------------------------------------------------------------------- /larimar_base/modules/spacefusion.py: -------------------------------------------------------------------------------- 1 | from .vae import VAE 2 | import numpy as np 3 | import torch, copy, pdb 4 | import torch.nn.functional as F 5 | 6 | from torch import nn 7 | 8 | import pdb 9 | 10 | 11 | def set_trainable(module, value): 12 | for param in module.parameters(): 13 | param.requires_grad = value 14 | 15 | class SpaceFusion(VAE): 16 | def __init__(self, encoder, decoder, tokenizer_encoder, tokenizer_decoder, args): 17 | super(SpaceFusion, self).__init__(encoder, decoder, tokenizer_encoder, tokenizer_decoder, args) 18 | children = [v for v in encoder.encoder.layer.children()] # list of 12 BertLayer 19 | 20 | self.num_s2s_bert_layer = args.num_s2s_bert_layer 21 | self.S2S_layers = nn.ModuleList([copy.deepcopy(c) for c in children[-args.num_s2s_bert_layer:] ]) # copies of the last num_s2s_bert_layer encoder layers 22 | self.S2S_pooler = copy.deepcopy(encoder.pooler) 23 | self.ix_turn_sep = tokenizer_encoder.convert_tokens_to_ids('[SEP]') 24 | if args.freeze_bert: 25 | print('@'*20 + f' freezing BERT {args.num_frozen_bert_layer} layers') 26 | for child in children[:args.num_frozen_bert_layer]: 27 | set_trainable(child, False) 28 | 29 | 30 | 31 | def ids2speaker(self, ids): 32 | # 0 for speaker A, 1 for speaker B 33 | N, T = ids.shape 34 | speaker = np.zeros((N, T)) 35 | sep = ids == self.ix_turn_sep 36 | for i in range(N): 37 | is_B = False # start with speaker A 38 | for t in range(T): 39 | speaker[i,t] = int(is_B) 40 | if sep[i,t].item(): 41 | is_B = not is_B 42 | 43 | # make sure the final speaker is speaker B (so response is always speaker A) 44 | if not is_B: 45 | speaker = 1 - speaker 46 | 47 | return torch.LongTensor(speaker).to(ids.device) 48 | 49 | def forward(self, inputs_src, inputs_tgt, labels_tgt, return_vec=False): # [batch, time] 50 | # toggle config to get desired encoder output 51 | self.encoder.encoder.output_attentions = False 52 | self.encoder.encoder.output_hidden_states = True 53 | 54 | 55 | # AE encoder 56 | mask = (inputs_tgt > 0).float().to(inputs_src.device) 57 | outputs = self.encoder(inputs_tgt, attention_mask=mask) 58 | z_AE, _ = self.connect(outputs[1]) 59 | z_AE = z_AE.squeeze(1) 60 | 61 | # S2S encoder 62 | mask = (inputs_src > 0).float() 63 | speaker = self.ids2speaker(inputs_src) 64 | outputs = self.encoder(inputs_src, attention_mask=mask, token_type_ids=speaker) 65 | _, _, all_layer_attn = outputs # last_layer_attn, pooled, all_layer_attn = outputs
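# Note: despite its name, all_layer_attn holds the per-layer hidden states
# (output_hidden_states=True was set above). The S2S path restarts from the
# hidden state just below the copied top blocks, so the AE and S2S encoders
# share the (optionally frozen) lower BERT layers and specialize only at the top.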
66 |         seq_z_prev = all_layer_attn[-self.num_s2s_bert_layer-1]  # hidden states just below the S2S layers
67 | 
68 |         for s2s in self.S2S_layers:
69 |             layer_outputs = s2s(seq_z_prev, attention_mask=mask.unsqueeze(1).unsqueeze(1))
70 |             seq_z_prev = layer_outputs[0]
71 | 
72 |         z_S2S = self.encoder.pooler(layer_outputs[0])
73 |         z_S2S, _ = self.connect(z_S2S)
74 |         z_S2S = z_S2S.squeeze(1)
75 | 
76 |         if return_vec:
77 |             return z_AE, z_S2S
78 | 
79 |         # interpolation/smoothness
80 |         u = torch.FloatTensor(np.random.random((z_AE.shape[0], 1))).to(inputs_tgt.device)
81 |         z_interp = u * z_AE + (1 - u) * z_S2S
82 |         std = 0.1
83 |         noise = torch.FloatTensor(np.random.normal(size=z_interp.shape) * std).to(z_interp.device)
84 |         z_interp = z_interp + noise
85 | 
86 |         loss_rec = 0
87 |         z_idx = 0
88 |         for z in [z_AE, z_S2S, z_interp]:
89 | 
90 |             past = z  # past = self.decoder.linear(z)
91 |             outputs = self.decoder(input_ids=labels_tgt, past=past, labels=labels_tgt, label_ignore=self.pad_token_id)
92 |             if z_idx == 1:
93 |                 loss_rec = loss_rec + 1.0 * outputs[0]  # weight for the S2S reconstruction term
94 |             else:
95 |                 loss_rec = loss_rec + outputs[0]
96 |             z_idx += 1
97 |         loss_rec = loss_rec / 3
98 | 
99 |         # fusion/regularization
100 |         L_pull = self.dist_pair(z_AE, z_S2S)
101 |         L_push = torch.stack([self.dist_batch(z) for z in [z_AE, z_S2S]]).min()
102 |         loss_reg = (L_pull - L_push * 2) / np.sqrt(z_AE.shape[-1])
103 | 
104 |         loss = loss_rec + self.args.beta * loss_reg
105 |         return loss_rec, loss_reg, loss
106 | 
107 |     def sent2latent(self, inputs_src):
108 |         # toggle config to get desired encoder output
109 |         self.encoder.encoder.output_attentions = False
110 |         self.encoder.encoder.output_hidden_states = True
111 | 
112 |         # S2S encoder
113 |         mask = (inputs_src > 0).float()
114 |         speaker = self.ids2speaker(inputs_src)
115 |         outputs = self.encoder(inputs_src, attention_mask=mask, token_type_ids=speaker)
116 | 
117 |         _, _, all_layer_attn = outputs  # last_layer_attn, pooled, all_layer_attn = outputs
118 | 
119 | 
120 | 
121 |         seq_z_prev = all_layer_attn[-self.num_s2s_bert_layer-1]  # hidden states just below the S2S layers
122 |         for s2s in self.S2S_layers:
123 |             layer_outputs = s2s(seq_z_prev, attention_mask=mask.unsqueeze(1).unsqueeze(1))
124 |             seq_z_prev = layer_outputs[0]
125 | 
126 |         z_S2S = self.encoder.pooler(layer_outputs[0])
127 |         z_S2S, _ = self.connect(z_S2S)
128 |         z_S2S = z_S2S.squeeze(1)
129 | 
130 |         return z_S2S
131 | 
132 | 
133 |     def dist_pair(self, a, b):
134 |         return F.pairwise_distance(a, b).mean()
135 | 
136 | 
137 |     def dist_batch(self, vec):
138 |         n = vec.shape[0]
139 |         dmin = []
140 |         for i in range(n):
141 |             dd = F.pairwise_distance(vec[i:i+1, :].repeat(n, 1), vec)
142 |             dmin.append(dd.min())
143 |         return torch.stack(dmin).mean()
--------------------------------------------------------------------------------
/larimar_base/ddp.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import subprocess
4 | import numpy as np
5 | 
6 | import torch.distributed as dist
7 | 
8 | 
9 | def get_nccl_socket_ifname():
10 |     ipa = subprocess.run(['ip', 'a'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
11 |     lines = ipa.stdout.decode('utf-8').split('\n')
12 |     all_names = []
13 |     name = None
14 |     for line in lines:
15 |         if line and not line[0] == ' ':
16 |             name = line.split(':')[1].strip()
17 |             continue
18 |         if 'link/infiniband' in line:
19 |             all_names.append(name)
20 |     os.environ['NCCL_SOCKET_IFNAME'] = ','.join(all_names)
21 | 
22 | 
23 | def fix_infiniband():
24 |     # os.environ['NCCL_SOCKET_IFNAME'] = "^lo,docker,virbr,vmnet,vboxnet,wl,ww,ppp,bond"
25 | 
26 |     # ifname = os.environ.get('NCCL_SOCKET_IFNAME', None)
27 |     # if ifname is None:
28 |     #     os.environ['NCCL_SOCKET_IFNAME'] = '^lo,docker0'
29 |     get_nccl_socket_ifname()
30 |     os.environ['NCCL_IB_CUDA_SUPPORT'] = '1'
31 |     ibv = subprocess.run('ibv_devinfo', stdout=subprocess.PIPE, stderr=subprocess.PIPE)
32 |     lines = ibv.stdout.decode('utf-8').split('\n')
33 |     exclude = ''
34 |     include = ''
35 |     for line in lines:
36 |         if 'hca_id:' in line:
37 |             name = line.split(':')[1].strip()
38 |         if '\tport:' in line:
39 |             port = line.split(':')[1].strip()
40 |         if 'link_layer:' in line and 'Ethernet' in line:
41 |             exclude = exclude + f'{name}:{port},'
42 |         if 'link_layer:' in line and 'infiniband' in line.lower():
43 |             include = include + f'{name}:{port},'
44 |     if exclude:
45 |         exclude = '^' + exclude[:-1]
46 |         # print(exclude)
47 |         os.environ['NCCL_IB_HCA'] = exclude
48 |     else:
49 |         os.environ['NCCL_IB_HCA'] = include[:-1]
50 | 
51 | 
52 | 
53 | fix_inifiniband = fix_infiniband  # For backwards compatibility (old misspelled name)
54 | 
55 | def init_ddp_process_group(local_rank: int = None, port: int = None, world_size: int = None, dist_rank: int = None,
56 |                            overwrite_env_vars=True):
57 |     logger = logging.getLogger('InitDDP')
58 |     if os.environ.get('LSB_JOBID', False):
59 |         local_rank = int(os.environ.get('LSF_PM_XPROCID', 1)) - 1 if local_rank is None else local_rank
60 | 
61 |         hostname = os.environ.get('HOSTNAME', 'localhost')
62 |         num_gpus = len(os.environ.get('CUDA_VISIBLE_DEVICES', '').split(','))
63 |         node_rank = int(os.environ.get('LSF_PM_XMACHID', 1)) - 1
64 |         dist_rank = node_rank * num_gpus + local_rank if dist_rank is None else dist_rank
65 |         num_hosts = len(os.environ.get('LSB_MCPU_HOSTS', 'localhost cpus').split()) // 2
66 |         rng = np.random.RandomState(seed=int(os.environ.get('LSB_JOBID', 0)))
67 |         master_host = os.environ.get('LSF_FROM_HOST', 'localhost')
68 |         port = rng.randint(10000, 20000) if port is None else port
69 |         if num_hosts > 1:
70 |             fix_infiniband()
71 |         prefix = f'{hostname}, Local Rank {local_rank}/{num_gpus}, Global Rank {dist_rank}/{world_size}:'
72 | 
73 |         logger.info(f'{prefix} Trying to init process group')
74 |         logger.debug(f"{prefix} CUDA_VISIBLE_DEVICES={os.environ.get('CUDA_VISIBLE_DEVICES', '')}")
75 |         logger.debug(f"{prefix} LSF_PM_XMACHID={os.environ.get('LSF_PM_XMACHID', '')}")
76 |         logger.debug(f"{prefix} LSF_PM_XPROCID={os.environ.get('LSF_PM_XPROCID', '')}")
77 |         logger.debug(f"{prefix} LSB_MCPU_HOSTS={os.environ.get('LSB_MCPU_HOSTS', '')}")
78 |         logger.debug(f"{prefix} MASTER_ADDR={master_host}")
79 |         logger.debug(f"{prefix} MASTER_PORT={port}")
80 |     elif os.environ.get('SLURM_JOB_ID', False):
81 | 
82 |         num_gpus = len(os.environ.get('CUDA_VISIBLE_DEVICES', '0').split(','))
83 |         local_rank = int(os.environ.get('SLURM_PROCID', 0)) % num_gpus if local_rank is None else local_rank
84 |         node_rank = int(os.environ.get('SLURM_NODEID', 0))
85 | 
86 |         hostlist = subprocess.run(['scontrol', 'show', 'hostnames', os.environ.get('SLURM_JOB_NODELIST', 'localhost')],
87 |                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE)
88 |         hostlist = hostlist.stdout.decode('utf8').strip().split('\n')
89 |         num_hosts = len(hostlist)
90 |         master_host = hostlist[0]
91 |         hostname = os.environ.get('HOSTNAME', 'localhost')
92 |         dist_rank = node_rank * num_gpus + local_rank if dist_rank is None else dist_rank
93 |         rng = np.random.RandomState(seed=int(os.environ.get('SLURM_JOB_ID', 0)))
94 |         port = rng.randint(10000, 20000) if port is None else port
95 | 
96 |         prefix = f'{hostname}, Local Rank {local_rank}/{num_gpus}, Global Rank {dist_rank}/{world_size}:'
97 | 
98 |         logger.info(f'{prefix} Trying to init process group')
99 |         logger.debug(f"{prefix} CUDA_VISIBLE_DEVICES={os.environ.get('CUDA_VISIBLE_DEVICES', '')}")
100 |         logger.debug(f"{prefix} SLURM_NODEID={os.environ.get('SLURM_NODEID', '')}")
101 |         logger.debug(f"{prefix} SLURM_PROCID={os.environ.get('SLURM_PROCID', '')}")
102 |         logger.debug(f"{prefix} SLURM_JOB_NODELIST={os.environ.get('SLURM_JOB_NODELIST', '')}")
103 |         logger.debug(f"{prefix} MASTER_ADDR={master_host}")
104 |         logger.debug(f"{prefix} MASTER_PORT={port}")
105 | 
106 |     else:
107 |         return dist.init_process_group(backend='nccl', init_method='env://')
108 |     world_size = num_gpus * num_hosts if world_size is None else world_size
109 | 
110 |     if 'RANK' not in os.environ.keys() or overwrite_env_vars:
111 |         os.environ['RANK'] = str(dist_rank)
112 |     if 'LOCAL_RANK' not in os.environ.keys() or overwrite_env_vars:
113 |         os.environ['LOCAL_RANK'] = str(local_rank)
114 |     if 'NODE_RANK' not in os.environ.keys() or overwrite_env_vars:
115 |         os.environ['NODE_RANK'] = str(node_rank)
116 |     if 'MASTER_ADDR' not in os.environ.keys() or overwrite_env_vars:
117 |         os.environ['MASTER_ADDR'] = master_host
118 |     if 'WORLD_SIZE' not in os.environ.keys() or overwrite_env_vars:
119 |         os.environ['WORLD_SIZE'] = str(world_size)
120 | 
121 |     if port is not None:  # port is always set by this point (passed in or drawn from the job-seeded RNG)
122 |         os.environ['MASTER_PORT'] = str(port)
123 |     elif 'MASTER_PORT' not in os.environ.keys() or overwrite_env_vars:
124 | 
125 |         os.environ['MASTER_PORT'] = str(port)
126 | 
127 | 
128 |     group = dist.init_process_group(backend='nccl', init_method='env://')
129 | 
130 |     logger.info(f'{prefix} Done init process group')
131 |     return group
132 | 
--------------------------------------------------------------------------------
/larimar_base/pytorch_transformers/utils/sentencepiece_model_pb2_new.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by the protocol buffer compiler. DO NOT EDIT!
3 | # source: sentencepiece_model.proto 4 | """Generated protocol buffer code.""" 5 | from google.protobuf import descriptor as _descriptor 6 | from google.protobuf import descriptor_pool as _descriptor_pool 7 | from google.protobuf import symbol_database as _symbol_database 8 | from google.protobuf.internal import builder as _builder 9 | 10 | 11 | # @@protoc_insertion_point(imports) 12 | 13 | _sym_db = _symbol_database.Default() 14 | 15 | 16 | DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( 17 | b'\n\x19sentencepiece_model.proto\x12\rsentencepiece"\x80\x0c\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12#\n\x19pretokenization_delimiter\x18\x35 \x01(\t:\x00\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05\x12\x16\n\tbos_piece\x18. 
\x01(\t:\x03\x12\x17\n\teos_piece\x18/ \x01(\t:\x04\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03' 18 | ) 19 | 20 | _globals = globals() 21 | _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) 22 | _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "sentencepiece_model_pb2", _globals) 23 | if _descriptor._USE_C_DESCRIPTORS is False: 24 | DESCRIPTOR._options = None 25 | DESCRIPTOR._serialized_options = b"H\003" 26 | # (generated by protobuf compiler, but `_TRAINERSPEC` is not defined) 27 | # _TRAINERSPEC.fields_by_name["mining_sentence_size"]._options = None 28 | # _TRAINERSPEC.fields_by_name["mining_sentence_size"]._serialized_options = b"\030\001" 29 | # _TRAINERSPEC.fields_by_name["training_sentence_size"]._options = None 30 | # _TRAINERSPEC.fields_by_name["training_sentence_size"]._serialized_options = b"\030\001" 31 | _globals["_TRAINERSPEC"]._serialized_start = 45 32 | _globals["_TRAINERSPEC"]._serialized_end = 1581 33 | _globals["_TRAINERSPEC_MODELTYPE"]._serialized_start = 1517 34 | _globals["_TRAINERSPEC_MODELTYPE"]._serialized_end = 1570 35 | _globals["_NORMALIZERSPEC"]._serialized_start = 1584 36 | _globals["_NORMALIZERSPEC"]._serialized_end = 1793 37 | _globals["_SELFTESTDATA"]._serialized_start = 1795 38 | _globals["_SELFTESTDATA"]._serialized_end = 1916 39 | _globals["_SELFTESTDATA_SAMPLE"]._serialized_start = 1864 40 | _globals["_SELFTESTDATA_SAMPLE"]._serialized_end = 1905 41 | _globals["_MODELPROTO"]._serialized_start = 1919 42 | _globals["_MODELPROTO"]._serialized_end = 2429 43 | _globals["_MODELPROTO_SENTENCEPIECE"]._serialized_start = 2208 44 | _globals["_MODELPROTO_SENTENCEPIECE"]._serialized_end = 2418 45 | _globals["_MODELPROTO_SENTENCEPIECE_TYPE"]._serialized_start 
= 2323 46 | _globals["_MODELPROTO_SENTENCEPIECE_TYPE"]._serialized_end = 2407 47 | # @@protoc_insertion_point(module_scope) 48 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/tests/optimization_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import os 21 | 22 | import torch 23 | 24 | from pytorch_transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, 25 | WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) 26 | 27 | from .tokenization_tests_commons import TemporaryDirectory 28 | 29 | 30 | def unwrap_schedule(scheduler, num_steps=10): 31 | lrs = [] 32 | for _ in range(num_steps): 33 | scheduler.step() 34 | lrs.append(scheduler.get_lr()) 35 | return lrs 36 | 37 | def unwrap_and_save_reload_schedule(scheduler, num_steps=10): 38 | lrs = [] 39 | for step in range(num_steps): 40 | scheduler.step() 41 | lrs.append(scheduler.get_lr()) 42 | if step == num_steps // 2: 43 | with TemporaryDirectory() as tmpdirname: 44 | file_name = os.path.join(tmpdirname, 'schedule.bin') 45 | torch.save(scheduler.state_dict(), file_name) 46 | 47 | state_dict = torch.load(file_name) 48 | scheduler.load_state_dict(state_dict) 49 | return lrs 50 | 51 | class OptimizationTest(unittest.TestCase): 52 | 53 | def assertListAlmostEqual(self, list1, list2, tol): 54 | self.assertEqual(len(list1), len(list2)) 55 | for a, b in zip(list1, list2): 56 | self.assertAlmostEqual(a, b, delta=tol) 57 | 58 | def test_adam_w(self): 59 | w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True) 60 | target = torch.tensor([0.4, 0.2, -0.5]) 61 | criterion = torch.nn.MSELoss() 62 | # No warmup, constant schedule, no gradient clipping 63 | optimizer = AdamW(params=[w], lr=2e-1, weight_decay=0.0) 64 | for _ in range(100): 65 | loss = criterion(w, target) 66 | loss.backward() 67 | optimizer.step() 68 | w.grad.detach_() # No zero_grad() function on simple tensors. we do it ourselves. 69 | w.grad.zero_() 70 | self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2) 71 | 72 | 73 | class ScheduleInitTest(unittest.TestCase): 74 | m = torch.nn.Linear(50, 50) 75 | optimizer = AdamW(m.parameters(), lr=10.) 76 | num_steps = 10 77 | 78 | def assertListAlmostEqual(self, list1, list2, tol): 79 | self.assertEqual(len(list1), len(list2)) 80 | for a, b in zip(list1, list2): 81 | self.assertAlmostEqual(a, b, delta=tol) 82 | 83 | def test_constant_scheduler(self): 84 | scheduler = ConstantLRSchedule(self.optimizer) 85 | lrs = unwrap_schedule(scheduler, self.num_steps) 86 | expected_learning_rates = [10.] 
* self.num_steps 87 | self.assertEqual(len(lrs[0]), 1) 88 | self.assertListEqual([l[0] for l in lrs], expected_learning_rates) 89 | 90 | scheduler = ConstantLRSchedule(self.optimizer) 91 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 92 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 93 | 94 | def test_warmup_constant_scheduler(self): 95 | scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4) 96 | lrs = unwrap_schedule(scheduler, self.num_steps) 97 | expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0] 98 | self.assertEqual(len(lrs[0]), 1) 99 | self.assertListEqual([l[0] for l in lrs], expected_learning_rates) 100 | 101 | scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4) 102 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 103 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 104 | 105 | def test_warmup_linear_scheduler(self): 106 | scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10) 107 | lrs = unwrap_schedule(scheduler, self.num_steps) 108 | expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0] 109 | self.assertEqual(len(lrs[0]), 1) 110 | self.assertListEqual([l[0] for l in lrs], expected_learning_rates) 111 | 112 | scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10) 113 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 114 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 115 | 116 | def test_warmup_cosine_scheduler(self): 117 | scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10) 118 | lrs = unwrap_schedule(scheduler, self.num_steps) 119 | expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0] 120 | self.assertEqual(len(lrs[0]), 1) 121 | self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2) 122 | 123 | scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10) 124 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 125 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 126 | 127 | def test_warmup_cosine_hard_restart_scheduler(self): 128 | scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10) 129 | lrs = unwrap_schedule(scheduler, self.num_steps) 130 | expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0] 131 | self.assertEqual(len(lrs[0]), 1) 132 | self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2) 133 | 134 | scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10) 135 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 136 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 137 | 138 | if __name__ == "__main__": 139 | unittest.main() 140 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/configuration_bert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ BERT model configuration """
17 | 
18 | from __future__ import absolute_import, division, print_function, unicode_literals
19 | 
20 | import json
21 | import logging
22 | import sys
23 | from io import open
24 | 
25 | from .configuration_utils import PretrainedConfig
26 | 
27 | logger = logging.getLogger(__name__)
28 | 
29 | BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
30 |     'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
31 |     'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
32 |     'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
33 |     'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
34 |     'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
35 |     'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
36 |     'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
37 |     'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
38 |     'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
39 |     'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
40 |     'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
41 |     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
42 |     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
43 | }
44 | 
45 | 
46 | class BertConfig(PretrainedConfig):
47 |     r"""
48 |     :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a
49 |     `BertModel`.
50 | 
51 | 
52 |     Arguments:
53 |         vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `BertModel`.
54 |         hidden_size: Size of the encoder layers and the pooler layer.
55 |         num_hidden_layers: Number of hidden layers in the Transformer encoder.
56 |         num_attention_heads: Number of attention heads for each attention layer in
57 |             the Transformer encoder.
58 |         intermediate_size: The size of the "intermediate" (i.e., feed-forward)
59 |             layer in the Transformer encoder.
60 |         hidden_act: The non-linear activation function (function or string) in the
61 |             encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
62 |         hidden_dropout_prob: The dropout probability for all fully connected
63 |             layers in the embeddings, encoder, and pooler.
64 |         attention_probs_dropout_prob: The dropout ratio for the attention
65 |             probabilities.
66 |         max_position_embeddings: The maximum sequence length that this model might
67 |             ever be used with. Typically set this to something large just in case
68 |             (e.g., 512 or 1024 or 2048).
69 |         type_vocab_size: The vocabulary size of the `token_type_ids` passed into
70 |             `BertModel`.
71 |         initializer_range: The stddev of the truncated_normal_initializer for
72 |             initializing all weight matrices.
73 |         layer_norm_eps: The epsilon used by LayerNorm.
74 |     """
75 |     pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
76 | 
77 |     def __init__(self,
78 |                  vocab_size_or_config_json_file=28996,
79 |                  hidden_size=768,
80 |                  num_hidden_layers=12,
81 |                  num_attention_heads=12,
82 |                  intermediate_size=3072,
83 |                  hidden_act="gelu",
84 |                  hidden_dropout_prob=0.1,
85 |                  attention_probs_dropout_prob=0.1,
86 |                  max_position_embeddings=512,
87 |                  type_vocab_size=2,
88 |                  initializer_range=0.02,
89 |                  layer_norm_eps=1e-12,
90 |                  **kwargs):
91 |         super(BertConfig, self).__init__(**kwargs)
92 |         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
93 |                         and isinstance(vocab_size_or_config_json_file, unicode)):
94 |             with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
95 |                 json_config = json.loads(reader.read())
96 |             for key, value in json_config.items():
97 |                 self.__dict__[key] = value
98 |         elif isinstance(vocab_size_or_config_json_file, int):
99 |             self.vocab_size = vocab_size_or_config_json_file
100 |             self.hidden_size = hidden_size
101 |             self.num_hidden_layers = num_hidden_layers
102 |             self.num_attention_heads = num_attention_heads
103 |             self.hidden_act = hidden_act
104 |             self.intermediate_size = intermediate_size
105 |             self.hidden_dropout_prob = hidden_dropout_prob
106 |             self.attention_probs_dropout_prob = attention_probs_dropout_prob
107 |             self.max_position_embeddings = max_position_embeddings
108 |             self.type_vocab_size = type_vocab_size
109 |             self.initializer_range = initializer_range
110 |             self.layer_norm_eps = layer_norm_eps
111 |         else:
112 |             raise ValueError("First argument must be either a vocabulary size (int)"
113 |                              " or the path to a pretrained model config file (str)")
114 | 
--------------------------------------------------------------------------------
/larimar_base/pytorch_transformers/utils/dummy_sentencepiece_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
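# These classes are lazy placeholders: importing them always succeeds, and the
# dependency check is deferred to instantiation, where `requires_backends`
# raises an informative ImportError telling the user to install the
# "sentencepiece" backend.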
2 | from ..utils import DummyObject, requires_backends 3 | 4 | 5 | class AlbertTokenizer(metaclass=DummyObject): 6 | _backends = ["sentencepiece"] 7 | 8 | def __init__(self, *args, **kwargs): 9 | requires_backends(self, ["sentencepiece"]) 10 | 11 | 12 | class BarthezTokenizer(metaclass=DummyObject): 13 | _backends = ["sentencepiece"] 14 | 15 | def __init__(self, *args, **kwargs): 16 | requires_backends(self, ["sentencepiece"]) 17 | 18 | 19 | class BartphoTokenizer(metaclass=DummyObject): 20 | _backends = ["sentencepiece"] 21 | 22 | def __init__(self, *args, **kwargs): 23 | requires_backends(self, ["sentencepiece"]) 24 | 25 | 26 | class BertGenerationTokenizer(metaclass=DummyObject): 27 | _backends = ["sentencepiece"] 28 | 29 | def __init__(self, *args, **kwargs): 30 | requires_backends(self, ["sentencepiece"]) 31 | 32 | 33 | class BigBirdTokenizer(metaclass=DummyObject): 34 | _backends = ["sentencepiece"] 35 | 36 | def __init__(self, *args, **kwargs): 37 | requires_backends(self, ["sentencepiece"]) 38 | 39 | 40 | class CamembertTokenizer(metaclass=DummyObject): 41 | _backends = ["sentencepiece"] 42 | 43 | def __init__(self, *args, **kwargs): 44 | requires_backends(self, ["sentencepiece"]) 45 | 46 | 47 | class CodeLlamaTokenizer(metaclass=DummyObject): 48 | _backends = ["sentencepiece"] 49 | 50 | def __init__(self, *args, **kwargs): 51 | requires_backends(self, ["sentencepiece"]) 52 | 53 | 54 | class CpmTokenizer(metaclass=DummyObject): 55 | _backends = ["sentencepiece"] 56 | 57 | def __init__(self, *args, **kwargs): 58 | requires_backends(self, ["sentencepiece"]) 59 | 60 | 61 | class DebertaV2Tokenizer(metaclass=DummyObject): 62 | _backends = ["sentencepiece"] 63 | 64 | def __init__(self, *args, **kwargs): 65 | requires_backends(self, ["sentencepiece"]) 66 | 67 | 68 | class ErnieMTokenizer(metaclass=DummyObject): 69 | _backends = ["sentencepiece"] 70 | 71 | def __init__(self, *args, **kwargs): 72 | requires_backends(self, ["sentencepiece"]) 73 | 74 | 75 | class FNetTokenizer(metaclass=DummyObject): 76 | _backends = ["sentencepiece"] 77 | 78 | def __init__(self, *args, **kwargs): 79 | requires_backends(self, ["sentencepiece"]) 80 | 81 | 82 | class GPTSw3Tokenizer(metaclass=DummyObject): 83 | _backends = ["sentencepiece"] 84 | 85 | def __init__(self, *args, **kwargs): 86 | requires_backends(self, ["sentencepiece"]) 87 | 88 | 89 | class LayoutXLMTokenizer(metaclass=DummyObject): 90 | _backends = ["sentencepiece"] 91 | 92 | def __init__(self, *args, **kwargs): 93 | requires_backends(self, ["sentencepiece"]) 94 | 95 | 96 | class LlamaTokenizer(metaclass=DummyObject): 97 | _backends = ["sentencepiece"] 98 | 99 | def __init__(self, *args, **kwargs): 100 | requires_backends(self, ["sentencepiece"]) 101 | 102 | 103 | class M2M100Tokenizer(metaclass=DummyObject): 104 | _backends = ["sentencepiece"] 105 | 106 | def __init__(self, *args, **kwargs): 107 | requires_backends(self, ["sentencepiece"]) 108 | 109 | 110 | class MarianTokenizer(metaclass=DummyObject): 111 | _backends = ["sentencepiece"] 112 | 113 | def __init__(self, *args, **kwargs): 114 | requires_backends(self, ["sentencepiece"]) 115 | 116 | 117 | class MBart50Tokenizer(metaclass=DummyObject): 118 | _backends = ["sentencepiece"] 119 | 120 | def __init__(self, *args, **kwargs): 121 | requires_backends(self, ["sentencepiece"]) 122 | 123 | 124 | class MBartTokenizer(metaclass=DummyObject): 125 | _backends = ["sentencepiece"] 126 | 127 | def __init__(self, *args, **kwargs): 128 | requires_backends(self, ["sentencepiece"]) 129 | 130 | 131 | 
class MLukeTokenizer(metaclass=DummyObject): 132 | _backends = ["sentencepiece"] 133 | 134 | def __init__(self, *args, **kwargs): 135 | requires_backends(self, ["sentencepiece"]) 136 | 137 | 138 | class MT5Tokenizer(metaclass=DummyObject): 139 | _backends = ["sentencepiece"] 140 | 141 | def __init__(self, *args, **kwargs): 142 | requires_backends(self, ["sentencepiece"]) 143 | 144 | 145 | class NllbTokenizer(metaclass=DummyObject): 146 | _backends = ["sentencepiece"] 147 | 148 | def __init__(self, *args, **kwargs): 149 | requires_backends(self, ["sentencepiece"]) 150 | 151 | 152 | class PegasusTokenizer(metaclass=DummyObject): 153 | _backends = ["sentencepiece"] 154 | 155 | def __init__(self, *args, **kwargs): 156 | requires_backends(self, ["sentencepiece"]) 157 | 158 | 159 | class PLBartTokenizer(metaclass=DummyObject): 160 | _backends = ["sentencepiece"] 161 | 162 | def __init__(self, *args, **kwargs): 163 | requires_backends(self, ["sentencepiece"]) 164 | 165 | 166 | class ReformerTokenizer(metaclass=DummyObject): 167 | _backends = ["sentencepiece"] 168 | 169 | def __init__(self, *args, **kwargs): 170 | requires_backends(self, ["sentencepiece"]) 171 | 172 | 173 | class RemBertTokenizer(metaclass=DummyObject): 174 | _backends = ["sentencepiece"] 175 | 176 | def __init__(self, *args, **kwargs): 177 | requires_backends(self, ["sentencepiece"]) 178 | 179 | 180 | class Speech2TextTokenizer(metaclass=DummyObject): 181 | _backends = ["sentencepiece"] 182 | 183 | def __init__(self, *args, **kwargs): 184 | requires_backends(self, ["sentencepiece"]) 185 | 186 | 187 | class SpeechT5Tokenizer(metaclass=DummyObject): 188 | _backends = ["sentencepiece"] 189 | 190 | def __init__(self, *args, **kwargs): 191 | requires_backends(self, ["sentencepiece"]) 192 | 193 | 194 | class T5Tokenizer(metaclass=DummyObject): 195 | _backends = ["sentencepiece"] 196 | 197 | def __init__(self, *args, **kwargs): 198 | requires_backends(self, ["sentencepiece"]) 199 | 200 | 201 | class XGLMTokenizer(metaclass=DummyObject): 202 | _backends = ["sentencepiece"] 203 | 204 | def __init__(self, *args, **kwargs): 205 | requires_backends(self, ["sentencepiece"]) 206 | 207 | 208 | class XLMProphetNetTokenizer(metaclass=DummyObject): 209 | _backends = ["sentencepiece"] 210 | 211 | def __init__(self, *args, **kwargs): 212 | requires_backends(self, ["sentencepiece"]) 213 | 214 | 215 | class XLMRobertaTokenizer(metaclass=DummyObject): 216 | _backends = ["sentencepiece"] 217 | 218 | def __init__(self, *args, **kwargs): 219 | requires_backends(self, ["sentencepiece"]) 220 | 221 | 222 | class XLNetTokenizer(metaclass=DummyObject): 223 | _backends = ["sentencepiece"] 224 | 225 | def __init__(self, *args, **kwargs): 226 | requires_backends(self, ["sentencepiece"]) 227 | -------------------------------------------------------------------------------- /larimar_base/pytorch_transformers/generation/stopping_criteria.py: -------------------------------------------------------------------------------- 1 | import time 2 | import warnings 3 | from abc import ABC 4 | from copy import deepcopy 5 | from typing import Optional 6 | 7 | import torch 8 | 9 | from ..utils import add_start_docstrings, logging 10 | 11 | 12 | logger = logging.get_logger(__name__) 13 | 14 | 15 | STOPPING_CRITERIA_INPUTS_DOCSTRING = r""" 16 | Args: 17 | input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): 18 | Indices of input sequence tokens in the vocabulary. 19 | 20 | Indices can be obtained using [`AutoTokenizer`]. 
See [`PreTrainedTokenizer.encode`] and
21 |             [`PreTrainedTokenizer.__call__`] for details.
22 | 
23 |             [What are input IDs?](../glossary#input-ids)
24 |         scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
25 |             Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
26 |             or scores for each vocabulary token after SoftMax.
27 |         kwargs (`Dict[str, Any]`, *optional*):
28 |             Additional stopping-criteria-specific kwargs.
29 | 
30 |     Return:
31 |         `bool`. `False` indicates we should continue, `True` indicates we should stop.
32 | 
33 | """
34 | 
35 | 
36 | class StoppingCriteria(ABC):
37 |     """Abstract base class for all stopping criteria that can be applied during generation."""
38 | 
39 |     @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
40 |     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
41 |         raise NotImplementedError("StoppingCriteria needs to be subclassed")
42 | 
43 | 
44 | class MaxLengthCriteria(StoppingCriteria):
45 |     """
46 |     This class can be used to stop generation whenever the full generated number of tokens exceeds `max_length`. Keep
47 |     in mind that for decoder-only transformers this count includes the initial prompt tokens.
48 | 
49 |     Args:
50 |         max_length (`int`):
51 |             The maximum length that the output sequence can have in number of tokens.
52 |         max_position_embeddings (`int`, *optional*):
53 |             The maximum model length, as defined by the model's `config.max_position_embeddings` attribute.
54 |     """
55 | 
56 |     def __init__(self, max_length: int, max_position_embeddings: Optional[int] = None):
57 |         self.max_length = max_length
58 |         self.max_position_embeddings = max_position_embeddings
59 | 
60 |     @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
61 |     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
62 |         cur_len = input_ids.shape[-1]
63 |         is_done = cur_len >= self.max_length
64 |         if self.max_position_embeddings is not None and not is_done and cur_len >= self.max_position_embeddings:
65 |             logger.warning_once(
66 |                 "This is a friendly reminder - the current text generation call will exceed the model's predefined "
67 |                 f"maximum length ({self.max_position_embeddings}). Depending on the model, you may observe "
68 |                 "exceptions, performance degradation, or nothing at all."
69 |             )
70 |         return is_done
71 | 
72 | 
73 | class MaxNewTokensCriteria(StoppingCriteria):
74 |     """
75 |     This class can be used to stop generation whenever the generated number of tokens exceeds `max_new_tokens`. Keep in
76 |     mind that for decoder-only transformers this count does **not** include the initial prompt tokens. This is very
77 |     close to `MaxLengthCriteria` but ignores the number of initial tokens.
78 | 
79 |     Args:
80 |         start_length (`int`):
81 |             The number of initial tokens.
82 |         max_new_tokens (`int`):
83 |             The maximum number of tokens to generate.
84 |     """
85 | 
86 |     def __init__(self, start_length: int, max_new_tokens: int):
87 |         warnings.warn(
88 |             "The class `MaxNewTokensCriteria` is deprecated. "
89 |             f"Please use `MaxLengthCriteria(max_length={start_length + max_new_tokens})` "
90 |             "with `max_length = start_length + max_new_tokens` instead.",
91 |             FutureWarning,
92 |         )
93 |         self.start_length = start_length
94 |         self.max_new_tokens = max_new_tokens
95 |         self.max_length = start_length + max_new_tokens
96 | 
97 |     @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
98 |     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
99 |         return input_ids.shape[-1] >= self.max_length
100 | 
101 | 
102 | class MaxTimeCriteria(StoppingCriteria):
103 |     """
104 |     This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the
105 |     time will start being counted when you initialize this object. You can override this by passing an
106 |     `initial_timestamp`.
107 | 
108 |     Args:
109 |         max_time (`float`):
110 |             The maximum allowed time in seconds for the generation.
111 |         initial_timestamp (`float`, *optional*, defaults to `time.time()`):
112 |             The start of the generation allowed time.
113 |     """
114 | 
115 |     def __init__(self, max_time: float, initial_timestamp: Optional[float] = None):
116 |         self.max_time = max_time
117 |         self.initial_timestamp = time.time() if initial_timestamp is None else initial_timestamp
118 | 
119 |     @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
120 |     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
121 |         return time.time() - self.initial_timestamp > self.max_time
122 | 
123 | 
124 | class StoppingCriteriaList(list):
125 |     @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
126 |     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
127 |         return any(criteria(input_ids, scores, **kwargs) for criteria in self)
128 | 
129 |     @property
130 |     def max_length(self) -> Optional[int]:
131 |         for stopping_criterium in self:
132 |             if isinstance(stopping_criterium, MaxLengthCriteria):
133 |                 return stopping_criterium.max_length
134 |             elif isinstance(stopping_criterium, MaxNewTokensCriteria):
135 |                 return stopping_criterium.max_length
136 |         return None
137 | 
138 | 
139 | def validate_stopping_criteria(stopping_criteria: StoppingCriteriaList, max_length: int) -> StoppingCriteriaList:
140 |     stopping_max_length = stopping_criteria.max_length
141 |     new_stopping_criteria = deepcopy(stopping_criteria)
142 |     if stopping_max_length is not None and stopping_max_length != max_length:
143 |         warnings.warn("You set different `max_length` for stopping criteria and `max_length` parameter", UserWarning)
144 |     elif stopping_max_length is None:
145 |         new_stopping_criteria.append(MaxLengthCriteria(max_length=max_length))
146 |     return new_stopping_criteria
147 | 
--------------------------------------------------------------------------------