├── SentEval ├── senteval │ ├── tools │ │ ├── __init__.py │ │ └── relatedness.py │ ├── __init__.py │ ├── utils.py │ ├── trec.py │ ├── binary.py │ ├── sst.py │ ├── mrpc.py │ ├── snli.py │ ├── rank.py │ └── engine.py ├── data │ └── downstream │ │ └── download_dataset.sh ├── .gitignore ├── setup.py ├── LICENSE └── examples │ ├── skipthought.py │ ├── googleuse.py │ ├── gensen.py │ ├── infersent.py │ └── bow.py ├── sentence_transformers_congen ├── losses │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── ConGenLoss.cpython-36.pyc │ └── ConGenLoss.py ├── __init__.py ├── __pycache__ │ ├── util.cpython-36.pyc │ ├── __init__.cpython-36.pyc │ ├── SentenceTransformer.cpython-36.pyc │ └── model_card_templates.cpython-36.pyc ├── models │ ├── __pycache__ │ │ ├── BoW.cpython-36.pyc │ │ ├── BoW.cpython-39.pyc │ │ ├── CNN.cpython-36.pyc │ │ ├── CNN.cpython-39.pyc │ │ ├── Asym.cpython-36.pyc │ │ ├── Asym.cpython-39.pyc │ │ ├── Dense.cpython-36.pyc │ │ ├── Dense.cpython-39.pyc │ │ ├── LSTM.cpython-36.pyc │ │ ├── LSTM.cpython-39.pyc │ │ ├── Dropout.cpython-36.pyc │ │ ├── Dropout.cpython-39.pyc │ │ ├── Pooling.cpython-36.pyc │ │ ├── Pooling.cpython-39.pyc │ │ ├── __init__.cpython-36.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── CLIPModel.cpython-36.pyc │ │ ├── CLIPModel.cpython-39.pyc │ │ ├── LayerNorm.cpython-36.pyc │ │ ├── LayerNorm.cpython-39.pyc │ │ ├── Normalize.cpython-36.pyc │ │ ├── Normalize.cpython-39.pyc │ │ ├── Transformer.cpython-36.pyc │ │ ├── Transformer.cpython-39.pyc │ │ ├── WordWeights.cpython-36.pyc │ │ ├── WordWeights.cpython-39.pyc │ │ ├── WordEmbeddings.cpython-36.pyc │ │ ├── WordEmbeddings.cpython-39.pyc │ │ ├── WeightedLayerPooling.cpython-36.pyc │ │ └── WeightedLayerPooling.cpython-39.pyc │ ├── tokenizer │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── __init__.cpython-39.pyc │ │ │ ├── WordTokenizer.cpython-36.pyc │ │ │ ├── WordTokenizer.cpython-39.pyc │ │ │ ├── PhraseTokenizer.cpython-36.pyc │ │ │ ├── PhraseTokenizer.cpython-39.pyc │ │ │ ├── WhitespaceTokenizer.cpython-36.pyc │ │ │ └── WhitespaceTokenizer.cpython-39.pyc │ │ ├── __init__.py │ │ ├── WhitespaceTokenizer.py │ │ ├── WordTokenizer.py │ │ └── PhraseTokenizer.py │ ├── __init__.py │ ├── Normalize.py │ ├── Dropout.py │ ├── LayerNorm.py │ ├── WeightedLayerPooling.py │ ├── LSTM.py │ ├── Dense.py │ ├── CNN.py │ ├── CLIPModel.py │ ├── WordWeights.py │ ├── BoW.py │ ├── T5.py │ ├── Asym.py │ ├── Pooling.py │ ├── WordEmbeddings.py │ └── Transformer.py ├── readers │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── InputExample.cpython-36.pyc │ │ ├── InputExample.cpython-37.pyc │ │ ├── InputExample.cpython-39.pyc │ │ ├── NLIDataReader.cpython-36.pyc │ │ ├── NLIDataReader.cpython-37.pyc │ │ ├── NLIDataReader.cpython-39.pyc │ │ ├── STSDataReader.cpython-36.pyc │ │ ├── STSDataReader.cpython-37.pyc │ │ ├── STSDataReader.cpython-39.pyc │ │ ├── TripletReader.cpython-36.pyc │ │ ├── TripletReader.cpython-37.pyc │ │ ├── TripletReader.cpython-39.pyc │ │ ├── LabelSentenceReader.cpython-36.pyc │ │ ├── LabelSentenceReader.cpython-37.pyc │ │ └── LabelSentenceReader.cpython-39.pyc │ ├── __init__.py │ ├── InputExample.py │ ├── PairedFilesReader.py │ ├── LabelSentenceReader.py │ ├── TripletReader.py │ ├── NLIDataReader.py │ └── STSDataReader.py ├── evaluation │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── MSEEvaluator.cpython-36.pyc │ │ ├── MSEEvaluator.cpython-39.pyc │ │ 
├── SentenceEvaluator.cpython-36.pyc │ │ ├── SentenceEvaluator.cpython-37.pyc │ │ ├── SentenceEvaluator.cpython-39.pyc │ │ ├── TripletEvaluator.cpython-36.pyc │ │ ├── TripletEvaluator.cpython-39.pyc │ │ ├── RerankingEvaluator.cpython-36.pyc │ │ ├── RerankingEvaluator.cpython-39.pyc │ │ ├── SequentialEvaluator.cpython-36.pyc │ │ ├── SequentialEvaluator.cpython-39.pyc │ │ ├── SimilarityFunction.cpython-36.pyc │ │ ├── SimilarityFunction.cpython-37.pyc │ │ ├── SimilarityFunction.cpython-39.pyc │ │ ├── LabelAccuracyEvaluator.cpython-36.pyc │ │ ├── LabelAccuracyEvaluator.cpython-39.pyc │ │ ├── TranslationEvaluator.cpython-36.pyc │ │ ├── TranslationEvaluator.cpython-39.pyc │ │ ├── MSEEvaluatorFromDataFrame.cpython-36.pyc │ │ ├── MSEEvaluatorFromDataFrame.cpython-39.pyc │ │ ├── ParaphraseMiningEvaluator.cpython-36.pyc │ │ ├── ParaphraseMiningEvaluator.cpython-39.pyc │ │ ├── BinaryClassificationEvaluator.cpython-36.pyc │ │ ├── BinaryClassificationEvaluator.cpython-37.pyc │ │ ├── BinaryClassificationEvaluator.cpython-39.pyc │ │ ├── EmbeddingSimilarityEvaluator.cpython-36.pyc │ │ ├── EmbeddingSimilarityEvaluator.cpython-39.pyc │ │ ├── InformationRetrievalEvaluator.cpython-36.pyc │ │ └── InformationRetrievalEvaluator.cpython-39.pyc │ ├── SimilarityFunction.py │ ├── __init__.py │ ├── SequentialEvaluator.py │ ├── SentenceEvaluator.py │ ├── LabelAccuracyEvaluator.py │ ├── MSEEvaluator.py │ ├── MSEEvaluatorFromDataFrame.py │ ├── TranslationEvaluator.py │ ├── RerankingEvaluator.py │ ├── EmbeddingSimilarityEvaluator.py │ └── TripletEvaluator.py └── model_card_templates.py ├── requirements.txt ├── ConGen__Unsupervised_Control_and_Generalization_Distillation_For_Sentence_Representation.pdf ├── setup.py ├── train_congen.sh └── evaluation.py /SentEval/senteval/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sentence_transformers_congen/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .ConGenLoss import * -------------------------------------------------------------------------------- /sentence_transformers_congen/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" 2 | from .SentenceTransformer import SentenceTransformer -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.8.1 2 | transformers==4.9.0 3 | sentence-transformers==2.0.0 4 | tensorflow==2.5.0 5 | protobuf==3.20.* 6 | -------------------------------------------------------------------------------- /SentEval/data/downstream/download_dataset.sh: -------------------------------------------------------------------------------- 1 | wget https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/senteval.tar 2 | tar xvf senteval.tar 3 | -------------------------------------------------------------------------------- /sentence_transformers_congen/__pycache__/util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/__pycache__/util.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/__pycache__/__init__.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/BoW.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/BoW.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/BoW.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/BoW.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/CNN.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/CNN.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/CNN.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/CNN.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Asym.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Asym.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Asym.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Asym.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Dense.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Dense.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Dense.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Dense.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/LSTM.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/LSTM.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/LSTM.cpython-39.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/LSTM.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/losses/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/losses/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Dropout.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Dropout.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Dropout.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Dropout.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Pooling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Pooling.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Pooling.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Pooling.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/losses/__pycache__/ConGenLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/losses/__pycache__/ConGenLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/CLIPModel.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/CLIPModel.cpython-36.pyc -------------------------------------------------------------------------------- 
/sentence_transformers_congen/models/__pycache__/CLIPModel.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/CLIPModel.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/LayerNorm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/LayerNorm.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/LayerNorm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/LayerNorm.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Normalize.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Normalize.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Normalize.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Normalize.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Transformer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Transformer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Transformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Transformer.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/WordWeights.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/WordWeights.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/WordWeights.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/WordWeights.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/__pycache__/SentenceTransformer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/__pycache__/SentenceTransformer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/__pycache__/model_card_templates.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/__pycache__/model_card_templates.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/InputExample.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/InputExample.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/InputExample.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/InputExample.cpython-37.pyc -------------------------------------------------------------------------------- 
/sentence_transformers_congen/readers/__pycache__/InputExample.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/InputExample.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/MSEEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/MSEEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/MSEEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/MSEEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/WordEmbeddings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/WordEmbeddings.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/WordEmbeddings.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/WordEmbeddings.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/NLIDataReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/NLIDataReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/NLIDataReader.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/NLIDataReader.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/NLIDataReader.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/NLIDataReader.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/STSDataReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/STSDataReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/STSDataReader.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/STSDataReader.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/STSDataReader.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/STSDataReader.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/TripletReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/TripletReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/TripletReader.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/TripletReader.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/TripletReader.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/TripletReader.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SentenceEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SentenceEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SentenceEvaluator.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SentenceEvaluator.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SentenceEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SentenceEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/TripletEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/TripletEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/TripletEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/TripletEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/WeightedLayerPooling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/WeightedLayerPooling.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/WeightedLayerPooling.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/WeightedLayerPooling.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/LabelSentenceReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/LabelSentenceReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/LabelSentenceReader.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/LabelSentenceReader.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/LabelSentenceReader.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/LabelSentenceReader.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/SimilarityFunction.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class SimilarityFunction(Enum): 4 | COSINE = 0 5 | EUCLIDEAN = 1 6 | MANHATTAN = 2 7 | DOT_PRODUCT = 3 8 | 9 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/RerankingEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/RerankingEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/RerankingEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/RerankingEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SequentialEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SequentialEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SequentialEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SequentialEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SimilarityFunction.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SimilarityFunction.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SimilarityFunction.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SimilarityFunction.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SimilarityFunction.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SimilarityFunction.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/WordTokenizer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/WordTokenizer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/WordTokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/WordTokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/TranslationEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/TranslationEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/TranslationEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/TranslationEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 2 | from .WhitespaceTokenizer import WhitespaceTokenizer 3 | from .PhraseTokenizer import PhraseTokenizer 4 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/PhraseTokenizer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/PhraseTokenizer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/PhraseTokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/PhraseTokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /ConGen__Unsupervised_Control_and_Generalization_Distillation_For_Sentence_Representation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/ConGen__Unsupervised_Control_and_Generalization_Distillation_For_Sentence_Representation.pdf -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-39.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-36.pyc 
-------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /SentEval/.gitignore: -------------------------------------------------------------------------------- 1 | # SentEval data and .pyc files 2 | 3 | 4 | 5 | # python 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # log files 11 | *.log 12 | *.txt 13 | 14 | # data files 15 | data/senteval_data* 16 | data/downstream/ 17 | -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__init__.py: -------------------------------------------------------------------------------- 1 | from .InputExample import InputExample 2 | from .LabelSentenceReader import LabelSentenceReader 3 | from .NLIDataReader import NLIDataReader 4 | from .STSDataReader import STSDataReader, STSBenchmarkDataReader 5 | from .TripletReader import TripletReader -------------------------------------------------------------------------------- /SentEval/senteval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | from __future__ import absolute_import 9 | 10 | from senteval.engine import SE 11 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .Transformer import Transformer 2 | from .Asym import Asym 3 | from .BoW import BoW 4 | from .CNN import CNN 5 | from .Dense import Dense 6 | from .Dropout import Dropout 7 | from .LayerNorm import LayerNorm 8 | from .LSTM import LSTM 9 | from .Normalize import Normalize 10 | from .Pooling import Pooling 11 | from .WeightedLayerPooling import WeightedLayerPooling 12 | from .WordEmbeddings import WordEmbeddings 13 | from .WordWeights import WordWeights 14 | from .CLIPModel import CLIPModel 15 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", mode="r", encoding="utf-8") as readme_file: 4 | readme = readme_file.read() 5 | 6 | 7 | 8 | setup( 9 | name="congen-sbert", 10 | version="1.0.0", 11 | author=" ", 12 | author_email=" ", 13 | description="Sentence representation with SBERT", 14 | long_description=readme, 15 | long_description_content_type="text/markdown", 16 | packages=find_packages(), 17 | install_requires=[ 18 | "torch==1.8.1", 19 | "transformers==4.9.0", 20 | "sentence-transformers==2.0.0", 21 | ], 22 | ) 23 | -------------------------------------------------------------------------------- /SentEval/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | import io 9 | from setuptools import setup, find_packages 10 | 11 | with io.open('./README.md', encoding='utf-8') as f: 12 | readme = f.read() 13 | 14 | setup( 15 | name='SentEval', 16 | version='0.1.0', 17 | url='https://github.com/facebookresearch/SentEval', 18 | packages=find_packages(exclude=['examples']), 19 | license='Attribution-NonCommercial 4.0 International', 20 | long_description=readme, 21 | ) 22 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/Normalize.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from typing import Dict 4 | import torch.nn.functional as F 5 | 6 | class Normalize(nn.Module): 7 | """ 8 | This layer normalizes embeddings to unit length 9 | """ 10 | def __init__(self): 11 | super(Normalize, self).__init__() 12 | 13 | def forward(self, features: Dict[str, Tensor]): 14 | features.update({'sentence_embedding': F.normalize(features['sentence_embedding'], p=2, dim=1)}) 15 | return features 16 | 17 | def save(self, output_path): 18 | pass 19 | 20 | @staticmethod 21 | def load(input_path): 22 | return Normalize() 23 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .SentenceEvaluator import SentenceEvaluator 2 | from .SimilarityFunction import SimilarityFunction 3 | from .BinaryClassificationEvaluator import BinaryClassificationEvaluator 4 | from .EmbeddingSimilarityEvaluator import EmbeddingSimilarityEvaluator 5 | from .InformationRetrievalEvaluator import InformationRetrievalEvaluator 6 | from .LabelAccuracyEvaluator import LabelAccuracyEvaluator 7 | from .MSEEvaluator import MSEEvaluator 8 | from .MSEEvaluatorFromDataFrame import MSEEvaluatorFromDataFrame 9 | from .ParaphraseMiningEvaluator import ParaphraseMiningEvaluator 10 | from .SequentialEvaluator import SequentialEvaluator 11 | from .TranslationEvaluator import TranslationEvaluator 12 | from .TripletEvaluator import TripletEvaluator 13 | from .RerankingEvaluator import RerankingEvaluator -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/InputExample.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | 4 | class InputExample: 5 | """ 6 | Structure for one input example with texts, the label and a unique id 7 | """ 8 | def __init__(self, guid: str = '', texts: List[str] = None, label: Union[int, float] = 0): 9 | """ 10 | Creates one InputExample with the given texts, guid and label 11 | 12 | 13 | :param guid 14 | id for the example 15 | :param texts 16 | the texts for the example. Note, str.strip() is called on the texts 17 | :param label 18 | the label for the example 19 | """ 20 | self.guid = guid 21 | self.texts = texts 22 | self.label = label 23 | 24 | def __str__(self): 25 | return " label: {}, texts: {}".format(str(self.label), "; ".join(self.texts)) -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/SequentialEvaluator.py: -------------------------------------------------------------------------------- 1 | from . 
import SentenceEvaluator 2 | from typing import Iterable 3 | 4 | class SequentialEvaluator(SentenceEvaluator): 5 | """ 6 | This evaluator allows multiple sub-evaluators to be passed. When the model is evaluated, 7 | the data is passed sequentially to all sub-evaluators. 8 | 9 | All scores are passed to 'main_score_function', which derives one final score value 10 | """ 11 | def __init__(self, evaluators: Iterable[SentenceEvaluator], main_score_function = lambda scores: scores[-1]): 12 | self.evaluators = evaluators 13 | self.main_score_function = main_score_function 14 | 15 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 16 | scores = [] 17 | for evaluator in self.evaluators: 18 | scores.append(evaluator(model, output_path, epoch, steps)) 19 | 20 | return self.main_score_function(scores) 21 | -------------------------------------------------------------------------------- /train_congen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # please see Appendix A.1 in our paper or https://github.com/KornWtp/ConGen#parameters for the full setup 4 | # GPU device number, passed as the first argument of this script (e.g., bash train_congen.sh 0). 5 | export CUDA_VISIBLE_DEVICES=$1 6 | # Teacher: princeton-nlp/unsup-simcse-roberta-large (default); student: a compressed or large model, e.g., nreimers/BERT-Tiny_L-2_H-128_A-2. 7 | # Train data: https://drive.google.com/file/d/19O2NArJz_RlVNNGRbBnnWxNMW-7HaFZ8/view?usp=sharing 8 | # Dev data: STS-B dev set. For learning_rate, teacher_temp, student_temp and queue_size, see https://github.com/KornWtp/ConGen#parameters 9 | python main.py \ 10 | --model_save_path "your-output-model-path" \ 11 | --teacher_model_name_or_path princeton-nlp/unsup-simcse-roberta-large \ 12 | --student_model_name_or_path nreimers/BERT-Tiny_L-2_H-128_A-2 \ 13 | --train_data_path "your-train-data-path" \ 14 | --dev_data_path "your-validation-data-path" \ 15 | --train_batch_size 128 \ 16 | --eval_batch_size 128 \ 17 | --max_seq_length 128 \ 18 | --num_epochs 20 \ 19 | --learning_rate 5e-4 \ 20 | --teacher_temp 0.05 \ 21 | --student_temp 0.07 \ 22 | --queue_size 65536 23 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/Dropout.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Dict 5 | import os 6 | import json 7 | 8 | 9 | class Dropout(nn.Module): 10 | """Dropout layer. 11 | 12 | :param dropout: Dropout probability applied to the sentence embedding.
13 | """ 14 | def __init__(self, dropout: float = 0.2): 15 | super(Dropout, self).__init__() 16 | self.dropout = dropout 17 | self.dropout_layer = nn.Dropout(self.dropout) 18 | 19 | def forward(self, features: Dict[str, Tensor]): 20 | features.update({'sentence_embedding': self.dropout_layer(features['sentence_embedding'])}) 21 | return features 22 | 23 | def save(self, output_path): 24 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 25 | json.dump({'dropout': self.dropout}, fOut) 26 | 27 | 28 | 29 | @staticmethod 30 | def load(input_path): 31 | with open(os.path.join(input_path, 'config.json')) as fIn: 32 | config = json.load(fIn) 33 | 34 | model = Dropout(**config) 35 | return model 36 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/SentenceEvaluator.py: -------------------------------------------------------------------------------- 1 | class SentenceEvaluator: 2 | """ 3 | Base class for all evaluators 4 | 5 | Extend this class and implement __call__ for custom evaluators. 6 | """ 7 | 8 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 9 | """ 10 | This is called during training to evaluate the model. 11 | It returns a score for the evaluation with a higher score indicating a better result. 12 | 13 | :param model: 14 | the model to evaluate 15 | :param output_path: 16 | path where predictions and metrics are written to 17 | :param epoch 18 | the epoch where the evaluation takes place. 19 | This is used for the file prefixes. 20 | If this is -1, then we assume evaluation on test data. 21 | :param steps 22 | the steps in the current epoch at time of the evaluation. 23 | This is used for the file prefixes. 24 | If this is -1, then we assume evaluation at the end of the epoch. 25 | :return: a score for the evaluation with a higher score indicating a better result 26 | """ 27 | pass 28 | -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/PairedFilesReader.py: -------------------------------------------------------------------------------- 1 | from . 
import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | 7 | class PairedFilesReader(object): 8 | """ 9 | Reads in a pair dataset that is split across two files 10 | """ 11 | def __init__(self, filepaths): 12 | self.filepaths = filepaths 13 | 14 | 15 | def get_examples(self, max_examples=0): 16 | """Read the files line by line in parallel and pair the aligned lines into InputExamples.""" 17 | 18 | fIns = [] 19 | for filepath in self.filepaths: 20 | fIn = gzip.open(filepath, 'rt', encoding='utf-8') if filepath.endswith('.gz') else open(filepath, encoding='utf-8') 21 | fIns.append(fIn) 22 | 23 | examples = [] 24 | 25 | eof = False 26 | while not eof: 27 | texts = [] 28 | for fIn in fIns: 29 | text = fIn.readline() 30 | 31 | if text == '': 32 | eof = True 33 | break 34 | 35 | texts.append(text) 36 | 37 | if eof: 38 | break 39 | 40 | examples.append(InputExample(guid=str(len(examples)), texts=texts, label=1)) 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples -------------------------------------------------------------------------------- /sentence_transformers_congen/models/LayerNorm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class LayerNorm(nn.Module): 10 | def __init__(self, dimension: int): 11 | super(LayerNorm, self).__init__() 12 | self.dimension = dimension 13 | self.norm = nn.LayerNorm(dimension) 14 | 15 | 16 | def forward(self, features: Dict[str, Tensor]): 17 | features['sentence_embedding'] = self.norm(features['sentence_embedding']) 18 | return features 19 | 20 | 21 | def get_sentence_embedding_dimension(self): 22 | return self.dimension 23 | 24 | def save(self, output_path): 25 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 26 | json.dump({'dimension': self.dimension}, fOut, indent=2) 27 | 28 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 29 | 30 | @staticmethod 31 | def load(input_path): 32 | with open(os.path.join(input_path, 'config.json')) as fIn: 33 | config = json.load(fIn) 34 | 35 | model = LayerNorm(**config) 36 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 37 | return model -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/LabelSentenceReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class LabelSentenceReader: 7 | """Reads in a file that has at least two columns: a label and a sentence. 8 | This reader can for example be used with the BatchHardTripletLoss.
9 | Maps labels automatically to integers""" 10 | def __init__(self, folder, label_col_idx=0, sentence_col_idx=1, separator='\t'): 11 | self.folder = folder 12 | self.label_map = {} 13 | self.label_col_idx = label_col_idx 14 | self.sentence_col_idx = sentence_col_idx 15 | self.separator = separator 16 | 17 | def get_examples(self, filename, max_examples=0): 18 | examples = [] 19 | 20 | id = 0 21 | for line in open(os.path.join(self.folder, filename), encoding="utf-8"): 22 | splits = line.strip().split(self.separator) 23 | label = splits[self.label_col_idx] 24 | sentence = splits[self.sentence_col_idx] 25 | 26 | if label not in self.label_map: 27 | self.label_map[label] = len(self.label_map) 28 | 29 | label_id = self.label_map[label] 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence], label=label_id)) 33 | 34 | if 0 < max_examples <= id: 35 | break 36 | 37 | return examples 38 | -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/TripletReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class TripletReader(object): 7 | """ 8 | Reads in the a Triplet Dataset: Each line contains (at least) 3 columns, one anchor column (s1), 9 | one positive example (s2) and one negative example (s3) 10 | """ 11 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, s3_col_idx=2, has_header=False, delimiter="\t", 12 | quoting=csv.QUOTE_NONE): 13 | self.dataset_folder = dataset_folder 14 | self.s1_col_idx = s1_col_idx 15 | self.s2_col_idx = s2_col_idx 16 | self.s3_col_idx = s3_col_idx 17 | self.has_header = has_header 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | 21 | def get_examples(self, filename, max_examples=0): 22 | """ 23 | 24 | """ 25 | data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"), delimiter=self.delimiter, 26 | quoting=self.quoting) 27 | examples = [] 28 | if self.has_header: 29 | next(data) 30 | 31 | for id, row in enumerate(data): 32 | s1 = row[self.s1_col_idx] 33 | s2 = row[self.s2_col_idx] 34 | s3 = row[self.s3_col_idx] 35 | 36 | examples.append(InputExample(texts=[s1, s2, s3])) 37 | if max_examples > 0 and len(examples) >= max_examples: 38 | break 39 | 40 | return examples -------------------------------------------------------------------------------- /SentEval/LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For SentEval software 4 | 5 | Copyright (c) 2017-present, Facebook, Inc. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 
20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/NLIDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | 7 | class NLIDataReader(object): 8 | """ 9 | Reads in the Stanford NLI dataset and the MultiGenre NLI dataset 10 | """ 11 | def __init__(self, dataset_folder): 12 | self.dataset_folder = dataset_folder 13 | 14 | def get_examples(self, filename, max_examples=0): 15 | """ 16 | data_splits specified which data split to use (train, dev, test). 17 | Expects that self.dataset_folder contains the files s1.$data_split.gz, s2.$data_split.gz, 18 | labels.$data_split.gz, e.g., for the train split, s1.train.gz, s2.train.gz, labels.train.gz 19 | """ 20 | s1 = gzip.open(os.path.join(self.dataset_folder, 's1.' + filename), 21 | mode="rt", encoding="utf-8").readlines() 22 | s2 = gzip.open(os.path.join(self.dataset_folder, 's2.' + filename), 23 | mode="rt", encoding="utf-8").readlines() 24 | labels = gzip.open(os.path.join(self.dataset_folder, 'labels.' + filename), 25 | mode="rt", encoding="utf-8").readlines() 26 | 27 | examples = [] 28 | id = 0 29 | for sentence_a, sentence_b, label in zip(s1, s2, labels): 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=self.map_label(label))) 33 | 34 | if 0 < max_examples <= len(examples): 35 | break 36 | 37 | return examples 38 | 39 | @staticmethod 40 | def get_labels(): 41 | return {"contradiction": 0, "entailment": 1, "neutral": 2} 42 | 43 | def get_num_labels(self): 44 | return len(self.get_labels()) 45 | 46 | def map_label(self, label): 47 | return self.get_labels()[label.strip().lower()] -------------------------------------------------------------------------------- /SentEval/examples/skipthought.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | from __future__ import absolute_import, division, unicode_literals 9 | 10 | """ 11 | Example of file for SkipThought in SentEval 12 | """ 13 | import logging 14 | import sys 15 | sys.setdefaultencoding('utf8') 16 | 17 | 18 | # Set PATHs 19 | PATH_TO_SENTEVAL = '../' 20 | PATH_TO_DATA = '../data/senteval_data/' 21 | PATH_TO_SKIPTHOUGHT = '' 22 | 23 | assert PATH_TO_SKIPTHOUGHT != '', 'Download skipthought and set correct PATH' 24 | 25 | # import skipthought and Senteval 26 | sys.path.insert(0, PATH_TO_SKIPTHOUGHT) 27 | import skipthoughts 28 | sys.path.insert(0, PATH_TO_SENTEVAL) 29 | import senteval 30 | 31 | 32 | def prepare(params, samples): 33 | return 34 | 35 | def batcher(params, batch): 36 | batch = [str(' '.join(sent), errors="ignore") if sent != [] else '.' for sent in batch] 37 | embeddings = skipthoughts.encode(params['encoder'], batch, 38 | verbose=False, use_eos=True) 39 | return embeddings 40 | 41 | 42 | # Set params for SentEval 43 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10, 'batch_size': 512} 44 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64, 45 | 'tenacity': 5, 'epoch_size': 4} 46 | # Set up logger 47 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 48 | 49 | if __name__ == "__main__": 50 | # Load SkipThought model 51 | params_senteval['encoder'] = skipthoughts.load_model() 52 | 53 | se = senteval.engine.SE(params_senteval, batcher, prepare) 54 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 55 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 56 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 57 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 58 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 59 | 'OddManOut', 'CoordinationInversion'] 60 | results = se.eval(transfer_tasks) 61 | print(results) 62 | -------------------------------------------------------------------------------- /sentence_transformers_congen/losses/ConGenLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from typing import Iterable, Dict 6 | 7 | class ConGenLoss(nn.Module): 8 | def __init__(self, instanceQ_encoded, model, teacher_temp=0.1, student_temp=0.09): 9 | """ 10 | param model: SentenceTransformerModel 11 | teacher_temp: distillation temperature for teacher model 12 | student_temp: distillation temperature for student model 13 | """ 14 | super(ConGenLoss, self).__init__() 15 | self.instanceQ_encoded = instanceQ_encoded 16 | self.model = model 17 | self.teacher_temp = teacher_temp 18 | self.student_temp = student_temp 19 | 20 | def forward(self, 21 | sents1_features: Iterable[Dict[str, Tensor]], 22 | sents2_features: Iterable[Dict[str, Tensor]], 23 | Z_ref: Tensor): 24 | 25 | # Batch-size 26 | batch_size = Z_ref.shape[0] 27 | 28 | Z_con = F.normalize(self.model(sents1_features)['sentence_embedding'], p=2, dim=1) 29 | Z_gen = F.normalize(self.model(sents2_features)['sentence_embedding'], p=2, dim=1) 30 | 31 | # insert the current batch embedding from T 32 | instanceQ_encoded = self.instanceQ_encoded 33 | Q = torch.cat((instanceQ_encoded, Z_ref)) 34 | 35 | # probability scores distribution for T, S: B X (N + 1) 36 | T_ref = torch.einsum('nc,ck->nk', Z_ref, Q.t().clone().detach()) 37 | S_con = torch.einsum('nc,ck->nk', Z_con, Q.t().clone().detach()) 38 | S_gen = torch.einsum('nc,ck->nk', Z_gen, 
Q.t().clone().detach()) 39 | 40 | 41 | # Apply temperatures for soft-labels 42 | T_ref = F.softmax(T_ref/self.teacher_temp, dim=1) 43 | S_con = S_con / self.student_temp 44 | S_gen = S_gen / self.student_temp 45 | 46 | 47 | # loss computation, use log_softmax for stable computation 48 | loss_Con = -torch.mul(T_ref, F.log_softmax(S_con, dim=1)).sum() / batch_size 49 | loss_Gen = -torch.mul(T_ref, F.log_softmax(S_gen, dim=1)).sum() / batch_size 50 | 51 | # update the random sample queue 52 | self.instanceQ_encoded = Q[batch_size:] 53 | 54 | return (loss_Con + loss_Gen) / 2 -------------------------------------------------------------------------------- /SentEval/examples/googleuse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | from __future__ import absolute_import, division 9 | 10 | import os 11 | import sys 12 | import logging 13 | import tensorflow as tf 14 | import tensorflow_hub as hub 15 | tf.logging.set_verbosity(0) 16 | 17 | # Set PATHs 18 | PATH_TO_SENTEVAL = '../' 19 | PATH_TO_DATA = '../data' 20 | 21 | # import SentEval 22 | sys.path.insert(0, PATH_TO_SENTEVAL) 23 | import senteval 24 | 25 | # tensorflow session 26 | session = tf.Session() 27 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 28 | 29 | # SentEval prepare and batcher 30 | def prepare(params, samples): 31 | return 32 | 33 | def batcher(params, batch): 34 | batch = [' '.join(sent) if sent != [] else '.' for sent in batch] 35 | embeddings = params['google_use'](batch) 36 | return embeddings 37 | 38 | def make_embed_fn(module): 39 | with tf.Graph().as_default(): 40 | sentences = tf.placeholder(tf.string) 41 | embed = hub.Module(module) 42 | embeddings = embed(sentences) 43 | session = tf.train.MonitoredSession() 44 | return lambda x: session.run(embeddings, {sentences: x}) 45 | 46 | # Start TF session and load Google Universal Sentence Encoder 47 | encoder = make_embed_fn("https://tfhub.dev/google/universal-sentence-encoder-large/2") 48 | 49 | # Set params for SentEval 50 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 51 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 52 | 'tenacity': 3, 'epoch_size': 2} 53 | params_senteval['google_use'] = encoder 54 | 55 | # Set up logger 56 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 57 | 58 | if __name__ == "__main__": 59 | se = senteval.engine.SE(params_senteval, batcher, prepare) 60 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 61 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 62 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 63 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 64 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 65 | 'OddManOut', 'CoordinationInversion'] 66 | results = se.eval(transfer_tasks) 67 | print(results) 68 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/WeightedLayerPooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class WeightedLayerPooling(nn.Module): 10 | """ 11 | 
Token embeddings are weighted mean of their different hidden layer representations 12 | """ 13 | def __init__(self, word_embedding_dimension, num_hidden_layers: int = 12, layer_start: int = 4, layer_weights = None): 14 | super(WeightedLayerPooling, self).__init__() 15 | self.config_keys = ['word_embedding_dimension', 'layer_start', 'num_hidden_layers'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.layer_start = layer_start 18 | self.num_hidden_layers = num_hidden_layers 19 | self.layer_weights = layer_weights if layer_weights is not None else nn.Parameter(torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)) 20 | 21 | def forward(self, features: Dict[str, Tensor]): 22 | ft_all_layers = features['all_layer_embeddings'] 23 | 24 | all_layer_embedding = torch.stack(ft_all_layers) 25 | all_layer_embedding = all_layer_embedding[self.layer_start:, :, :, :] # Start from 4th layers output 26 | 27 | weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size()) 28 | weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum() 29 | 30 | features.update({'token_embeddings': weighted_average}) 31 | return features 32 | 33 | def get_word_embedding_dimension(self): 34 | return self.word_embedding_dimension 35 | 36 | def get_config_dict(self): 37 | return {key: self.__dict__[key] for key in self.config_keys} 38 | 39 | def save(self, output_path): 40 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 41 | json.dump(self.get_config_dict(), fOut, indent=2) 42 | 43 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 44 | 45 | 46 | @staticmethod 47 | def load(input_path): 48 | with open(os.path.join(input_path, 'config.json')) as fIn: 49 | config = json.load(fIn) 50 | 51 | model = WeightedLayerPooling(**config) 52 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 53 | return model 54 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/WhitespaceTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 7 | 8 | class WhitespaceTokenizer(WordTokenizer): 9 | """ 10 | Simple and fast white-space tokenizer. Splits sentence based on white spaces. 11 | Punctuation are stripped from tokens. 
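A minimal usage sketch (vocabulary and import path assumed for illustration only):
        from sentence_transformers_congen.models.tokenizer import WhitespaceTokenizer
        tokenizer = WhitespaceTokenizer(vocab=['hello', 'world'], stop_words=[], do_lower_case=True)
        tokenizer.tokenize('Hello world !')   # -> [0, 1]; tokens not found in the vocab are dropped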
12 | """ 13 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False): 14 | self.stop_words = set(stop_words) 15 | self.do_lower_case = do_lower_case 16 | self.set_vocab(vocab) 17 | 18 | def get_vocab(self): 19 | return self.vocab 20 | 21 | def set_vocab(self, vocab: Iterable[str]): 22 | self.vocab = vocab 23 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 24 | 25 | def tokenize(self, text: str) -> List[int]: 26 | if self.do_lower_case: 27 | text = text.lower() 28 | 29 | tokens = text.split() 30 | 31 | tokens_filtered = [] 32 | for token in tokens: 33 | if token in self.stop_words: 34 | continue 35 | elif token in self.word2idx: 36 | tokens_filtered.append(self.word2idx[token]) 37 | continue 38 | 39 | token = token.strip(string.punctuation) 40 | if token in self.stop_words: 41 | continue 42 | elif len(token) > 0 and token in self.word2idx: 43 | tokens_filtered.append(self.word2idx[token]) 44 | continue 45 | 46 | token = token.lower() 47 | if token in self.stop_words: 48 | continue 49 | elif token in self.word2idx: 50 | tokens_filtered.append(self.word2idx[token]) 51 | continue 52 | 53 | return tokens_filtered 54 | 55 | def save(self, output_path: str): 56 | with open(os.path.join(output_path, 'whitespacetokenizer_config.json'), 'w') as fOut: 57 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case}, fOut) 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'whitespacetokenizer_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | return WhitespaceTokenizer(**config) 65 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/LSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from typing import List 4 | import os 5 | import json 6 | 7 | 8 | 9 | class LSTM(nn.Module): 10 | """ 11 | Bidirectional LSTM running over word embeddings. 
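For example (sizes illustrative), with the default bidirectional=True the produced token embeddings have twice the hidden size:
        lstm = LSTM(word_embedding_dimension=300, hidden_dim=512)
        lstm.get_word_embedding_dimension()   # -> 1024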
12 | """ 13 | def __init__(self, word_embedding_dimension: int, hidden_dim: int, num_layers: int = 1, dropout: float = 0, bidirectional: bool = True): 14 | nn.Module.__init__(self) 15 | self.config_keys = ['word_embedding_dimension', 'hidden_dim', 'num_layers', 'dropout', 'bidirectional'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.hidden_dim = hidden_dim 18 | self.num_layers = num_layers 19 | self.dropout = dropout 20 | self.bidirectional = bidirectional 21 | 22 | self.embeddings_dimension = hidden_dim 23 | if self.bidirectional: 24 | self.embeddings_dimension *= 2 25 | 26 | self.encoder = nn.LSTM(word_embedding_dimension, hidden_dim, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True) 27 | 28 | def forward(self, features): 29 | token_embeddings = features['token_embeddings'] 30 | sentence_lengths = torch.clamp(features['sentence_lengths'], min=1) 31 | 32 | packed = nn.utils.rnn.pack_padded_sequence(token_embeddings, sentence_lengths, batch_first=True, enforce_sorted=False) 33 | packed = self.encoder(packed) 34 | unpack = nn.utils.rnn.pad_packed_sequence(packed[0], batch_first=True)[0] 35 | features.update({'token_embeddings': unpack}) 36 | return features 37 | 38 | def get_word_embedding_dimension(self) -> int: 39 | return self.embeddings_dimension 40 | 41 | def tokenize(self, text: str) -> List[int]: 42 | raise NotImplementedError() 43 | 44 | def save(self, output_path: str): 45 | with open(os.path.join(output_path, 'lstm_config.json'), 'w') as fOut: 46 | json.dump(self.get_config_dict(), fOut, indent=2) 47 | 48 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 49 | 50 | def get_config_dict(self): 51 | return {key: self.__dict__[key] for key in self.config_keys} 52 | 53 | @staticmethod 54 | def load(input_path: str): 55 | with open(os.path.join(input_path, 'lstm_config.json'), 'r') as fIn: 56 | config = json.load(fIn) 57 | 58 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 59 | model = LSTM(**config) 60 | model.load_state_dict(weights) 61 | return model 62 | 63 | -------------------------------------------------------------------------------- /SentEval/examples/gensen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | """ 9 | Clone GenSen repo here: https://github.com/Maluuba/gensen.git 10 | And follow instructions for loading the model used in batcher 11 | """ 12 | 13 | from __future__ import absolute_import, division, unicode_literals 14 | 15 | import sys 16 | import logging 17 | # import GenSen package 18 | from gensen import GenSen, GenSenSingle 19 | 20 | # Set PATHs 21 | PATH_TO_SENTEVAL = '../' 22 | PATH_TO_DATA = '../data' 23 | 24 | # import SentEval 25 | sys.path.insert(0, PATH_TO_SENTEVAL) 26 | import senteval 27 | 28 | # SentEval prepare and batcher 29 | def prepare(params, samples): 30 | return 31 | 32 | def batcher(params, batch): 33 | batch = [' '.join(sent) if sent != [] else '.' 
for sent in batch] 34 | _, reps_h_t = gensen.get_representation( 35 | sentences, pool='last', return_numpy=True, tokenize=True 36 | ) 37 | embeddings = reps_h_t 38 | return embeddings 39 | 40 | # Load GenSen model 41 | gensen_1 = GenSenSingle( 42 | model_folder='../data/models', 43 | filename_prefix='nli_large_bothskip', 44 | pretrained_emb='../data/embedding/glove.840B.300d.h5' 45 | ) 46 | gensen_2 = GenSenSingle( 47 | model_folder='../data/models', 48 | filename_prefix='nli_large_bothskip_parse', 49 | pretrained_emb='../data/embedding/glove.840B.300d.h5' 50 | ) 51 | gensen_encoder = GenSen(gensen_1, gensen_2) 52 | reps_h, reps_h_t = gensen.get_representation( 53 | sentences, pool='last', return_numpy=True, tokenize=True 54 | ) 55 | 56 | # Set params for SentEval 57 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 58 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 59 | 'tenacity': 3, 'epoch_size': 2} 60 | params_senteval['gensen'] = gensen_encoder 61 | 62 | # Set up logger 63 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 64 | 65 | if __name__ == "__main__": 66 | se = senteval.engine.SE(params_senteval, batcher, prepare) 67 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 68 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 69 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 70 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 71 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 72 | 'OddManOut', 'CoordinationInversion'] 73 | results = se.eval(transfer_tasks) 74 | print(results) 75 | -------------------------------------------------------------------------------- /SentEval/examples/infersent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | """ 9 | InferSent models. See https://github.com/facebookresearch/InferSent. 
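Before running, download an InferSent checkpoint (infersent1.pkl for V1 or infersent2.pkl for V2) together with the matching word vectors (GloVe for V1, fastText crawl-300d-2M.vec for V2), then point MODEL_PATH and PATH_TO_W2V below at those files; the script asserts that both exist before evaluating.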
10 | """ 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import sys 15 | import os 16 | import torch 17 | import logging 18 | 19 | # get models.py from InferSent repo 20 | from models import InferSent 21 | 22 | # Set PATHs 23 | PATH_SENTEVAL = '../' 24 | PATH_TO_DATA = '../data' 25 | PATH_TO_W2V = 'PATH/TO/glove.840B.300d.txt' # or crawl-300d-2M.vec for V2 26 | MODEL_PATH = 'infersent1.pkl' 27 | V = 1 # version of InferSent 28 | 29 | assert os.path.isfile(MODEL_PATH) and os.path.isfile(PATH_TO_W2V), \ 30 | 'Set MODEL and GloVe PATHs' 31 | 32 | # import senteval 33 | sys.path.insert(0, PATH_SENTEVAL) 34 | import senteval 35 | 36 | 37 | def prepare(params, samples): 38 | params.infersent.build_vocab([' '.join(s) for s in samples], tokenize=False) 39 | 40 | 41 | def batcher(params, batch): 42 | sentences = [' '.join(s) for s in batch] 43 | embeddings = params.infersent.encode(sentences, bsize=params.batch_size, tokenize=False) 44 | return embeddings 45 | 46 | 47 | """ 48 | Evaluation of trained model on Transfer Tasks (SentEval) 49 | """ 50 | 51 | # define senteval params 52 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 53 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 54 | 'tenacity': 3, 'epoch_size': 2} 55 | # Set up logger 56 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 57 | 58 | if __name__ == "__main__": 59 | # Load InferSent model 60 | params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 61 | 'pool_type': 'max', 'dpout_model': 0.0, 'version': V} 62 | model = InferSent(params_model) 63 | model.load_state_dict(torch.load(MODEL_PATH)) 64 | model.set_w2v_path(PATH_TO_W2V) 65 | 66 | params_senteval['infersent'] = model.cuda() 67 | 68 | se = senteval.engine.SE(params_senteval, batcher, prepare) 69 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 70 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 71 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 72 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 73 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 74 | 'OddManOut', 'CoordinationInversion'] 75 | results = se.eval(transfer_tasks) 76 | print(results) 77 | -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/STSDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class STSDataReader: 7 | """ 8 | Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx) 9 | 10 | Default values expects a tab seperated file with the first & second column the sentence pair and third column the score (0...1). 
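In the raw files the third column typically holds the 0...5 similarity score, e.g. an illustrative line 'A man plays guitar.\tA man is playing a guitar.\t4.2'.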
Default config normalizes scores from 0...5 to 0...1 11 | """ 12 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, score_col_idx=2, delimiter="\t", 13 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 14 | self.dataset_folder = dataset_folder 15 | self.score_col_idx = score_col_idx 16 | self.s1_col_idx = s1_col_idx 17 | self.s2_col_idx = s2_col_idx 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | self.normalize_scores = normalize_scores 21 | self.min_score = min_score 22 | self.max_score = max_score 23 | 24 | def get_examples(self, filename, max_examples=0): 25 | """ 26 | filename specified which data split to use (train.csv, dev.csv, test.csv). 27 | """ 28 | filepath = os.path.join(self.dataset_folder, filename) 29 | with gzip.open(filepath, 'rt', encoding='utf8') if filename.endswith('.gz') else open(filepath, encoding="utf-8") as fIn: 30 | data = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting) 31 | examples = [] 32 | for id, row in enumerate(data): 33 | score = float(row[self.score_col_idx]) 34 | if self.normalize_scores: # Normalize to a 0...1 value 35 | score = (score - self.min_score) / (self.max_score - self.min_score) 36 | 37 | s1 = row[self.s1_col_idx] 38 | s2 = row[self.s2_col_idx] 39 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2], label=score)) 40 | 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples 45 | 46 | class STSBenchmarkDataReader(STSDataReader): 47 | """ 48 | Reader especially for the STS benchmark dataset. There, the sentences are in column 5 and 6, the score is in column 4. 49 | Scores are normalized from 0...5 to 0...1 50 | """ 51 | def __init__(self, dataset_folder, s1_col_idx=5, s2_col_idx=6, score_col_idx=4, delimiter="\t", 52 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 53 | super().__init__(dataset_folder=dataset_folder, s1_col_idx=s1_col_idx, s2_col_idx=s2_col_idx, score_col_idx=score_col_idx, delimiter=delimiter, 54 | quoting=quoting, normalize_scores=normalize_scores, min_score=min_score, max_score=max_score) -------------------------------------------------------------------------------- /sentence_transformers_congen/models/Dense.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from torch import functional as F 5 | from typing import Union, Tuple, List, Iterable, Dict 6 | import os 7 | import json 8 | from ..util import fullname, import_from_string 9 | 10 | 11 | class Dense(nn.Module): 12 | """Feed-forward function with activiation function. 13 | 14 | This layer takes a fixed-sized sentence embedding and passes it through a feed-forward layer. Can be used to generate deep averaging networs (DAN). 
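A minimal sketch (sizes illustrative): Dense(in_features=768, out_features=128) projects a 768-dimensional sentence embedding down to 128 dimensions through a Tanh-activated linear layer (nn.Tanh is the default activation_function).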
15 | 16 | :param in_features: Size of the input dimension 17 | :param out_features: Output size 18 | :param bias: Add a bias vector 19 | :param activation_function: Pytorch activation function applied on output 20 | :param init_weight: Initial value for the matrix of the linear layer 21 | :param init_bias: Initial value for the bias of the linear layer 22 | """ 23 | def __init__(self, in_features: int, out_features: int, bias: bool = True, activation_function=nn.Tanh(), init_weight: Tensor = None, init_bias: Tensor = None): 24 | super(Dense, self).__init__() 25 | self.in_features = in_features 26 | self.out_features = out_features 27 | self.bias = bias 28 | self.activation_function = activation_function 29 | self.linear = nn.Linear(in_features, out_features, bias=bias) 30 | 31 | if init_weight is not None: 32 | self.linear.weight = nn.Parameter(init_weight) 33 | 34 | if init_bias is not None: 35 | self.linear.bias = nn.Parameter(init_bias) 36 | 37 | def forward(self, features: Dict[str, Tensor]): 38 | features.update({'sentence_embedding': self.activation_function(self.linear(features['sentence_embedding']))}) 39 | return features 40 | 41 | def get_sentence_embedding_dimension(self) -> int: 42 | return self.out_features 43 | 44 | def get_config_dict(self): 45 | return {'in_features': self.in_features, 'out_features': self.out_features, 'bias': self.bias, 'activation_function': fullname(self.activation_function)} 46 | 47 | def save(self, output_path): 48 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 49 | json.dump(self.get_config_dict(), fOut) 50 | 51 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 52 | 53 | def __repr__(self): 54 | return "Dense({})".format(self.get_config_dict()) 55 | @staticmethod 56 | def load(input_path): 57 | with open(os.path.join(input_path, 'config.json')) as fIn: 58 | config = json.load(fIn) 59 | 60 | config['activation_function'] = import_from_string(config['activation_function'])() 61 | model = Dense(**config) 62 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 63 | return model 64 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class CNN(nn.Module): 15 | """CNN-layer with multiple kernel-sizes over the word embeddings""" 16 | 17 | def __init__(self, in_word_embedding_dimension: int, out_channels: int = 256, kernel_sizes: List[int] = [1, 3, 5], stride_sizes: List[int] = None): 18 | nn.Module.__init__(self) 19 | self.config_keys = ['in_word_embedding_dimension', 'out_channels', 'kernel_sizes'] 20 | self.in_word_embedding_dimension = in_word_embedding_dimension 21 | self.out_channels = out_channels 22 | self.kernel_sizes = kernel_sizes 23 | 24 | self.embeddings_dimension = out_channels*len(kernel_sizes) 25 | self.convs = nn.ModuleList() 26 | 27 | in_channels = in_word_embedding_dimension 28 | if stride_sizes is None: 29 | stride_sizes = [1] * len(kernel_sizes) 30 | 31 | for kernel_size, stride in zip(kernel_sizes, stride_sizes): 
32 | padding_size = int((kernel_size - 1) / 2) 33 | conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, 34 | kernel_size=kernel_size, 35 | stride=stride, 36 | padding=padding_size) 37 | self.convs.append(conv) 38 | 39 | def forward(self, features): 40 | token_embeddings = features['token_embeddings'] 41 | 42 | token_embeddings = token_embeddings.transpose(1, -1) 43 | vectors = [conv(token_embeddings) for conv in self.convs] 44 | out = torch.cat(vectors, 1).transpose(1, -1) 45 | 46 | features.update({'token_embeddings': out}) 47 | return features 48 | 49 | def get_word_embedding_dimension(self) -> int: 50 | return self.embeddings_dimension 51 | 52 | def tokenize(self, text: str) -> List[int]: 53 | raise NotImplementedError() 54 | 55 | def save(self, output_path: str): 56 | with open(os.path.join(output_path, 'cnn_config.json'), 'w') as fOut: 57 | json.dump(self.get_config_dict(), fOut, indent=2) 58 | 59 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 60 | 61 | def get_config_dict(self): 62 | return {key: self.__dict__[key] for key in self.config_keys} 63 | 64 | @staticmethod 65 | def load(input_path: str): 66 | with open(os.path.join(input_path, 'cnn_config.json'), 'r') as fIn: 67 | config = json.load(fIn) 68 | 69 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu')) 70 | model = CNN(**config) 71 | model.load_state_dict(weights) 72 | return model 73 | 74 | -------------------------------------------------------------------------------- /SentEval/senteval/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | from __future__ import absolute_import, division, unicode_literals 9 | 10 | import numpy as np 11 | import re 12 | import inspect 13 | from torch import optim 14 | 15 | 16 | def create_dictionary(sentences): 17 | words = {} 18 | for s in sentences: 19 | for word in s: 20 | if word in words: 21 | words[word] += 1 22 | else: 23 | words[word] = 1 24 | words[''] = 1e9 + 4 25 | words[''] = 1e9 + 3 26 | words['
<p>
'] = 1e9 + 2 27 | # words[''] = 1e9 + 1 28 | sorted_words = sorted(words.items(), key=lambda x: -x[1]) # inverse sort 29 | id2word = [] 30 | word2id = {} 31 | for i, (w, _) in enumerate(sorted_words): 32 | id2word.append(w) 33 | word2id[w] = i 34 | 35 | return id2word, word2id 36 | 37 | 38 | def cosine(u, v): 39 | return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)) 40 | 41 | 42 | class dotdict(dict): 43 | """ dot.notation access to dictionary attributes """ 44 | __getattr__ = dict.get 45 | __setattr__ = dict.__setitem__ 46 | __delattr__ = dict.__delitem__ 47 | 48 | 49 | def get_optimizer(s): 50 | """ 51 | Parse optimizer parameters. 52 | Input should be of the form: 53 | - "sgd,lr=0.01" 54 | - "adagrad,lr=0.1,lr_decay=0.05" 55 | """ 56 | if "," in s: 57 | method = s[:s.find(',')] 58 | optim_params = {} 59 | for x in s[s.find(',') + 1:].split(','): 60 | split = x.split('=') 61 | assert len(split) == 2 62 | assert re.match("^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None 63 | optim_params[split[0]] = float(split[1]) 64 | else: 65 | method = s 66 | optim_params = {} 67 | 68 | if method == 'adadelta': 69 | optim_fn = optim.Adadelta 70 | elif method == 'adagrad': 71 | optim_fn = optim.Adagrad 72 | elif method == 'adam': 73 | optim_fn = optim.Adam 74 | elif method == 'adamax': 75 | optim_fn = optim.Adamax 76 | elif method == 'asgd': 77 | optim_fn = optim.ASGD 78 | elif method == 'rmsprop': 79 | optim_fn = optim.RMSprop 80 | elif method == 'rprop': 81 | optim_fn = optim.Rprop 82 | elif method == 'sgd': 83 | optim_fn = optim.SGD 84 | assert 'lr' in optim_params 85 | else: 86 | raise Exception('Unknown optimization method: "%s"' % method) 87 | 88 | # check that we give good parameters to the optimizer 89 | expected_args = inspect.getargspec(optim_fn.__init__)[0] 90 | assert expected_args[:2] == ['self', 'params'] 91 | if not all(k in expected_args[2:] for k in optim_params.keys()): 92 | raise Exception('Unexpected parameters: expected "%s", got "%s"' % ( 93 | str(expected_args[2:]), str(optim_params.keys()))) 94 | 95 | return optim_fn, optim_params 96 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/CLIPModel.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import transformers 3 | import torch 4 | from PIL import Image 5 | 6 | 7 | class CLIPModel(nn.Module): 8 | def __init__(self, model_name: str = "openai/clip-vit-base-patch32", processor_name = None): 9 | super(CLIPModel, self).__init__() 10 | 11 | if processor_name is None: 12 | processor_name = model_name 13 | 14 | self.model = transformers.CLIPModel.from_pretrained(model_name) 15 | self.processor = transformers.CLIPProcessor.from_pretrained(processor_name) 16 | 17 | def __repr__(self): 18 | return "CLIPModel()" 19 | 20 | def forward(self, features): 21 | image_embeds = [] 22 | text_embeds = [] 23 | 24 | if 'pixel_values' in features: 25 | vision_outputs = self.model.vision_model(pixel_values=features['pixel_values']) 26 | image_embeds = self.model.visual_projection(vision_outputs[1]) 27 | 28 | if 'input_ids' in features: 29 | text_outputs = self.model.text_model( 30 | input_ids=features.get('input_ids'), 31 | attention_mask=features.get('attention_mask', None), 32 | position_ids=features.get('position_ids', None), 33 | output_attentions=features.get('output_attentions', None), 34 | output_hidden_states=features.get('output_hidden_states', None), 35 | ) 36 | text_embeds = 
self.model.text_projection(text_outputs[1]) 37 | 38 | sentence_embedding = [] 39 | image_features = iter(image_embeds) 40 | text_features = iter(text_embeds) 41 | 42 | for idx, input_type in enumerate(features['image_text_info']): 43 | if input_type == 0: 44 | sentence_embedding.append(next(image_features)) 45 | else: 46 | sentence_embedding.append(next(text_features)) 47 | 48 | features['sentence_embedding'] = torch.stack(sentence_embedding).float() 49 | 50 | return features 51 | 52 | 53 | def tokenize(self, texts): 54 | images = [] 55 | texts_values = [] 56 | image_text_info = [] 57 | 58 | for idx, data in enumerate(texts): 59 | if isinstance(data, Image.Image): # An Image 60 | images.append(data) 61 | image_text_info.append(0) 62 | else: # A text 63 | texts_values.append(data) 64 | image_text_info.append(1) 65 | 66 | if len(texts_values) == 0: 67 | texts_values = None 68 | if len(images) == 0: 69 | images = None 70 | 71 | inputs = self.processor(text=texts_values, images=images, return_tensors="pt", padding=True) 72 | inputs['image_text_info'] = image_text_info 73 | return inputs 74 | 75 | 76 | def save(self, output_path: str): 77 | self.model.save_pretrained(output_path) 78 | self.processor.save_pretrained(output_path) 79 | 80 | @staticmethod 81 | def load(input_path: str): 82 | return CLIPModel(model_name=input_path) 83 | 84 | 85 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/LabelAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import torch 3 | from torch.utils.data import DataLoader 4 | import logging 5 | from ..util import batch_to_device 6 | import os 7 | import csv 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class LabelAccuracyEvaluator(SentenceEvaluator): 13 | """ 14 | Evaluate a model based on its accuracy on a labeled dataset 15 | 16 | This requires a model with LossFunction.SOFTMAX 17 | 18 | The results are written in a CSV. If a CSV already exists, then values are appended. 
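A usage sketch (names illustrative; assumes a DataLoader of labeled InputExamples and the softmax classification head used during training):
        evaluator = LabelAccuracyEvaluator(dev_dataloader, name='dev', softmax_model=train_loss)
        accuracy = evaluator(model, output_path='output/')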
19 | """ 20 | 21 | def __init__(self, dataloader: DataLoader, name: str = "", softmax_model = None, write_csv: bool = True): 22 | """ 23 | Constructs an evaluator for the given dataset 24 | 25 | :param dataloader: 26 | the data for the evaluation 27 | """ 28 | self.dataloader = dataloader 29 | self.name = name 30 | self.softmax_model = softmax_model 31 | 32 | if name: 33 | name = "_"+name 34 | 35 | self.write_csv = write_csv 36 | self.csv_file = "accuracy_evaluation"+name+"_results.csv" 37 | self.csv_headers = ["epoch", "steps", "accuracy"] 38 | 39 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 40 | model.eval() 41 | total = 0 42 | correct = 0 43 | 44 | if epoch != -1: 45 | if steps == -1: 46 | out_txt = " after epoch {}:".format(epoch) 47 | else: 48 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 49 | else: 50 | out_txt = ":" 51 | 52 | logger.info("Evaluation on the "+self.name+" dataset"+out_txt) 53 | self.dataloader.collate_fn = model.smart_batching_collate 54 | for step, batch in enumerate(self.dataloader): 55 | features, label_ids = batch 56 | for idx in range(len(features)): 57 | features[idx] = batch_to_device(features[idx], model.device) 58 | label_ids = label_ids.to(model.device) 59 | with torch.no_grad(): 60 | _, prediction = self.softmax_model(features, labels=None) 61 | 62 | total += prediction.size(0) 63 | correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item() 64 | accuracy = correct/total 65 | 66 | logger.info("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total)) 67 | 68 | if output_path is not None and self.write_csv: 69 | csv_path = os.path.join(output_path, self.csv_file) 70 | if not os.path.isfile(csv_path): 71 | with open(csv_path, newline='', mode="w", encoding="utf-8") as f: 72 | writer = csv.writer(f) 73 | writer.writerow(self.csv_headers) 74 | writer.writerow([epoch, steps, accuracy]) 75 | else: 76 | with open(csv_path, newline='', mode="a", encoding="utf-8") as f: 77 | writer = csv.writer(f) 78 | writer.writerow([epoch, steps, accuracy]) 79 | 80 | return accuracy 81 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/MSEEvaluator.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers.evaluation import SentenceEvaluator 2 | import numpy as np 3 | import logging 4 | import os 5 | import csv 6 | from typing import List 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class MSEEvaluator(SentenceEvaluator): 12 | """ 13 | Computes the mean squared error (x100) between the computed sentence embedding 14 | and some target sentence embedding. 15 | 16 | The MSE is computed between ||teacher.encode(source_sentences) - student.encode(target_sentences)||. 17 | 18 | For multilingual knowledge distillation (https://arxiv.org/abs/2004.09813), source_sentences are in English 19 | and target_sentences are in a different language like German, Chinese, Spanish... 20 | 21 | :param source_sentences: Source sentences are embedded with the teacher model 22 | :param target_sentences: Target sentences are ambedding with the student model. 
23 | :param show_progress_bar: Show progress bar when computing embeddings 24 | :param batch_size: Batch size to compute sentence embeddings 25 | :param name: Name of the evaluator 26 | :param write_csv: Write results to CSV file 27 | """ 28 | def __init__(self, source_sentences: List[str], target_sentences: List[str], teacher_model = None, show_progress_bar: bool = False, batch_size: int = 32, name: str = '', write_csv: bool = True): 29 | self.source_embeddings = teacher_model.encode(source_sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_numpy=True) 30 | 31 | self.target_sentences = target_sentences 32 | self.show_progress_bar = show_progress_bar 33 | self.batch_size = batch_size 34 | self.name = name 35 | 36 | self.csv_file = "mse_evaluation_" + name + "_results.csv" 37 | self.csv_headers = ["epoch", "steps", "MSE"] 38 | self.write_csv = write_csv 39 | 40 | def __call__(self, model, output_path, epoch = -1, steps = -1): 41 | if epoch != -1: 42 | if steps == -1: 43 | out_txt = " after epoch {}:".format(epoch) 44 | else: 45 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 46 | else: 47 | out_txt = ":" 48 | 49 | target_embeddings = model.encode(self.target_sentences, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_numpy=True) 50 | 51 | mse = ((self.source_embeddings - target_embeddings)**2).mean() 52 | mse *= 100 53 | 54 | logger.info("MSE evaluation (lower = better) on "+self.name+" dataset"+out_txt) 55 | logger.info("MSE (*100):\t{:4f}".format(mse)) 56 | 57 | if output_path is not None and self.write_csv: 58 | csv_path = os.path.join(output_path, self.csv_file) 59 | output_file_exists = os.path.isfile(csv_path) 60 | with open(csv_path, newline='', mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 61 | writer = csv.writer(f) 62 | if not output_file_exists: 63 | writer.writerow(self.csv_headers) 64 | 65 | writer.writerow([epoch, steps, mse]) 66 | 67 | return -mse #Return negative score as SentenceTransformers maximizes the performance 68 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/WordWeights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class WordWeights(nn.Module): 13 | """This model can weight word embeddings, for example, with idf-values.""" 14 | 15 | def __init__(self, vocab: List[str], word_weights: Dict[str, float], unknown_word_weight: float = 1): 16 | """ 17 | 18 | :param vocab: 19 | Vocabulary of the tokenizer 20 | :param word_weights: 21 | Mapping of tokens to a float weight value. Words embeddings are multiplied by this float value. Tokens in word_weights must not be equal to the vocab (can contain more or less values) 22 | :param unknown_word_weight: 23 | Weight for words in vocab, that do not appear in the word_weights lookup. These can be for example rare words in the vocab, where no weight exists. 
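A small sketch (weights illustrative, idf-style): given word_weights = {'the': 0.2, 'movie': 1.7}, WordWeights(vocab=embedding_vocab, word_weights=word_weights, unknown_word_weight=1.0) scales each token embedding by its weight before pooling; embedding_vocab here stands for the word list of the preceding embedding layer and is assumed to exist in the calling code.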
24 | """ 25 | super(WordWeights, self).__init__() 26 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight'] 27 | self.vocab = vocab 28 | self.word_weights = word_weights 29 | self.unknown_word_weight = unknown_word_weight 30 | 31 | weights = [] 32 | num_unknown_words = 0 33 | for word in vocab: 34 | weight = unknown_word_weight 35 | if word in word_weights: 36 | weight = word_weights[word] 37 | elif word.lower() in word_weights: 38 | weight = word_weights[word.lower()] 39 | else: 40 | num_unknown_words += 1 41 | weights.append(weight) 42 | 43 | logger.info("{} of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 44 | 45 | self.emb_layer = nn.Embedding(len(vocab), 1) 46 | self.emb_layer.load_state_dict({'weight': torch.FloatTensor(weights).unsqueeze(1)}) 47 | 48 | 49 | def forward(self, features: Dict[str, Tensor]): 50 | attention_mask = features['attention_mask'] 51 | token_embeddings = features['token_embeddings'] 52 | 53 | #Compute a weight value for each token 54 | token_weights_raw = self.emb_layer(features['input_ids']).squeeze(-1) 55 | token_weights = token_weights_raw * attention_mask.float() 56 | token_weights_sum = torch.sum(token_weights, 1) 57 | 58 | #Multiply embedding by token weight value 59 | token_weights_expanded = token_weights.unsqueeze(-1).expand(token_embeddings.size()) 60 | token_embeddings = token_embeddings * token_weights_expanded 61 | 62 | features.update({'token_embeddings': token_embeddings, 'token_weights_sum': token_weights_sum}) 63 | return features 64 | 65 | def get_config_dict(self): 66 | return {key: self.__dict__[key] for key in self.config_keys} 67 | 68 | def save(self, output_path): 69 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 70 | json.dump(self.get_config_dict(), fOut, indent=2) 71 | 72 | @staticmethod 73 | def load(input_path): 74 | with open(os.path.join(input_path, 'config.json')) as fIn: 75 | config = json.load(fIn) 76 | 77 | return WordWeights(**config) 78 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/BoW.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | import numpy as np 9 | from .tokenizer import WhitespaceTokenizer 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class BoW(nn.Module): 15 | """Implements a Bag-of-Words (BoW) model to derive sentence embeddings. 16 | 17 | A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab. 
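A toy sketch: BoW(vocab=['good', 'bad', 'movie']).tokenize(['good movie movie']) returns {'sentence_embedding': ...} with a 3-dimensional count vector holding 1 at the index of 'good' and 2 at the index of 'movie' (index order follows the internally de-duplicated vocab).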
18 | """ 19 | 20 | def __init__(self, vocab: List[str], word_weights: Dict[str, float] = {}, unknown_word_weight: float = 1, cumulative_term_frequency: bool = True): 21 | super(BoW, self).__init__() 22 | vocab = list(set(vocab)) #Ensure vocab is unique 23 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight', 'cumulative_term_frequency'] 24 | self.vocab = vocab 25 | self.word_weights = word_weights 26 | self.unknown_word_weight = unknown_word_weight 27 | self.cumulative_term_frequency = cumulative_term_frequency 28 | 29 | #Maps wordIdx -> word weight 30 | self.weights = [] 31 | num_unknown_words = 0 32 | for word in vocab: 33 | weight = unknown_word_weight 34 | if word in word_weights: 35 | weight = word_weights[word] 36 | elif word.lower() in word_weights: 37 | weight = word_weights[word.lower()] 38 | else: 39 | num_unknown_words += 1 40 | self.weights.append(weight) 41 | 42 | logger.info("{} out of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 43 | 44 | self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False) 45 | self.sentence_embedding_dimension = len(vocab) 46 | 47 | 48 | def forward(self, features: Dict[str, Tensor]): 49 | #Nothing to do, everything is done in get_sentence_features 50 | return features 51 | 52 | def tokenize(self, texts: List[str]) -> List[int]: 53 | tokenized = [self.tokenizer.tokenize(text) for text in texts] 54 | return self.get_sentence_features(tokenized) 55 | 56 | def get_sentence_embedding_dimension(self): 57 | return self.sentence_embedding_dimension 58 | 59 | def get_sentence_features(self, tokenized_texts: List[List[int]], pad_seq_length: int = 0): 60 | vectors = [] 61 | 62 | for tokens in tokenized_texts: 63 | vector = np.zeros(self.get_sentence_embedding_dimension(), dtype=np.float32) 64 | for token in tokens: 65 | if self.cumulative_term_frequency: 66 | vector[token] += self.weights[token] 67 | else: 68 | vector[token] = self.weights[token] 69 | vectors.append(vector) 70 | 71 | return {'sentence_embedding': torch.tensor(vectors, dtype=torch.float)} 72 | 73 | def get_config_dict(self): 74 | return {key: self.__dict__[key] for key in self.config_keys} 75 | 76 | def save(self, output_path): 77 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 78 | json.dump(self.get_config_dict(), fOut, indent=2) 79 | 80 | @staticmethod 81 | def load(input_path): 82 | with open(os.path.join(input_path, 'config.json')) as fIn: 83 | config = json.load(fIn) 84 | 85 | return BoW(**config) 86 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/T5.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import T5Model, T5Tokenizer 3 | import json 4 | from typing import List, Dict, Optional 5 | import os 6 | import numpy as np 7 | import logging 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class T5(nn.Module): 13 | """DEPRECATED: Please use models.Transformer instead. 14 | 15 | T5 model to generate token embeddings. 16 | 17 | Each token is mapped to an output vector from BERT. 
18 | """ 19 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, task_identifier: str = 'stsb sentence1: ', model_args: Dict = {}, tokenizer_args: Dict = {}): 20 | super(T5, self).__init__() 21 | self.config_keys = ['max_seq_length', 'do_lower_case', 'task_identifier'] 22 | self.do_lower_case = do_lower_case 23 | 24 | if max_seq_length > 512: 25 | logger.warning("T5 only allows a max_seq_length of 512. Value will be set to 512") 26 | max_seq_length = 512 27 | self.max_seq_length = max_seq_length 28 | 29 | if self.do_lower_case is not None: 30 | tokenizer_args['do_lower_case'] = do_lower_case 31 | 32 | self.t5model = T5Model.from_pretrained(model_name_or_path, **model_args) 33 | self.tokenizer = T5Tokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 34 | self.task_identifier = task_identifier 35 | 36 | def forward(self, features): 37 | """Returns token_embeddings, cls_token""" 38 | output_states = self.t5model.encoder(input_ids=features['input_ids'], attention_mask=features['attention_mask']) 39 | output_tokens = output_states[0] 40 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 41 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens}) 42 | 43 | if len(output_states) > 1: 44 | features.update({'all_layer_embeddings': output_states[1]}) 45 | 46 | return features 47 | 48 | def get_word_embedding_dimension(self) -> int: 49 | return self.t5model.config.hidden_size 50 | 51 | def tokenize(self, text: str) -> List[int]: 52 | """ 53 | Tokenizes a text and maps tokens to token-ids 54 | """ 55 | return self.tokenizer.encode(self.task_identifier+text) 56 | 57 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 58 | """ 59 | Convert tokenized sentence in its embedding ids, segment ids and mask 60 | 61 | :param tokens: 62 | a tokenized sentence 63 | :param pad_seq_length: 64 | the maximal length of the sequence. 
Cannot be greater than self.sentence_transformer_config.max_seq_length 65 | :return: embedding ids, segment ids and mask for the sentence 66 | """ 67 | 68 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 69 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, padding='max_length', return_tensors='pt', truncation=True, prepend_batch_axis=True) 70 | 71 | def get_config_dict(self): 72 | return {key: self.__dict__[key] for key in self.config_keys} 73 | 74 | def save(self, output_path: str): 75 | self.t5model.save_pretrained(output_path) 76 | self.tokenizer.save_pretrained(output_path) 77 | 78 | with open(os.path.join(output_path, 'sentence_T5_config.json'), 'w') as fOut: 79 | json.dump(self.get_config_dict(), fOut, indent=2) 80 | 81 | @staticmethod 82 | def load(input_path: str): 83 | with open(os.path.join(input_path, 'sentence_T5_config.json')) as fIn: 84 | config = json.load(fIn) 85 | return T5(model_name_or_path=input_path, **config) 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/WordTokenizer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Union, Tuple, List, Iterable, Dict 3 | 4 | ENGLISH_STOP_WORDS = ['!', '"', "''", "``", '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', 'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'aren', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldn', 'couldnt', 'cry', 'd', 'de', 'describe', 'detail', 'did', 'didn', 'do', 'does', 'doesn', 'doing', 'don', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'hadn', 'has', 'hasn', 'hasnt', 'have', 'haven', 'having', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'isn', 'it', 'its', 'itself', 'just', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'll', 'ltd', 'm', 'ma', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mightn', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'mustn', 'my', 'myself', 'name', 'namely', 'needn', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'o', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'rather', 
're', 's', 'same', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'shan', 'she', 'should', 'shouldn', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'system', 't', 'take', 'ten', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until', 'up', 'upon', 'us', 've', 'very', 'via', 'was', 'wasn', 'we', 'well', 'were', 'weren', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'won', 'would', 'wouldn', 'y', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves'] 5 | 6 | 7 | class WordTokenizer(ABC): 8 | @abstractmethod 9 | def set_vocab(self, vocab: Iterable[str]): 10 | pass 11 | 12 | @abstractmethod 13 | def get_vocab(self, vocab: Iterable[str]): 14 | pass 15 | 16 | @abstractmethod 17 | def tokenize(self, text: str) -> List[int]: 18 | pass 19 | 20 | @abstractmethod 21 | def save(self, output_path: str): 22 | pass 23 | 24 | @staticmethod 25 | @abstractmethod 26 | def load(input_path: str): 27 | pass -------------------------------------------------------------------------------- /SentEval/senteval/trec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | TREC question-type classification 10 | ''' 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import os 15 | import io 16 | import logging 17 | import numpy as np 18 | 19 | from senteval.tools.validation import KFoldClassifier 20 | 21 | 22 | class TRECEval(object): 23 | def __init__(self, task_path, seed=1111): 24 | logging.info('***** Transfer task : TREC *****\n\n') 25 | self.seed = seed 26 | self.train = self.loadFile(os.path.join(task_path, 'train_5500.label')) 27 | self.test = self.loadFile(os.path.join(task_path, 'TREC_10.label')) 28 | 29 | def do_prepare(self, params, prepare): 30 | samples = self.train['X'] + self.test['X'] 31 | return prepare(params, samples) 32 | 33 | def loadFile(self, fpath): 34 | trec_data = {'X': [], 'y': []} 35 | tgt2idx = {'ABBR': 0, 'DESC': 1, 'ENTY': 2, 36 | 'HUM': 3, 'LOC': 4, 'NUM': 5} 37 | with io.open(fpath, 'r', encoding='latin-1') as f: 38 | for line in f: 39 | target, sample = line.strip().split(':', 1) 40 | sample = sample.split(' ', 1)[1].split() 41 | assert target in tgt2idx, target 42 | trec_data['X'].append(sample) 43 | trec_data['y'].append(tgt2idx[target]) 44 | return trec_data 45 | 46 | def run(self, params, batcher): 47 | train_embeddings, test_embeddings = [], [] 48 | 49 | # Sort to reduce padding 50 | sorted_corpus_train = sorted(zip(self.train['X'], self.train['y']), 51 | key=lambda z: (len(z[0]), z[1])) 52 | train_samples = [x for (x, y) in sorted_corpus_train] 53 | train_labels = [y for (x, y) in sorted_corpus_train] 54 | 55 | sorted_corpus_test = sorted(zip(self.test['X'], self.test['y']), 56 | key=lambda z: (len(z[0]), z[1])) 57 | test_samples = [x for (x, y) in sorted_corpus_test] 58 | test_labels = [y for (x, y) in sorted_corpus_test] 59 | 60 | # Get train embeddings 61 | for ii in range(0, len(train_labels), params.batch_size): 62 | batch = train_samples[ii:ii + params.batch_size] 63 | embeddings = batcher(params, batch) 64 | train_embeddings.append(embeddings) 65 | train_embeddings = np.vstack(train_embeddings) 66 | logging.info('Computed train embeddings') 67 | 68 | # Get test embeddings 69 | for ii in range(0, len(test_labels), params.batch_size): 70 | batch = test_samples[ii:ii + params.batch_size] 71 | embeddings = batcher(params, batch) 72 | test_embeddings.append(embeddings) 73 | test_embeddings = np.vstack(test_embeddings) 74 | logging.info('Computed test embeddings') 75 | 76 | config_classifier = {'nclasses': 6, 'seed': self.seed, 77 | 'usepytorch': params.usepytorch, 78 | 'classifier': params.classifier, 79 | 'kfold': params.kfold} 80 | clf = KFoldClassifier({'X': train_embeddings, 81 | 'y': np.array(train_labels)}, 82 | {'X': test_embeddings, 83 | 'y': np.array(test_labels)}, 84 | config_classifier) 85 | devacc, testacc, _ = clf.run() 86 | logging.debug('\nDev acc : {0} Test acc : {1} \ 87 | for TREC\n'.format(devacc, testacc)) 88 | return {'devacc': devacc, 'acc': testacc, 89 | 'ndev': len(self.train['X']), 'ntest': len(self.test['X'])} 90 | -------------------------------------------------------------------------------- /SentEval/examples/bow.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | from __future__ import absolute_import, division, unicode_literals 9 | 10 | import sys 11 | import io 12 | import numpy as np 13 | import logging 14 | 15 | 16 | # Set PATHs 17 | PATH_TO_SENTEVAL = '../' 18 | PATH_TO_DATA = '../data' 19 | # PATH_TO_VEC = 'glove/glove.840B.300d.txt' 20 | PATH_TO_VEC = 'fasttext/crawl-300d-2M.vec' 21 | 22 | # import SentEval 23 | sys.path.insert(0, PATH_TO_SENTEVAL) 24 | import senteval 25 | 26 | 27 | # Create dictionary 28 | def create_dictionary(sentences, threshold=0): 29 | words = {} 30 | for s in sentences: 31 | for word in s: 32 | words[word] = words.get(word, 0) + 1 33 | 34 | if threshold > 0: 35 | newwords = {} 36 | for word in words: 37 | if words[word] >= threshold: 38 | newwords[word] = words[word] 39 | words = newwords 40 | words['<s>'] = 1e9 + 4 41 | words['</s>'] = 1e9 + 3 42 | words['<p>
'] = 1e9 + 2 43 | 44 | sorted_words = sorted(words.items(), key=lambda x: -x[1]) # inverse sort 45 | id2word = [] 46 | word2id = {} 47 | for i, (w, _) in enumerate(sorted_words): 48 | id2word.append(w) 49 | word2id[w] = i 50 | 51 | return id2word, word2id 52 | 53 | # Get word vectors from vocabulary (glove, word2vec, fasttext ..) 54 | def get_wordvec(path_to_vec, word2id): 55 | word_vec = {} 56 | 57 | with io.open(path_to_vec, 'r', encoding='utf-8') as f: 58 | # if word2vec or fasttext file : skip first line "next(f)" 59 | for line in f: 60 | word, vec = line.split(' ', 1) 61 | if word in word2id: 62 | word_vec[word] = np.fromstring(vec, sep=' ') 63 | 64 | logging.info('Found {0} words with word vectors, out of \ 65 | {1} words'.format(len(word_vec), len(word2id))) 66 | return word_vec 67 | 68 | 69 | # SentEval prepare and batcher 70 | def prepare(params, samples): 71 | _, params.word2id = create_dictionary(samples) 72 | params.word_vec = get_wordvec(PATH_TO_VEC, params.word2id) 73 | params.wvec_dim = 300 74 | return 75 | 76 | def batcher(params, batch): 77 | batch = [sent if sent != [] else ['.'] for sent in batch] 78 | embeddings = [] 79 | 80 | for sent in batch: 81 | sentvec = [] 82 | for word in sent: 83 | if word in params.word_vec: 84 | sentvec.append(params.word_vec[word]) 85 | if not sentvec: 86 | vec = np.zeros(params.wvec_dim) 87 | sentvec.append(vec) 88 | sentvec = np.mean(sentvec, 0) 89 | embeddings.append(sentvec) 90 | 91 | embeddings = np.vstack(embeddings) 92 | return embeddings 93 | 94 | 95 | # Set params for SentEval 96 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 97 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 98 | 'tenacity': 3, 'epoch_size': 2} 99 | 100 | # Set up logger 101 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 102 | 103 | if __name__ == "__main__": 104 | se = senteval.engine.SE(params_senteval, batcher, prepare) 105 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 106 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 107 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 108 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 109 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 110 | 'OddManOut', 'CoordinationInversion'] 111 | results = se.eval(transfer_tasks) 112 | print(results) 113 | -------------------------------------------------------------------------------- /SentEval/senteval/binary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | Binary classifier and corresponding datasets : MR, CR, SUBJ, MPQA 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import io 14 | import os 15 | import numpy as np 16 | import logging 17 | 18 | from senteval.tools.validation import InnerKFoldClassifier 19 | 20 | 21 | class BinaryClassifierEval(object): 22 | def __init__(self, pos, neg, seed=1111): 23 | self.seed = seed 24 | self.samples, self.labels = pos + neg, [1] * len(pos) + [0] * len(neg) 25 | self.n_samples = len(self.samples) 26 | 27 | def do_prepare(self, params, prepare): 28 | # prepare is given the whole text 29 | return prepare(params, self.samples) 30 | # prepare puts everything it outputs in "params" : params.word2id etc 31 | # Those output will be further used by "batcher". 32 | 33 | def loadFile(self, fpath): 34 | with io.open(fpath, 'r', encoding='latin-1') as f: 35 | return [line.split() for line in f.read().splitlines()] 36 | 37 | def run(self, params, batcher): 38 | enc_input = [] 39 | # Sort to reduce padding 40 | sorted_corpus = sorted(zip(self.samples, self.labels), 41 | key=lambda z: (len(z[0]), z[1])) 42 | sorted_samples = [x for (x, y) in sorted_corpus] 43 | sorted_labels = [y for (x, y) in sorted_corpus] 44 | logging.info('Generating sentence embeddings') 45 | for ii in range(0, self.n_samples, params.batch_size): 46 | batch = sorted_samples[ii:ii + params.batch_size] 47 | embeddings = batcher(params, batch) 48 | enc_input.append(embeddings) 49 | enc_input = np.vstack(enc_input) 50 | logging.info('Generated sentence embeddings') 51 | 52 | config = {'nclasses': 2, 'seed': self.seed, 53 | 'usepytorch': params.usepytorch, 54 | 'classifier': params.classifier, 55 | 'nhid': params.nhid, 'kfold': params.kfold} 56 | clf = InnerKFoldClassifier(enc_input, np.array(sorted_labels), config) 57 | devacc, testacc = clf.run() 58 | logging.debug('Dev acc : {0} Test acc : {1}\n'.format(devacc, testacc)) 59 | return {'devacc': devacc, 'acc': testacc, 'ndev': self.n_samples, 60 | 'ntest': self.n_samples} 61 | 62 | 63 | class CREval(BinaryClassifierEval): 64 | def __init__(self, task_path, seed=1111): 65 | logging.debug('***** Transfer task : CR *****\n\n') 66 | pos = self.loadFile(os.path.join(task_path, 'custrev.pos')) 67 | neg = self.loadFile(os.path.join(task_path, 'custrev.neg')) 68 | super(self.__class__, self).__init__(pos, neg, seed) 69 | 70 | 71 | class MREval(BinaryClassifierEval): 72 | def __init__(self, task_path, seed=1111): 73 | logging.debug('***** Transfer task : MR *****\n\n') 74 | pos = self.loadFile(os.path.join(task_path, 'rt-polarity.pos')) 75 | neg = self.loadFile(os.path.join(task_path, 'rt-polarity.neg')) 76 | super(self.__class__, self).__init__(pos, neg, seed) 77 | 78 | 79 | class SUBJEval(BinaryClassifierEval): 80 | def __init__(self, task_path, seed=1111): 81 | logging.debug('***** Transfer task : SUBJ *****\n\n') 82 | obj = self.loadFile(os.path.join(task_path, 'subj.objective')) 83 | subj = self.loadFile(os.path.join(task_path, 'subj.subjective')) 84 | super(self.__class__, self).__init__(obj, subj, seed) 85 | 86 | 87 | class MPQAEval(BinaryClassifierEval): 88 | def __init__(self, task_path, seed=1111): 89 | logging.debug('***** Transfer task : MPQA *****\n\n') 90 | pos = self.loadFile(os.path.join(task_path, 'mpqa.pos')) 91 | neg = self.loadFile(os.path.join(task_path, 'mpqa.neg')) 92 | super(self.__class__, self).__init__(pos, neg, seed) 93 | -------------------------------------------------------------------------------- 
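A minimal sketch of how the binary transfer tasks above (MR, CR, SUBJ, MPQA) are typically driven through the SentEval engine with a sentence-encoder batcher, in the spirit of the examples/bow.py script above. The model name, data path, and classifier settings here are illustrative assumptions rather than values taken from this repository, and SentEval itself must be importable (installed via its setup.py or added to sys.path) with the downstream data fetched by download_dataset.sh.

import numpy as np
import senteval
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer('all-MiniLM-L6-v2')  # illustrative encoder choice

def prepare(params, samples):
    return  # nothing to precompute for a pretrained sentence encoder

def batcher(params, batch):
    # SentEval hands over tokenized sentences (lists of words); re-join them.
    sentences = [' '.join(sent) if sent != [] else '.' for sent in batch]
    return np.asarray(encoder.encode(sentences))

params = {'task_path': 'SentEval/data/downstream',  # assumed location of the downloaded data
          'usepytorch': True, 'kfold': 5,
          'classifier': {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
                         'tenacity': 5, 'epoch_size': 4}}
se = senteval.engine.SE(params, batcher, prepare)
print(se.eval(['MR', 'CR', 'SUBJ', 'MPQA']))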
/SentEval/senteval/sst.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | SST - binary classification 10 | ''' 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import os 15 | import io 16 | import logging 17 | import numpy as np 18 | 19 | from senteval.tools.validation import SplitClassifier 20 | 21 | 22 | class SSTEval(object): 23 | def __init__(self, task_path, nclasses=2, seed=1111): 24 | self.seed = seed 25 | 26 | # binary of fine-grained 27 | assert nclasses in [2, 5] 28 | self.nclasses = nclasses 29 | self.task_name = 'Binary' if self.nclasses == 2 else 'Fine-Grained' 30 | logging.debug('***** Transfer task : SST %s classification *****\n\n', self.task_name) 31 | 32 | train = self.loadFile(os.path.join(task_path, 'sentiment-train')) 33 | dev = self.loadFile(os.path.join(task_path, 'sentiment-dev')) 34 | test = self.loadFile(os.path.join(task_path, 'sentiment-test')) 35 | self.sst_data = {'train': train, 'dev': dev, 'test': test} 36 | 37 | def do_prepare(self, params, prepare): 38 | samples = self.sst_data['train']['X'] + self.sst_data['dev']['X'] + \ 39 | self.sst_data['test']['X'] 40 | return prepare(params, samples) 41 | 42 | def loadFile(self, fpath): 43 | sst_data = {'X': [], 'y': []} 44 | with io.open(fpath, 'r', encoding='utf-8') as f: 45 | for line in f: 46 | if self.nclasses == 2: 47 | sample = line.strip().split('\t') 48 | sst_data['y'].append(int(sample[1])) 49 | sst_data['X'].append(sample[0].split()) 50 | elif self.nclasses == 5: 51 | sample = line.strip().split(' ', 1) 52 | sst_data['y'].append(int(sample[0])) 53 | sst_data['X'].append(sample[1].split()) 54 | assert max(sst_data['y']) == self.nclasses - 1 55 | return sst_data 56 | 57 | def run(self, params, batcher): 58 | sst_embed = {'train': {}, 'dev': {}, 'test': {}} 59 | bsize = params.batch_size 60 | 61 | for key in self.sst_data: 62 | logging.info('Computing embedding for {0}'.format(key)) 63 | # Sort to reduce padding 64 | sorted_data = sorted(zip(self.sst_data[key]['X'], 65 | self.sst_data[key]['y']), 66 | key=lambda z: (len(z[0]), z[1])) 67 | self.sst_data[key]['X'], self.sst_data[key]['y'] = map(list, zip(*sorted_data)) 68 | 69 | sst_embed[key]['X'] = [] 70 | for ii in range(0, len(self.sst_data[key]['y']), bsize): 71 | batch = self.sst_data[key]['X'][ii:ii + bsize] 72 | embeddings = batcher(params, batch) 73 | sst_embed[key]['X'].append(embeddings) 74 | sst_embed[key]['X'] = np.vstack(sst_embed[key]['X']) 75 | sst_embed[key]['y'] = np.array(self.sst_data[key]['y']) 76 | logging.info('Computed {0} embeddings'.format(key)) 77 | 78 | config_classifier = {'nclasses': self.nclasses, 'seed': self.seed, 79 | 'usepytorch': params.usepytorch, 80 | 'classifier': params.classifier} 81 | 82 | clf = SplitClassifier(X={'train': sst_embed['train']['X'], 83 | 'valid': sst_embed['dev']['X'], 84 | 'test': sst_embed['test']['X']}, 85 | y={'train': sst_embed['train']['y'], 86 | 'valid': sst_embed['dev']['y'], 87 | 'test': sst_embed['test']['y']}, 88 | config=config_classifier) 89 | 90 | devacc, testacc = clf.run() 91 | logging.debug('\nDev acc : {0} Test acc : {1} for \ 92 | SST {2} classification\n'.format(devacc, testacc, self.task_name)) 93 | 94 | return {'devacc': devacc, 'acc': testacc, 95 | 'ndev': len(sst_embed['dev']['X']), 
96 | 'ntest': len(sst_embed['test']['X'])} 97 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/MSEEvaluatorFromDataFrame.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers.evaluation import SentenceEvaluator 2 | from sentence_transformers.util import batch_to_device 3 | from sentence_transformers import SentenceTransformer 4 | from typing import List, Tuple, Dict 5 | import torch 6 | import numpy as np 7 | import logging 8 | import os 9 | import csv 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class MSEEvaluatorFromDataFrame(SentenceEvaluator): 16 | """ 17 | Computes the mean squared error (x100) between the computed sentence embedding 18 | and some target sentence embedding. 19 | :param dataframe: 20 | It must have the following format. Rows contain different, parallel sentences. Columns are the respective language codes 21 | [{'en': 'My sentence', 'es': 'Sentence in Spanish', 'fr': 'Sentence in French'...}, 22 | {'en': 'My second sentence', ....] 23 | :param combinations: 24 | Must be of the format [('en', 'es'), ('en', 'fr'), ...] 25 | First entry in a tuple is the source language. The sentence in the respective language will be fetched from the dataframe and passed to the teacher model. 26 | Second entry in a tuple is the target language. The sentence will be fetched from the dataframe and passed to the student model 27 | """ 28 | def __init__(self, dataframe: List[Dict[str, str]], teacher_model: SentenceTransformer, combinations: List[Tuple[str, str]], batch_size: int = 8, name='', write_csv: bool = True): 29 | 30 | self.combinations = combinations 31 | self.name = name 32 | self.batch_size = batch_size 33 | 34 | 35 | if name: 36 | name = "_"+name 37 | 38 | self.csv_file = "mse_evaluation" + name + "_results.csv" 39 | self.csv_headers = ["epoch", "steps"] 40 | self.write_csv = write_csv 41 | self.data = {} 42 | 43 | logger.info("Compute teacher embeddings") 44 | all_source_sentences = set() 45 | for src_lang, trg_lang in self.combinations: 46 | src_sentences = [] 47 | trg_sentences = [] 48 | 49 | for row in dataframe: 50 | if row[src_lang].strip() != "" and row[trg_lang].strip() != "": 51 | all_source_sentences.add(row[src_lang]) 52 | src_sentences.append(row[src_lang]) 53 | trg_sentences.append(row[trg_lang]) 54 | 55 | self.data[(src_lang, trg_lang)] = (src_sentences, trg_sentences) 56 | self.csv_headers.append("{}-{}".format(src_lang, trg_lang)) 57 | 58 | all_source_sentences = list(all_source_sentences) 59 | all_src_embeddings = teacher_model.encode(all_source_sentences, batch_size=self.batch_size) 60 | self.teacher_embeddings = {sent: emb for sent, emb in zip(all_source_sentences, all_src_embeddings)} 61 | 62 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1): 63 | model.eval() 64 | 65 | mse_scores = [] 66 | for src_lang, trg_lang in self.combinations: 67 | src_sentences, trg_sentences = self.data[(src_lang, trg_lang)] 68 | 69 | src_embeddings = np.asarray([self.teacher_embeddings[sent] for sent in src_sentences]) 70 | trg_embeddings = np.asarray(model.encode(trg_sentences, batch_size=self.batch_size)) 71 | 72 | mse = ((src_embeddings - trg_embeddings) ** 2).mean() 73 | mse *= 100 74 | mse_scores.append(mse) 75 | 76 | logger.info("MSE evaluation on {} dataset - {}-{}:".format(self.name, src_lang, trg_lang)) 77 | logger.info("MSE (*100):\t{:4f}".format(mse)) 78 | 79 | if output_path is
not None and self.write_csv: 80 | csv_path = os.path.join(output_path, self.csv_file) 81 | output_file_exists = os.path.isfile(csv_path) 82 | with open(csv_path, newline='', mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 83 | writer = csv.writer(f) 84 | if not output_file_exists: 85 | writer.writerow(self.csv_headers) 86 | 87 | writer.writerow([epoch, steps]+mse_scores) 88 | 89 | return -np.mean(mse_scores) #Return negative score as SentenceTransformers maximizes the performance 90 | 91 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/PhraseTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | import logging 7 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 8 | import nltk 9 | 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class PhraseTokenizer(WordTokenizer): 14 | """Tokenizes the text with respect to existent phrases in the vocab. 15 | 16 | This tokenizers respects phrases that are in the vocab. Phrases are separated with 'ngram_separator', for example, 17 | in Google News word2vec file, ngrams are separated with a _ like New_York. These phrases are detected in text and merged as one special token. (New York is the ... => [New_York, is, the]) 18 | """ 19 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False, ngram_separator: str = "_", max_ngram_length: int = 5): 20 | self.stop_words = set(stop_words) 21 | self.do_lower_case = do_lower_case 22 | self.ngram_separator = ngram_separator 23 | self.max_ngram_length = max_ngram_length 24 | self.set_vocab(vocab) 25 | 26 | def get_vocab(self): 27 | return self.vocab 28 | 29 | def set_vocab(self, vocab: Iterable[str]): 30 | self.vocab = vocab 31 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 32 | 33 | # Check for ngram in vocab 34 | self.ngram_lookup = set() 35 | self.ngram_lengths = set() 36 | for word in vocab: 37 | 38 | if self.ngram_separator is not None and self.ngram_separator in word: 39 | # Sum words might me malformed in e.g. 
google news word2vec, containing two or more _ after each other 40 | ngram_count = word.count(self.ngram_separator) + 1 41 | if self.ngram_separator + self.ngram_separator not in word and ngram_count <= self.max_ngram_length: 42 | self.ngram_lookup.add(word) 43 | self.ngram_lengths.add(ngram_count) 44 | 45 | if len(vocab) > 0: 46 | logger.info("PhraseTokenizer - Phrase ngram lengths: {}".format(self.ngram_lengths)) 47 | logger.info("PhraseTokenizer - Num phrases: {}".format(len(self.ngram_lookup))) 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | tokens = nltk.word_tokenize(text, preserve_line=True) 51 | 52 | #phrase detection 53 | for ngram_len in sorted(self.ngram_lengths, reverse=True): 54 | idx = 0 55 | while idx <= len(tokens) - ngram_len: 56 | ngram = self.ngram_separator.join(tokens[idx:idx + ngram_len]) 57 | if ngram in self.ngram_lookup: 58 | tokens[idx:idx + ngram_len] = [ngram] 59 | elif ngram.lower() in self.ngram_lookup: 60 | tokens[idx:idx + ngram_len] = [ngram.lower()] 61 | idx += 1 62 | 63 | #Map tokens to idx, filter stop words 64 | tokens_filtered = [] 65 | for token in tokens: 66 | if token in self.stop_words: 67 | continue 68 | elif token in self.word2idx: 69 | tokens_filtered.append(self.word2idx[token]) 70 | continue 71 | 72 | token = token.lower() 73 | if token in self.stop_words: 74 | continue 75 | elif token in self.word2idx: 76 | tokens_filtered.append(self.word2idx[token]) 77 | continue 78 | 79 | token = token.strip(string.punctuation) 80 | if token in self.stop_words: 81 | continue 82 | elif len(token) > 0 and token in self.word2idx: 83 | tokens_filtered.append(self.word2idx[token]) 84 | continue 85 | 86 | return tokens_filtered 87 | 88 | def save(self, output_path: str): 89 | with open(os.path.join(output_path, 'phrasetokenizer_config.json'), 'w') as fOut: 90 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case, 'ngram_separator': self.ngram_separator, 'max_ngram_length': self.max_ngram_length}, fOut) 91 | 92 | @staticmethod 93 | def load(input_path: str): 94 | with open(os.path.join(input_path, 'phrasetokenizer_config.json'), 'r') as fIn: 95 | config = json.load(fIn) 96 | 97 | return PhraseTokenizer(**config) 98 | -------------------------------------------------------------------------------- /SentEval/senteval/mrpc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | MRPC : Microsoft Research Paraphrase (detection) Corpus 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import os 14 | import logging 15 | import numpy as np 16 | import io 17 | 18 | from senteval.tools.validation import KFoldClassifier 19 | 20 | from sklearn.metrics import f1_score 21 | 22 | 23 | class MRPCEval(object): 24 | def __init__(self, task_path, seed=1111): 25 | logging.info('***** Transfer task : MRPC *****\n\n') 26 | self.seed = seed 27 | train = self.loadFile(os.path.join(task_path, 28 | 'msr_paraphrase_train.txt')) 29 | test = self.loadFile(os.path.join(task_path, 30 | 'msr_paraphrase_test.txt')) 31 | self.mrpc_data = {'train': train, 'test': test} 32 | 33 | def do_prepare(self, params, prepare): 34 | # TODO : Should we separate samples in "train, test"? 
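# Note: prepare() is handed every sentence from both splits and both sides of
# each pair, so anything it caches on params (a vocabulary, word vectors, ...)
# is guaranteed to cover the sentences later passed to batcher().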
35 | samples = self.mrpc_data['train']['X_A'] + \ 36 | self.mrpc_data['train']['X_B'] + \ 37 | self.mrpc_data['test']['X_A'] + self.mrpc_data['test']['X_B'] 38 | return prepare(params, samples) 39 | 40 | def loadFile(self, fpath): 41 | mrpc_data = {'X_A': [], 'X_B': [], 'y': []} 42 | with io.open(fpath, 'r', encoding='utf-8') as f: 43 | for line in f: 44 | text = line.strip().split('\t') 45 | mrpc_data['X_A'].append(text[3].split()) 46 | mrpc_data['X_B'].append(text[4].split()) 47 | mrpc_data['y'].append(text[0]) 48 | 49 | mrpc_data['X_A'] = mrpc_data['X_A'][1:] 50 | mrpc_data['X_B'] = mrpc_data['X_B'][1:] 51 | mrpc_data['y'] = [int(s) for s in mrpc_data['y'][1:]] 52 | return mrpc_data 53 | 54 | def run(self, params, batcher): 55 | mrpc_embed = {'train': {}, 'test': {}} 56 | 57 | for key in self.mrpc_data: 58 | logging.info('Computing embedding for {0}'.format(key)) 59 | # Sort to reduce padding 60 | text_data = {} 61 | sorted_corpus = sorted(zip(self.mrpc_data[key]['X_A'], 62 | self.mrpc_data[key]['X_B'], 63 | self.mrpc_data[key]['y']), 64 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 65 | 66 | text_data['A'] = [x for (x, y, z) in sorted_corpus] 67 | text_data['B'] = [y for (x, y, z) in sorted_corpus] 68 | text_data['y'] = [z for (x, y, z) in sorted_corpus] 69 | 70 | for txt_type in ['A', 'B']: 71 | mrpc_embed[key][txt_type] = [] 72 | for ii in range(0, len(text_data['y']), params.batch_size): 73 | batch = text_data[txt_type][ii:ii + params.batch_size] 74 | embeddings = batcher(params, batch) 75 | mrpc_embed[key][txt_type].append(embeddings) 76 | mrpc_embed[key][txt_type] = np.vstack(mrpc_embed[key][txt_type]) 77 | mrpc_embed[key]['y'] = np.array(text_data['y']) 78 | logging.info('Computed {0} embeddings'.format(key)) 79 | 80 | # Train 81 | trainA = mrpc_embed['train']['A'] 82 | trainB = mrpc_embed['train']['B'] 83 | trainF = np.c_[np.abs(trainA - trainB), trainA * trainB] 84 | trainY = mrpc_embed['train']['y'] 85 | 86 | # Test 87 | testA = mrpc_embed['test']['A'] 88 | testB = mrpc_embed['test']['B'] 89 | testF = np.c_[np.abs(testA - testB), testA * testB] 90 | testY = mrpc_embed['test']['y'] 91 | 92 | config = {'nclasses': 2, 'seed': self.seed, 93 | 'usepytorch': params.usepytorch, 94 | 'classifier': params.classifier, 95 | 'nhid': params.nhid, 'kfold': params.kfold} 96 | clf = KFoldClassifier(train={'X': trainF, 'y': trainY}, 97 | test={'X': testF, 'y': testY}, config=config) 98 | 99 | devacc, testacc, yhat = clf.run() 100 | testf1 = round(100*f1_score(testY, yhat), 2) 101 | logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\n' 102 | .format(devacc, testacc, testf1)) 103 | return {'devacc': devacc, 'acc': testacc, 'f1': testf1, 104 | 'ndev': len(trainA), 'ntest': len(testA)} 105 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/TranslationEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import logging 3 | from ..util import pytorch_cos_sim 4 | import os 5 | import csv 6 | import numpy as np 7 | import scipy.spatial 8 | from typing import List 9 | import torch 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class TranslationEvaluator(SentenceEvaluator): 15 | """ 16 | Given two sets of sentences in different languages, e.g. (en_1, en_2, en_3...) and (fr_1, fr_2, fr_3, ...), 17 | and assuming that fr_i is the translation of en_i. 18 | Checks if vec(en_i) has the highest similarity to vec(fr_i). 
Computes the accurarcy in both directions 19 | """ 20 | def __init__(self, source_sentences: List[str], target_sentences: List[str], show_progress_bar: bool = False, batch_size: int = 16, name: str = '', print_wrong_matches: bool = False, write_csv: bool = True): 21 | """ 22 | Constructs an evaluator based for the dataset 23 | 24 | The labels need to indicate the similarity between the sentences. 25 | 26 | :param source_sentences: 27 | List of sentences in source language 28 | :param target_sentences: 29 | List of sentences in target language 30 | :param print_wrong_matches: 31 | Prints incorrect matches 32 | :param write_csv: 33 | Write results to CSV file 34 | """ 35 | self.source_sentences = source_sentences 36 | self.target_sentences = target_sentences 37 | self.name = name 38 | self.batch_size = batch_size 39 | self.show_progress_bar = show_progress_bar 40 | self.print_wrong_matches = print_wrong_matches 41 | 42 | assert len(self.source_sentences) == len(self.target_sentences) 43 | 44 | if name: 45 | name = "_"+name 46 | 47 | self.csv_file = "translation_evaluation"+name+"_results.csv" 48 | self.csv_headers = ["epoch", "steps", "src2trg", "trg2src"] 49 | self.write_csv = write_csv 50 | 51 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 52 | if epoch != -1: 53 | if steps == -1: 54 | out_txt = " after epoch {}:".format(epoch) 55 | else: 56 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 57 | else: 58 | out_txt = ":" 59 | 60 | logger.info("Evaluating translation matching Accuracy on "+self.name+" dataset"+out_txt) 61 | 62 | embeddings1 = torch.stack(model.encode(self.source_sentences, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_numpy=False)) 63 | embeddings2 = torch.stack(model.encode(self.target_sentences, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_numpy=False)) 64 | 65 | 66 | cos_sims = pytorch_cos_sim(embeddings1, embeddings2).detach().cpu().numpy() 67 | 68 | correct_src2trg = 0 69 | correct_trg2src = 0 70 | 71 | for i in range(len(cos_sims)): 72 | max_idx = np.argmax(cos_sims[i]) 73 | 74 | if i == max_idx: 75 | correct_src2trg += 1 76 | elif self.print_wrong_matches: 77 | print("i:", i, "j:", max_idx, "INCORRECT" if i != max_idx else "CORRECT") 78 | print("Src:", self.source_sentences[i]) 79 | print("Trg:", self.target_sentences[max_idx]) 80 | print("Argmax score:", cos_sims[i][max_idx], "vs. 
correct score:", cos_sims[i][i]) 81 | 82 | results = zip(range(len(cos_sims[i])), cos_sims[i]) 83 | results = sorted(results, key=lambda x: x[1], reverse=True) 84 | for idx, score in results[0:5]: 85 | print("\t", idx, "(Score: %.4f)" % (score), self.target_sentences[idx]) 86 | 87 | 88 | 89 | cos_sims = cos_sims.T 90 | for i in range(len(cos_sims)): 91 | max_idx = np.argmax(cos_sims[i]) 92 | if i == max_idx: 93 | correct_trg2src += 1 94 | 95 | acc_src2trg = correct_src2trg / len(cos_sims) 96 | acc_trg2src = correct_trg2src / len(cos_sims) 97 | 98 | logger.info("Accuracy src2trg: {:.2f}".format(acc_src2trg*100)) 99 | logger.info("Accuracy trg2src: {:.2f}".format(acc_trg2src*100)) 100 | 101 | if output_path is not None and self.write_csv: 102 | csv_path = os.path.join(output_path, self.csv_file) 103 | output_file_exists = os.path.isfile(csv_path) 104 | with open(csv_path, newline='', mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 105 | writer = csv.writer(f) 106 | if not output_file_exists: 107 | writer.writerow(self.csv_headers) 108 | 109 | writer.writerow([epoch, steps, acc_src2trg, acc_trg2src]) 110 | 111 | return (acc_src2trg+acc_trg2src)/2 112 | -------------------------------------------------------------------------------- /SentEval/senteval/snli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | SNLI - Entailment 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import codecs 14 | import os 15 | import io 16 | import copy 17 | import logging 18 | import numpy as np 19 | 20 | from senteval.tools.validation import SplitClassifier 21 | 22 | 23 | class SNLIEval(object): 24 | def __init__(self, taskpath, seed=1111): 25 | logging.debug('***** Transfer task : SNLI Entailment*****\n\n') 26 | self.seed = seed 27 | train1 = self.loadFile(os.path.join(taskpath, 's1.train')) 28 | train2 = self.loadFile(os.path.join(taskpath, 's2.train')) 29 | 30 | trainlabels = io.open(os.path.join(taskpath, 'labels.train'), 31 | encoding='utf-8').read().splitlines() 32 | 33 | valid1 = self.loadFile(os.path.join(taskpath, 's1.dev')) 34 | valid2 = self.loadFile(os.path.join(taskpath, 's2.dev')) 35 | validlabels = io.open(os.path.join(taskpath, 'labels.dev'), 36 | encoding='utf-8').read().splitlines() 37 | 38 | test1 = self.loadFile(os.path.join(taskpath, 's1.test')) 39 | test2 = self.loadFile(os.path.join(taskpath, 's2.test')) 40 | testlabels = io.open(os.path.join(taskpath, 'labels.test'), 41 | encoding='utf-8').read().splitlines() 42 | 43 | # sort data (by s2 first) to reduce padding 44 | sorted_train = sorted(zip(train2, train1, trainlabels), 45 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 46 | train2, train1, trainlabels = map(list, zip(*sorted_train)) 47 | 48 | sorted_valid = sorted(zip(valid2, valid1, validlabels), 49 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 50 | valid2, valid1, validlabels = map(list, zip(*sorted_valid)) 51 | 52 | sorted_test = sorted(zip(test2, test1, testlabels), 53 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 54 | test2, test1, testlabels = map(list, zip(*sorted_test)) 55 | 56 | self.samples = train1 + train2 + valid1 + valid2 + test1 + test2 57 | self.data = {'train': (train1, train2, trainlabels), 58 | 'valid': (valid1, valid2, validlabels), 59 | 'test': 
(test1, test2, testlabels) 60 | } 61 | 62 | def do_prepare(self, params, prepare): 63 | return prepare(params, self.samples) 64 | 65 | def loadFile(self, fpath): 66 | with codecs.open(fpath, 'rb', 'latin-1') as f: 67 | return [line.split() for line in 68 | f.read().splitlines()] 69 | 70 | def run(self, params, batcher): 71 | self.X, self.y = {}, {} 72 | dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2} 73 | for key in self.data: 74 | if key not in self.X: 75 | self.X[key] = [] 76 | if key not in self.y: 77 | self.y[key] = [] 78 | 79 | input1, input2, mylabels = self.data[key] 80 | enc_input = [] 81 | n_labels = len(mylabels) 82 | for ii in range(0, n_labels, params.batch_size): 83 | batch1 = input1[ii:ii + params.batch_size] 84 | batch2 = input2[ii:ii + params.batch_size] 85 | 86 | if len(batch1) == len(batch2) and len(batch1) > 0: 87 | enc1 = batcher(params, batch1) 88 | enc2 = batcher(params, batch2) 89 | enc_input.append(np.hstack((enc1, enc2, enc1 * enc2, 90 | np.abs(enc1 - enc2)))) 91 | if (ii*params.batch_size) % (20000*params.batch_size) == 0: 92 | logging.info("PROGRESS (encoding): %.2f%%" % 93 | (100 * ii / n_labels)) 94 | self.X[key] = np.vstack(enc_input) 95 | self.y[key] = [dico_label[y] for y in mylabels] 96 | 97 | config = {'nclasses': 3, 'seed': self.seed, 98 | 'usepytorch': params.usepytorch, 99 | 'cudaEfficient': True, 100 | 'nhid': params.nhid, 'noreg': True} 101 | 102 | config_classifier = copy.deepcopy(params.classifier) 103 | config_classifier['max_epoch'] = 15 104 | config_classifier['epoch_size'] = 1 105 | config['classifier'] = config_classifier 106 | 107 | clf = SplitClassifier(self.X, self.y, config) 108 | devacc, testacc = clf.run() 109 | logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n' 110 | .format(devacc, testacc)) 111 | return {'devacc': devacc, 'acc': testacc, 112 | 'ndev': len(self.data['valid'][0]), 113 | 'ntest': len(self.data['test'][0])} 114 | -------------------------------------------------------------------------------- /SentEval/senteval/rank.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | Image-Caption Retrieval with COCO dataset 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import os 14 | import sys 15 | import logging 16 | import numpy as np 17 | 18 | try: 19 | import cPickle as pickle 20 | except ImportError: 21 | import pickle 22 | 23 | from senteval.tools.ranking import ImageSentenceRankingPytorch 24 | 25 | 26 | class ImageCaptionRetrievalEval(object): 27 | def __init__(self, task_path, seed=1111): 28 | logging.debug('***** Transfer task: Image Caption Retrieval *****\n\n') 29 | 30 | # Get captions and image features 31 | self.seed = seed 32 | train, dev, test = self.loadFile(task_path) 33 | self.coco_data = {'train': train, 'dev': dev, 'test': test} 34 | 35 | def do_prepare(self, params, prepare): 36 | samples = self.coco_data['train']['sent'] + \ 37 | self.coco_data['dev']['sent'] + \ 38 | self.coco_data['test']['sent'] 39 | prepare(params, samples) 40 | 41 | def loadFile(self, fpath): 42 | coco = {} 43 | 44 | for split in ['train', 'valid', 'test']: 45 | list_sent = [] 46 | list_img_feat = [] 47 | if sys.version_info < (3, 0): 48 | with open(os.path.join(fpath, split + '.pkl')) as f: 49 | cocodata = pickle.load(f) 50 | else: 51 | with open(os.path.join(fpath, split + '.pkl'), 'rb') as f: 52 | cocodata = pickle.load(f, encoding='latin1') 53 | 54 | for imgkey in range(len(cocodata['features'])): 55 | assert len(cocodata['image_to_caption_ids'][imgkey]) >= 5, \ 56 | cocodata['image_to_caption_ids'][imgkey] 57 | for captkey in cocodata['image_to_caption_ids'][imgkey][0:5]: 58 | sent = cocodata['captions'][captkey]['cleaned_caption'] 59 | sent += ' .' # add punctuation to end of sentence in COCO 60 | list_sent.append(sent.encode('utf-8').split()) 61 | list_img_feat.append(cocodata['features'][imgkey]) 62 | assert len(list_sent) == len(list_img_feat) and \ 63 | len(list_sent) % 5 == 0 64 | list_img_feat = np.array(list_img_feat).astype('float32') 65 | coco[split] = {'sent': list_sent, 'imgfeat': list_img_feat} 66 | return coco['train'], coco['valid'], coco['test'] 67 | 68 | def run(self, params, batcher): 69 | coco_embed = {'train': {'sentfeat': [], 'imgfeat': []}, 70 | 'dev': {'sentfeat': [], 'imgfeat': []}, 71 | 'test': {'sentfeat': [], 'imgfeat': []}} 72 | 73 | for key in self.coco_data: 74 | logging.info('Computing embedding for {0}'.format(key)) 75 | # Sort to reduce padding 76 | self.coco_data[key]['sent'] = np.array(self.coco_data[key]['sent']) 77 | self.coco_data[key]['sent'], idx_sort = np.sort(self.coco_data[key]['sent']), np.argsort(self.coco_data[key]['sent']) 78 | idx_unsort = np.argsort(idx_sort) 79 | 80 | coco_embed[key]['X'] = [] 81 | nsent = len(self.coco_data[key]['sent']) 82 | for ii in range(0, nsent, params.batch_size): 83 | batch = self.coco_data[key]['sent'][ii:ii + params.batch_size] 84 | embeddings = batcher(params, batch) 85 | coco_embed[key]['sentfeat'].append(embeddings) 86 | coco_embed[key]['sentfeat'] = np.vstack(coco_embed[key]['sentfeat'])[idx_unsort] 87 | coco_embed[key]['imgfeat'] = np.array(self.coco_data[key]['imgfeat']) 88 | logging.info('Computed {0} embeddings'.format(key)) 89 | 90 | config = {'seed': self.seed, 'projdim': 1000, 'margin': 0.2} 91 | clf = ImageSentenceRankingPytorch(train=coco_embed['train'], 92 | valid=coco_embed['dev'], 93 | test=coco_embed['test'], 94 | config=config) 95 | 96 | bestdevscore, r1_i2t, r5_i2t, r10_i2t, medr_i2t, \ 97 | r1_t2i, r5_t2i, r10_t2i, medr_t2i = clf.run() 98 | 99 | logging.debug("\nTest scores | Image to text: \ 100 | {0}, 
{1}, {2}, {3}".format(r1_i2t, r5_i2t, r10_i2t, medr_i2t)) 101 | logging.debug("Test scores | Text to image: \ 102 | {0}, {1}, {2}, {3}\n".format(r1_t2i, r5_t2i, r10_t2i, medr_t2i)) 103 | 104 | return {'devacc': bestdevscore, 105 | 'acc': [(r1_i2t, r5_i2t, r10_i2t, medr_i2t), 106 | (r1_t2i, r5_t2i, r10_t2i, medr_t2i)], 107 | 'ndev': len(coco_embed['dev']['sentfeat']), 108 | 'ntest': len(coco_embed['test']['sentfeat'])} 109 | -------------------------------------------------------------------------------- /SentEval/senteval/tools/relatedness.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | """ 9 | Semantic Relatedness (supervised) with Pytorch 10 | """ 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import copy 14 | import numpy as np 15 | 16 | import torch 17 | from torch import nn 18 | import torch.optim as optim 19 | 20 | from scipy.stats import pearsonr, spearmanr 21 | 22 | 23 | class RelatednessPytorch(object): 24 | # Can be used for SICK-Relatedness, and STS14 25 | def __init__(self, train, valid, test, devscores, config): 26 | # fix seed 27 | np.random.seed(config['seed']) 28 | torch.manual_seed(config['seed']) 29 | assert torch.cuda.is_available(), 'torch.cuda required for Relatedness' 30 | torch.cuda.manual_seed(config['seed']) 31 | 32 | self.train = train 33 | self.valid = valid 34 | self.test = test 35 | self.devscores = devscores 36 | 37 | self.inputdim = train['X'].shape[1] 38 | self.nclasses = config['nclasses'] 39 | self.seed = config['seed'] 40 | self.l2reg = 0. 
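# Note: the head below is a single linear layer with a softmax over the
# nclasses gold-score bins; run() converts the predicted class probabilities
# back to a scalar score via a dot product with r = [1, ..., 5] and
# early-stops on the dev Spearman correlation.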
41 | self.batch_size = 64 42 | self.maxepoch = 1000 43 | self.early_stop = True 44 | 45 | self.model = nn.Sequential( 46 | nn.Linear(self.inputdim, self.nclasses), 47 | nn.Softmax(dim=-1), 48 | ) 49 | self.loss_fn = nn.MSELoss() 50 | 51 | if torch.cuda.is_available(): 52 | self.model = self.model.cuda() 53 | self.loss_fn = self.loss_fn.cuda() 54 | 55 | self.loss_fn.size_average = False 56 | self.optimizer = optim.Adam(self.model.parameters(), 57 | weight_decay=self.l2reg) 58 | 59 | def prepare_data(self, trainX, trainy, devX, devy, testX, testy): 60 | # Transform probs to log-probs for KL-divergence 61 | trainX = torch.from_numpy(trainX).float().cuda() 62 | trainy = torch.from_numpy(trainy).float().cuda() 63 | devX = torch.from_numpy(devX).float().cuda() 64 | devy = torch.from_numpy(devy).float().cuda() 65 | testX = torch.from_numpy(testX).float().cuda() 66 | testY = torch.from_numpy(testy).float().cuda() 67 | 68 | return trainX, trainy, devX, devy, testX, testy 69 | 70 | def run(self): 71 | self.nepoch = 0 72 | bestpr = -1 73 | early_stop_count = 0 74 | r = np.arange(1, 6) 75 | stop_train = False 76 | 77 | # Preparing data 78 | trainX, trainy, devX, devy, testX, testy = self.prepare_data( 79 | self.train['X'], self.train['y'], 80 | self.valid['X'], self.valid['y'], 81 | self.test['X'], self.test['y']) 82 | 83 | # Training 84 | while not stop_train and self.nepoch <= self.maxepoch: 85 | self.trainepoch(trainX, trainy, nepoches=50) 86 | yhat = np.dot(self.predict_proba(devX), r) 87 | pr = spearmanr(yhat, self.devscores)[0] 88 | pr = 0 if pr != pr else pr # if NaN bc std=0 89 | # early stop on Pearson 90 | if pr > bestpr: 91 | bestpr = pr 92 | bestmodel = copy.deepcopy(self.model) 93 | elif self.early_stop: 94 | if early_stop_count >= 3: 95 | stop_train = True 96 | early_stop_count += 1 97 | self.model = bestmodel 98 | 99 | yhat = np.dot(self.predict_proba(testX), r) 100 | 101 | return bestpr, yhat 102 | 103 | def trainepoch(self, X, y, nepoches=1): 104 | self.model.train() 105 | for _ in range(self.nepoch, self.nepoch + nepoches): 106 | permutation = np.random.permutation(len(X)) 107 | all_costs = [] 108 | for i in range(0, len(X), self.batch_size): 109 | # forward 110 | idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().cuda() 111 | Xbatch = X[idx] 112 | ybatch = y[idx] 113 | output = self.model(Xbatch) 114 | # loss 115 | loss = self.loss_fn(output, ybatch) 116 | all_costs.append(loss.item()) 117 | # backward 118 | self.optimizer.zero_grad() 119 | loss.backward() 120 | # Update parameters 121 | self.optimizer.step() 122 | self.nepoch += nepoches 123 | 124 | def predict_proba(self, devX): 125 | self.model.eval() 126 | probas = [] 127 | with torch.no_grad(): 128 | for i in range(0, len(devX), self.batch_size): 129 | Xbatch = devX[i:i + self.batch_size] 130 | if len(probas) == 0: 131 | probas = self.model(Xbatch).data.cpu().numpy() 132 | else: 133 | probas = np.concatenate((probas, self.model(Xbatch).data.cpu().numpy()), axis=0) 134 | return probas 135 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/Asym.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from typing import List, Dict 4 | import os 5 | import json 6 | from ..util import import_from_string 7 | from collections import OrderedDict 8 | from typing import List, Dict, Optional, Union, Tuple 9 | 10 | class Asym(nn.Sequential): 11 | def __init__(self, 
sub_modules: Dict[str, List[nn.Module]], allow_empty_key: bool = True): 12 | """ 13 | This model allows you to create asymmetric SentenceTransformer models that apply different models depending on the specified input key. 14 | 15 | In the below example, we create two different Dense models for 'query' and 'doc'. Text that is passed as {'query': 'My query'} will 16 | be passed through the first Dense model, and text that is passed as {'doc': 'My document'} will use the other Dense model. 17 | 18 | Note that when you call encode(), only inputs of the same type can be encoded. Mixed types cannot be encoded in the same batch. 19 | 20 | Example:: 21 | word_embedding_model = models.Transformer(model_name) 22 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) 23 | asym_model = models.Asym({'query': [models.Dense(word_embedding_model.get_word_embedding_dimension(), 128)], 'doc': [models.Dense(word_embedding_model.get_word_embedding_dimension(), 128)]}) 24 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model, asym_model]) 25 | 26 | model.encode([{'query': 'Q1'}, {'query': 'Q2'}]) 27 | model.encode([{'doc': 'Doc1'}, {'doc': 'Doc2'}]) 28 | 29 | # You can train it with InputExample like this. Note that the order must always be the same: 30 | train_example = InputExample(texts=[{'query': 'Train query', 'doc': 'Doc query'}], label=1) 31 | 32 | 33 | :param sub_modules: Dict in the format str -> List[models]. The models in the specified list will be applied for input marked with the respective key. 34 | :param allow_empty_key: If true, inputs without a key can be processed. If false, an exception will be thrown if no key is specified. 35 | """ 36 | self.sub_modules = sub_modules 37 | self.allow_empty_key = allow_empty_key 38 | 39 | ordered_dict = OrderedDict() 40 | for name, models in sub_modules.items(): 41 | if not isinstance(models, List): 42 | models = [models] 43 | 44 | for idx, model in enumerate(models): 45 | ordered_dict[name+"-"+str(idx)] = model 46 | super(Asym, self).__init__(ordered_dict) 47 | 48 | 49 | def forward(self, features: Dict[str, Tensor]): 50 | if 'text_keys' in features and len(features['text_keys']) > 0: 51 | text_key = features['text_keys'][0] 52 | for model in self.sub_modules[text_key]: 53 | features = model(features) 54 | elif not self.allow_empty_key: 55 | raise ValueError('Input did not specify any keys and allow_empty_key is False') 56 | 57 | return features 58 | 59 | def get_sentence_embedding_dimension(self) -> int: 60 | raise NotImplementedError() 61 | 62 | def save(self, output_path): 63 | model_lookup = {} 64 | model_types = {} 65 | model_structure = {} 66 | 67 | for name, models in self.sub_modules.items(): 68 | model_structure[name] = [] 69 | for model in models: 70 | model_id = str(id(model))+'_'+type(model).__name__ 71 | model_lookup[model_id] = model 72 | model_types[model_id] = type(model).__module__ 73 | model_structure[name].append(model_id) 74 | 75 | for model_id, model in model_lookup.items(): 76 | model_path = os.path.join(output_path, str(model_id)) 77 | os.makedirs(model_path, exist_ok=True) 78 | model.save(model_path) 79 | 80 | with open(os.path.join(output_path, 'config.json'), 'w', encoding='utf8') as fOut: 81 | json.dump({'types': model_types, 'structure': model_structure, 82 | 'parameters': {'allow_empty_key': self.allow_empty_key}}, 83 | fOut, indent=2) 84 | 85 | def tokenize(self, texts: Union[List[str], List[Tuple[str, str]]]): 86 | """ 87 | Tokenizes a text and maps tokens to token-ids 88 | """ 89 |
if not isinstance(texts[0], dict): 90 | raise AttributeError("Asym. model requires that texts are passed as dicts: {'key': 'text'}") 91 | 92 | 93 | module_key = None 94 | 95 | for lookup in texts: 96 | text_key, text = next(iter(lookup.items())) 97 | if module_key is None: 98 | module_key = text_key 99 | 100 | assert text_key == module_key #Mixed batches are not allowed 101 | return self.sub_modules[module_key][0].tokenize(texts) 102 | 103 | 104 | @staticmethod 105 | def load(input_path): 106 | with open(os.path.join(input_path, 'config.json')) as fIn: 107 | config = json.load(fIn) 108 | 109 | modules = {} 110 | for model_id, model_type in config['types'].items(): 111 | module_class = import_from_string(model_type) 112 | module = module_class.load(os.path.join(input_path, model_id)) 113 | modules[model_id] = module 114 | 115 | model_structure = {} 116 | for key_name, models_list in config['structure'].items(): 117 | model_structure[key_name] = [] 118 | for model_id in models_list: 119 | model_structure[key_name].append(modules[model_id]) 120 | 121 | model = Asym(model_structure, **config['parameters']) 122 | return model -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/RerankingEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import logging 3 | import numpy as np 4 | import os 5 | import csv 6 | from ..util import cos_sim, dot_score 7 | import torch 8 | from sklearn.metrics import average_precision_score 9 | import tqdm 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class RerankingEvaluator(SentenceEvaluator): 14 | """ 15 | This class evaluates a SentenceTransformer model for the task of re-ranking. 16 | 17 | Given a query and a list of documents, it computes the score [query, doc_i] for all possible 18 | documents and sorts them in decreasing order. Then, MRR@10 and MAP is compute to measure the quality of the ranking. 19 | 20 | :param samples: Must be a list and each element is of the form: {'query': '', 'positive': [], 'negative': []}. Query is the search query, 21 | positive is a list of positive (relevant) documents, negative is a list of negative (irrelevant) documents. 
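An illustrative (hypothetical) sample: {'query': 'capital of France', 'positive': ['Paris is the capital of France.'], 'negative': ['France borders Spain.']}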
22 | """ 23 | def __init__(self, samples, mrr_at_k: int = 10, name: str = '', write_csv: bool = True, similarity_fct=cos_sim, batch_size: int = 64, show_progress_bar: bool = False): 24 | self.samples = samples 25 | self.name = name 26 | self.mrr_at_k = mrr_at_k 27 | self.similarity_fct = similarity_fct 28 | self.batch_size = batch_size 29 | self.show_progress_bar = show_progress_bar 30 | 31 | if isinstance(self.samples, dict): 32 | self.samples = list(self.samples.values()) 33 | 34 | ### Remove sample with empty positive / negative set 35 | self.samples = [sample for sample in self.samples if len(sample['positive']) > 0 and len(sample['negative']) > 0] 36 | 37 | 38 | self.csv_file = "RerankingEvaluator" + ("_" + name if name else '') + "_results.csv" 39 | self.csv_headers = ["epoch", "steps", "MAP", "MRR@{}".format(mrr_at_k)] 40 | self.write_csv = write_csv 41 | 42 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 43 | if epoch != -1: 44 | if steps == -1: 45 | out_txt = " after epoch {}:".format(epoch) 46 | else: 47 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 48 | else: 49 | out_txt = ":" 50 | 51 | logger.info("RerankingEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 52 | 53 | 54 | scores = self.compute_metrices(model) 55 | mean_ap = scores['map'] 56 | mean_mrr = scores['mrr'] 57 | 58 | #### Some stats about the dataset 59 | num_positives = [len(sample['positive']) for sample in self.samples] 60 | num_negatives = [len(sample['negative']) for sample in self.samples] 61 | 62 | logger.info("Queries: {} \t Positives: Min {:.1f}, Mean {:.1f}, Max {:.1f} \t Negatives: Min {:.1f}, Mean {:.1f}, Max {:.1f}".format(len(self.samples), np.min(num_positives), np.mean(num_positives), 63 | np.max(num_positives), np.min(num_negatives), 64 | np.mean(num_negatives), np.max(num_negatives))) 65 | logger.info("MAP: {:.2f}".format(mean_ap * 100)) 66 | logger.info("MRR@{}: {:.2f}".format(self.mrr_at_k, mean_mrr * 100)) 67 | 68 | #### Write results to disc 69 | if output_path is not None and self.write_csv: 70 | csv_path = os.path.join(output_path, self.csv_file) 71 | output_file_exists = os.path.isfile(csv_path) 72 | with open(csv_path, newline='', mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 73 | writer = csv.writer(f) 74 | if not output_file_exists: 75 | writer.writerow(self.csv_headers) 76 | 77 | writer.writerow([epoch, steps, mean_ap, mean_mrr]) 78 | 79 | return mean_ap 80 | 81 | def compute_metrices(self, model): 82 | all_mrr_scores = [] 83 | all_ap_scores = [] 84 | 85 | 86 | for instance in tqdm.tqdm(self.samples, disable=not self.show_progress_bar, desc="Samples"): 87 | query = instance['query'] 88 | positive = list(instance['positive']) 89 | negative = list(instance['negative']) 90 | 91 | if len(positive) == 0 or len(negative) == 0: 92 | continue 93 | 94 | docs = positive + negative 95 | is_relevant = [True]*len(positive) + [False]*len(negative) 96 | 97 | query_emb = model.encode([query], convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=False) 98 | docs_emb = model.encode(docs, convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=False) 99 | 100 | pred_scores = self.similarity_fct(query_emb, docs_emb) 101 | if len(pred_scores.shape) > 1: 102 | pred_scores = pred_scores[0] 103 | 104 | pred_scores_argsort = torch.argsort(-pred_scores) #Sort in decreasing order 105 | 106 | #Compute MRR score 107 | mrr_score = 0 108 | for rank, index in 
enumerate(pred_scores_argsort[0:self.mrr_at_k]): 109 | if is_relevant[index]: 110 | mrr_score = 1 / (rank+1) 111 | break 112 | all_mrr_scores.append(mrr_score) 113 | 114 | # Compute AP 115 | all_ap_scores.append(average_precision_score(is_relevant, pred_scores.cpu().tolist())) 116 | 117 | mean_ap = np.mean(all_ap_scores) 118 | mean_mrr = np.mean(all_mrr_scores) 119 | 120 | return {'map': mean_ap, 'mrr': mean_mrr} 121 | 122 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/Pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class Pooling(nn.Module): 10 | """Performs pooling (max or mean) on the token embeddings. 11 | 12 | Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows to use the CLS token if it is returned by the underlying word embedding model. 13 | You can concatenate multiple poolings together. 14 | 15 | :param word_embedding_dimension: Dimensions for the word embeddings 16 | :param pooling_mode: Can be a string: mean/max/cls. If set, overwrites the other pooling_mode_* settings 17 | :param pooling_mode_cls_token: Use the first token (CLS token) as text representations 18 | :param pooling_mode_max_tokens: Use max in each dimension over all tokens. 19 | :param pooling_mode_mean_tokens: Perform mean-pooling 20 | :param pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but devide by sqrt(input_length). 21 | """ 22 | def __init__(self, 23 | word_embedding_dimension: int, 24 | pooling_mode: str = None, 25 | pooling_mode_cls_token: bool = False, 26 | pooling_mode_max_tokens: bool = False, 27 | pooling_mode_mean_tokens: bool = True, 28 | pooling_mode_mean_sqrt_len_tokens: bool = False, 29 | ): 30 | super(Pooling, self).__init__() 31 | 32 | self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens', 'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens'] 33 | 34 | if pooling_mode is not None: #Set pooling mode by string 35 | pooling_mode = pooling_mode.lower() 36 | assert pooling_mode in ['mean', 'max', 'cls'] 37 | pooling_mode_cls_token = (pooling_mode == 'cls') 38 | pooling_mode_max_tokens = (pooling_mode == 'max') 39 | pooling_mode_mean_tokens = (pooling_mode == 'mean') 40 | 41 | self.word_embedding_dimension = word_embedding_dimension 42 | self.pooling_mode_cls_token = pooling_mode_cls_token 43 | self.pooling_mode_mean_tokens = pooling_mode_mean_tokens 44 | self.pooling_mode_max_tokens = pooling_mode_max_tokens 45 | self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens 46 | 47 | pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens, pooling_mode_mean_sqrt_len_tokens]) 48 | self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension) 49 | 50 | 51 | def __repr__(self): 52 | return "Pooling({})".format(self.get_config_dict()) 53 | 54 | def get_pooling_mode_str(self) -> str: 55 | """ 56 | Returns the pooling mode as string 57 | """ 58 | modes = [] 59 | if self.pooling_mode_cls_token: 60 | modes.append('cls') 61 | if self.pooling_mode_mean_tokens: 62 | modes.append('mean') 63 | if self.pooling_mode_max_tokens: 64 | modes.append('max') 65 | if self.pooling_mode_mean_sqrt_len_tokens: 66 | 
modes.append('mean_sqrt_len_tokens') 67 | 68 | return "+".join(modes) 69 | 70 | def forward(self, features: Dict[str, Tensor]): 71 | token_embeddings = features['token_embeddings'] 72 | attention_mask = features['attention_mask'] 73 | 74 | ## Pooling strategy 75 | output_vectors = [] 76 | if self.pooling_mode_cls_token: 77 | cls_token = features.get('cls_token_embeddings', token_embeddings[:, 0]) # Take first token by default 78 | output_vectors.append(cls_token) 79 | if self.pooling_mode_max_tokens: 80 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 81 | token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value 82 | max_over_time = torch.max(token_embeddings, 1)[0] 83 | output_vectors.append(max_over_time) 84 | if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens: 85 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 86 | sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) 87 | 88 | #If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present 89 | if 'token_weights_sum' in features: 90 | sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size()) 91 | else: 92 | sum_mask = input_mask_expanded.sum(1) 93 | 94 | sum_mask = torch.clamp(sum_mask, min=1e-9) 95 | 96 | if self.pooling_mode_mean_tokens: 97 | output_vectors.append(sum_embeddings / sum_mask) 98 | if self.pooling_mode_mean_sqrt_len_tokens: 99 | output_vectors.append(sum_embeddings / torch.sqrt(sum_mask)) 100 | 101 | output_vector = torch.cat(output_vectors, 1) 102 | features.update({'sentence_embedding': output_vector}) 103 | return features 104 | 105 | def get_sentence_embedding_dimension(self): 106 | return self.pooling_output_dimension 107 | 108 | def get_config_dict(self): 109 | return {key: self.__dict__[key] for key in self.config_keys} 110 | 111 | def save(self, output_path): 112 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 113 | json.dump(self.get_config_dict(), fOut, indent=2) 114 | 115 | @staticmethod 116 | def load(input_path): 117 | with open(os.path.join(input_path, 'config.json')) as fIn: 118 | config = json.load(fIn) 119 | 120 | return Pooling(**config) 121 | -------------------------------------------------------------------------------- /sentence_transformers_congen/model_card_templates.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .util import fullname 4 | 5 | class ModelCardTemplate: 6 | __TAGS__ = ["sentence-transformers", "feature-extraction", "sentence-similarity"] 7 | __DEFAULT_VARS__ = { 8 | "{PIPELINE_TAG}": "sentence-similarity", 9 | "{MODEL_DESCRIPTION}": "", 10 | "{TRAINING_SECTION}": "", 11 | "{USAGE_TRANSFORMERS_SECTION}": "", 12 | "{EVALUATION}": "", 13 | "{CITING}": "" 14 | } 15 | 16 | __MODEL_CARD__ = """ 17 | --- 18 | pipeline_tag: {PIPELINE_TAG} 19 | tags: 20 | {TAGS} 21 | --- 22 | 23 | # {MODEL_NAME} 24 | 25 | This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a {NUM_DIMENSIONS} dimensional dense vector space and can be used for tasks like clustering or semantic search. 
26 | 27 | {MODEL_DESCRIPTION} 28 | 29 | ## Usage (Sentence-Transformers) 30 | 31 | Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed: 32 | 33 | ``` 34 | pip install -U sentence-transformers 35 | ``` 36 | 37 | Then you can use the model like this: 38 | 39 | ```python 40 | from sentence_transformers import SentenceTransformer 41 | sentences = ["This is an example sentence", "Each sentence is converted"] 42 | 43 | model = SentenceTransformer('{MODEL_NAME}') 44 | embeddings = model.encode(sentences) 45 | print(embeddings) 46 | ``` 47 | 48 | {USAGE_TRANSFORMERS_SECTION} 49 | 50 | ## Evaluation Results 51 | 52 | {EVALUATION} 53 | 54 | For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME}) 55 | 56 | {TRAINING_SECTION} 57 | 58 | ## Full Model Architecture 59 | ``` 60 | {FULL_MODEL_STR} 61 | ``` 62 | 63 | ## Citing & Authors 64 | 65 | {CITING} 66 | 67 | """ 68 | 69 | 70 | 71 | __TRAINING_SECTION__ = """ 72 | ## Training 73 | The model was trained with the parameters: 74 | 75 | {LOSS_FUNCTIONS} 76 | 77 | Parameters of the fit()-Method: 78 | ``` 79 | {FIT_PARAMETERS} 80 | ``` 81 | """ 82 | 83 | 84 | __USAGE_TRANSFORMERS__ = """\n 85 | ## Usage (HuggingFace Transformers) 86 | Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings. 87 | 88 | ```python 89 | from transformers import AutoTokenizer, AutoModel 90 | import torch 91 | 92 | {POOLING_FUNCTION} 93 | 94 | # Sentences we want sentence embeddings for 95 | sentences = ['This is an example sentence', 'Each sentence is converted'] 96 | 97 | # Load model from HuggingFace Hub 98 | tokenizer = AutoTokenizer.from_pretrained('{MODEL_NAME}') 99 | model = AutoModel.from_pretrained('{MODEL_NAME}') 100 | 101 | # Tokenize sentences 102 | encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt') 103 | 104 | # Compute token embeddings 105 | with torch.no_grad(): 106 | model_output = model(**encoded_input) 107 | 108 | # Perform pooling. In this case, {POOLING_MODE} pooling. 109 | sentence_embeddings = {POOLING_FUNCTION_NAME}(model_output, encoded_input['attention_mask']) 110 | 111 | print("Sentence embeddings:") 112 | print(sentence_embeddings) 113 | ``` 114 | 115 | """ 116 | 117 | 118 | 119 | @staticmethod 120 | def model_card_get_pooling_function(pooling_mode): 121 | if pooling_mode == 'max': 122 | return "max_pooling", """ 123 | # Max Pooling - Take the max value over time for every dimension. 
124 | def max_pooling(model_output, attention_mask): 125 | token_embeddings = model_output[0] #First element of model_output contains all token embeddings 126 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 127 | token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value 128 | return torch.max(token_embeddings, 1)[0] 129 | """ 130 | elif pooling_mode == 'mean': 131 | return "mean_pooling", """ 132 | #Mean Pooling - Take attention mask into account for correct averaging 133 | def mean_pooling(model_output, attention_mask): 134 | token_embeddings = model_output[0] #First element of model_output contains all token embeddings 135 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 136 | return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) 137 | """ 138 | 139 | elif pooling_mode == 'cls': 140 | return "cls_pooling", """ 141 | def cls_pooling(model_output, attention_mask): 142 | return model_output[0][:,0] 143 | """ 144 | 145 | @staticmethod 146 | def get_train_objective_info(dataloader, loss): 147 | try: 148 | if hasattr(dataloader, 'get_config_dict'): 149 | train_loader = dataloader.get_config_dict() 150 | else: 151 | loader_params = {} 152 | loader_params['batch_size'] = dataloader.batch_size if hasattr(dataloader, 'batch_size') else 'unknown' 153 | if hasattr(dataloader, 'sampler'): 154 | loader_params['sampler'] = fullname(dataloader.sampler) 155 | if hasattr(dataloader, 'batch_sampler'): 156 | loader_params['batch_sampler'] = fullname(dataloader.batch_sampler) 157 | 158 | dataloader_str = """**DataLoader**:\n\n`{}` of length {} with parameters: 159 | ``` 160 | {} 161 | ```""".format(fullname(dataloader), len(dataloader), loader_params) 162 | 163 | loss_str = "**Loss**:\n\n`{}` {}".format(fullname(loss), 164 | """with parameters: 165 | ``` 166 | {} 167 | ```""".format(loss.get_config_dict()) if hasattr(loss, 'get_config_dict') else "") 168 | 169 | return [dataloader_str, loss_str] 170 | 171 | except Exception as e: 172 | logging.WARN("Exception when creating get_train_objective_info: {}".format(str(e))) 173 | return "" -------------------------------------------------------------------------------- /sentence_transformers_congen/models/WordEmbeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | class WordEmbeddings(nn.Module): 17 | def __init__(self, tokenizer: WordTokenizer, embedding_weights, update_embeddings: bool = False, max_seq_length: int = 1000000): 18 | nn.Module.__init__(self) 19 | if isinstance(embedding_weights, list): 20 | embedding_weights = np.asarray(embedding_weights) 21 | 22 | if isinstance(embedding_weights, np.ndarray): 23 | embedding_weights = torch.from_numpy(embedding_weights) 24 | 25 | num_embeddings, embeddings_dimension = embedding_weights.size() 26 | self.embeddings_dimension = embeddings_dimension 27 | self.emb_layer = nn.Embedding(num_embeddings, embeddings_dimension) 28 | self.emb_layer.load_state_dict({'weight': embedding_weights}) 29 | 
self.emb_layer.weight.requires_grad = update_embeddings 30 | self.tokenizer = tokenizer 31 | self.update_embeddings = update_embeddings 32 | self.max_seq_length = max_seq_length 33 | 34 | def forward(self, features): 35 | token_embeddings = self.emb_layer(features['input_ids']) 36 | cls_tokens = None 37 | features.update({'token_embeddings': token_embeddings, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 38 | return features 39 | 40 | def tokenize(self, texts: List[str]): 41 | tokenized_texts = [self.tokenizer.tokenize(text) for text in texts] 42 | sentence_lengths = [len(tokens) for tokens in tokenized_texts] 43 | max_len = max(sentence_lengths) 44 | 45 | input_ids = [] 46 | attention_masks = [] 47 | for tokens in tokenized_texts: 48 | padding = [0] * (max_len - len(tokens)) 49 | input_ids.append(tokens + padding) 50 | attention_masks.append([1]*len(tokens) + padding) 51 | 52 | output = {'input_ids': torch.tensor(input_ids, dtype=torch.long), 53 | 'attention_mask': torch.tensor(attention_masks, dtype=torch.long), 54 | 'sentence_lengths': torch.tensor(sentence_lengths, dtype=torch.long)} 55 | 56 | return output 57 | 58 | 59 | 60 | def get_word_embedding_dimension(self) -> int: 61 | return self.embeddings_dimension 62 | 63 | def save(self, output_path: str): 64 | with open(os.path.join(output_path, 'wordembedding_config.json'), 'w') as fOut: 65 | json.dump(self.get_config_dict(), fOut, indent=2) 66 | 67 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 68 | self.tokenizer.save(output_path) 69 | 70 | def get_config_dict(self): 71 | return {'tokenizer_class': fullname(self.tokenizer), 'update_embeddings': self.update_embeddings, 'max_seq_length': self.max_seq_length} 72 | 73 | @staticmethod 74 | def load(input_path: str): 75 | with open(os.path.join(input_path, 'wordembedding_config.json'), 'r') as fIn: 76 | config = json.load(fIn) 77 | 78 | tokenizer_class = import_from_string(config['tokenizer_class']) 79 | tokenizer = tokenizer_class.load(input_path) 80 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu')) 81 | embedding_weights = weights['emb_layer.weight'] 82 | model = WordEmbeddings(tokenizer=tokenizer, embedding_weights=embedding_weights, update_embeddings=config['update_embeddings']) 83 | return model 84 | 85 | @staticmethod 86 | def from_text_file(embeddings_file_path: str, update_embeddings: bool = False, item_separator: str = " ", tokenizer=WhitespaceTokenizer(), max_vocab_size: int = None): 87 | logger.info("Read in embeddings file {}".format(embeddings_file_path)) 88 | 89 | if not os.path.exists(embeddings_file_path): 90 | logger.info("{} does not exist, try to download from server".format(embeddings_file_path)) 91 | 92 | if '/' in embeddings_file_path or '\\' in embeddings_file_path: 93 | raise ValueError("Embeddings file not found: ".format(embeddings_file_path)) 94 | 95 | url = "https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/"+embeddings_file_path 96 | http_get(url, embeddings_file_path) 97 | 98 | embeddings_dimension = None 99 | vocab = [] 100 | embeddings = [] 101 | 102 | with gzip.open(embeddings_file_path, "rt", encoding="utf8") if embeddings_file_path.endswith('.gz') else open(embeddings_file_path, encoding="utf8") as fIn: 103 | iterator = tqdm(fIn, desc="Load Word Embeddings", unit="Embeddings") 104 | for line in iterator: 105 | split = line.rstrip().split(item_separator) 106 | word = split[0] 107 | 108 | if embeddings_dimension == None: 
109 | embeddings_dimension = len(split) - 1 110 | vocab.append("PADDING_TOKEN") 111 | embeddings.append(np.zeros(embeddings_dimension)) 112 | 113 | if (len(split) - 1) != embeddings_dimension: # Assure that all lines in the embeddings file are of the same length 114 | logger.error("ERROR: A line in the embeddings file had more or less dimensions than expected. Skip token.") 115 | continue 116 | 117 | vector = np.array([float(num) for num in split[1:]]) 118 | embeddings.append(vector) 119 | vocab.append(word) 120 | 121 | if max_vocab_size is not None and max_vocab_size > 0 and len(vocab) > max_vocab_size: 122 | break 123 | 124 | embeddings = np.asarray(embeddings) 125 | 126 | tokenizer.set_vocab(vocab) 127 | return WordEmbeddings(tokenizer=tokenizer, embedding_weights=embeddings, update_embeddings=update_embeddings) 128 | 129 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/Transformer.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import AutoModel, AutoTokenizer, AutoConfig 3 | import json 4 | from typing import List, Dict, Optional, Union, Tuple 5 | import os 6 | 7 | 8 | class Transformer(nn.Module): 9 | """Huggingface AutoModel to generate token embeddings. 10 | Loads the correct class, e.g. BERT / RoBERTa etc. 11 | 12 | :param model_name_or_path: Huggingface models name (https://huggingface.co/models) 13 | :param max_seq_length: Truncate any inputs longer than max_seq_length 14 | :param model_args: Arguments (key, value pairs) passed to the Huggingface Transformers model 15 | :param cache_dir: Cache dir for Huggingface Transformers to store/load models 16 | :param tokenizer_args: Arguments (key, value pairs) passed to the Huggingface Tokenizer model 17 | :param do_lower_case: If true, lowercases the input (independent if the model is cased or not) 18 | :param tokenizer_name_or_path: Name or path of the tokenizer. When None, then model_name_or_path is used 19 | """ 20 | def __init__(self, model_name_or_path: str, max_seq_length: Optional[int] = None, 21 | model_args: Dict = {}, cache_dir: Optional[str] = None, 22 | tokenizer_args: Dict = {}, do_lower_case: bool = False, 23 | tokenizer_name_or_path : str = None): 24 | super(Transformer, self).__init__() 25 | self.config_keys = ['max_seq_length', 'do_lower_case'] 26 | self.do_lower_case = do_lower_case 27 | 28 | config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir) 29 | self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir) 30 | self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path, cache_dir=cache_dir, **tokenizer_args) 31 | 32 | #No max_seq_length set. 
Try to infer from model 33 | if max_seq_length is None: 34 | if hasattr(self.auto_model, "config") and hasattr(self.auto_model.config, "max_position_embeddings") and hasattr(self.tokenizer, "model_max_length"): 35 | max_seq_length = min(self.auto_model.config.max_position_embeddings, self.tokenizer.model_max_length) 36 | 37 | self.max_seq_length = max_seq_length 38 | 39 | if tokenizer_name_or_path is not None: 40 | self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__ 41 | 42 | def __repr__(self): 43 | return "Transformer({}) with Transformer model: {} ".format(self.get_config_dict(), self.auto_model.__class__.__name__) 44 | 45 | def forward(self, features): 46 | """Returns token_embeddings, cls_token""" 47 | trans_features = {'input_ids': features['input_ids'], 'attention_mask': features['attention_mask']} 48 | if 'token_type_ids' in features: 49 | trans_features['token_type_ids'] = features['token_type_ids'] 50 | 51 | output_states = self.auto_model(**trans_features, return_dict=False) 52 | output_tokens = output_states[0] 53 | 54 | features.update({'token_embeddings': output_tokens, 'attention_mask': features['attention_mask']}) 55 | 56 | if self.auto_model.config.output_hidden_states: 57 | all_layer_idx = 2 58 | if len(output_states) < 3: #Some models only output last_hidden_states and all_hidden_states 59 | all_layer_idx = 1 60 | 61 | hidden_states = output_states[all_layer_idx] 62 | features.update({'all_layer_embeddings': hidden_states}) 63 | 64 | return features 65 | 66 | def get_word_embedding_dimension(self) -> int: 67 | return self.auto_model.config.hidden_size 68 | 69 | def tokenize(self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]]): 70 | """ 71 | Tokenizes a text and maps tokens to token-ids 72 | """ 73 | output = {} 74 | if isinstance(texts[0], str): 75 | to_tokenize = [texts] 76 | elif isinstance(texts[0], dict): 77 | to_tokenize = [] 78 | output['text_keys'] = [] 79 | for lookup in texts: 80 | text_key, text = next(iter(lookup.items())) 81 | to_tokenize.append(text) 82 | output['text_keys'].append(text_key) 83 | to_tokenize = [to_tokenize] 84 | else: 85 | batch1, batch2 = [], [] 86 | for text_tuple in texts: 87 | batch1.append(text_tuple[0]) 88 | batch2.append(text_tuple[1]) 89 | to_tokenize = [batch1, batch2] 90 | 91 | #strip 92 | to_tokenize = [[str(s).strip() for s in col] for col in to_tokenize] 93 | 94 | #Lowercase 95 | if self.do_lower_case: 96 | to_tokenize = [[s.lower() for s in col] for col in to_tokenize] 97 | 98 | 99 | output.update(self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt", max_length=self.max_seq_length)) 100 | return output 101 | 102 | 103 | def get_config_dict(self): 104 | return {key: self.__dict__[key] for key in self.config_keys} 105 | 106 | def save(self, output_path: str): 107 | self.auto_model.save_pretrained(output_path) 108 | self.tokenizer.save_pretrained(output_path) 109 | 110 | with open(os.path.join(output_path, 'sentence_bert_config.json'), 'w') as fOut: 111 | json.dump(self.get_config_dict(), fOut, indent=2) 112 | 113 | @staticmethod 114 | def load(input_path: str): 115 | #Old classes used other config names than 'sentence_bert_config.json' 116 | for config_name in ['sentence_bert_config.json', 'sentence_roberta_config.json', 'sentence_distilbert_config.json', 'sentence_camembert_config.json', 'sentence_albert_config.json', 'sentence_xlm-roberta_config.json', 'sentence_xlnet_config.json']: 117 | sbert_config_path = os.path.join(input_path, config_name) 118 | if 
os.path.exists(sbert_config_path): 119 | break 120 | 121 | with open(sbert_config_path) as fIn: 122 | config = json.load(fIn) 123 | return Transformer(model_name_or_path=input_path, **config) 124 | 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | # This code is adapted from https://github.com/princeton-nlp/SimCSE/blob/main/evaluation.py 2 | 3 | import sys 4 | import io, os 5 | import numpy as np 6 | import logging 7 | import argparse 8 | from prettytable import PrettyTable 9 | import torch 10 | import transformers 11 | from sentence_transformers import SentenceTransformer 12 | 13 | # Set up logger 14 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 15 | 16 | # Set PATHs 17 | PATH_TO_SENTEVAL = './SentEval' 18 | PATH_TO_DATA = './SentEval/data' 19 | 20 | # Import SentEval 21 | sys.path.insert(0, PATH_TO_SENTEVAL) 22 | import senteval 23 | 24 | def print_table(task_names, scores): 25 | tb = PrettyTable() 26 | tb.field_names = task_names 27 | tb.add_row(scores) 28 | print(tb) 29 | 30 | def prepare(params, samples): 31 | return 32 | 33 | def batcher(params, batch): 34 | batch = [' '.join(sent) if sent != [] else '.' for sent in batch] 35 | embeddings = params['encoder'](batch, show_progress_bar=False) 36 | return embeddings 37 | 38 | def main(): 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("--model_name_or_path", type=str, 41 | help="Transformers' model name or path") 42 | parser.add_argument("--mode", type=str, 43 | choices=['dev', 'test', 'fasttest'], 44 | default='test', 45 | help="Evaluation mode (dev: fast mode, dev results; test: full mode, test results; fasttest: fast mode, test results)") 46 | parser.add_argument("--task_set", type=str, 47 | choices=['sts', 'transfer', 'full', 'na'], 48 | default='sts', 49 | help="What set of tasks to evaluate on. If not 'na', this will override '--tasks'") 50 | parser.add_argument("--tasks", type=str, nargs='+', 51 | default=['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 52 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC', 53 | 'SICKRelatedness', 'STSBenchmark'], 54 | help="Tasks to evaluate on.
If '--task_set' is specified, this will be overridden") 55 | 56 | args = parser.parse_args() 57 | 58 | # Load sentence transformers' model checkpoint 59 | model = SentenceTransformer(args.model_name_or_path) 60 | 61 | # Set up the tasks 62 | if args.task_set == 'sts': 63 | args.tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness'] 64 | elif args.task_set == 'transfer': 65 | args.tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC'] 66 | elif args.task_set == 'full': 67 | args.tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness'] 68 | args.tasks += ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC'] 69 | 70 | # Set params for SentEval 71 | if args.mode == 'dev' or args.mode == 'fasttest': 72 | # Fast mode 73 | params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 74 | params['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 75 | 'tenacity': 3, 'epoch_size': 2} 76 | elif args.mode == 'test': 77 | # Full mode 78 | params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10} 79 | params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64, 80 | 'tenacity': 5, 'epoch_size': 4} 81 | else: 82 | raise NotImplementedError 83 | 84 | results = {} 85 | 86 | for task in args.tasks: 87 | params = {'task_path': 'SentEval/data/', 'usepytorch': True, 'kfold': 10} 88 | params['encoder'] = model.encode 89 | se = senteval.engine.SE(params, batcher, prepare) 90 | result = se.eval(task) 91 | results[task] = result 92 | 93 | # Print evaluation results 94 | if args.mode == 'dev': 95 | print("------ %s ------" % (args.mode)) 96 | 97 | task_names = [] 98 | scores = [] 99 | for task in ['STSBenchmark', 'SICKRelatedness']: 100 | task_names.append(task) 101 | if task in results: 102 | scores.append("%.2f" % (results[task]['dev']['spearman'][0] * 100)) 103 | else: 104 | scores.append("0.00") 105 | print_table(task_names, scores) 106 | 107 | task_names = [] 108 | scores = [] 109 | for task in ['MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC']: 110 | task_names.append(task) 111 | if task in results: 112 | scores.append("%.2f" % (results[task]['devacc'])) 113 | else: 114 | scores.append("0.00") 115 | task_names.append("Avg.") 116 | scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores))) 117 | print_table(task_names, scores) 118 | 119 | elif args.mode == 'test' or args.mode == 'fasttest': 120 | print("------ %s ------" % (args.mode)) 121 | 122 | task_names = [] 123 | scores = [] 124 | for task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']: 125 | task_names.append(task) 126 | if task in results: 127 | if task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']: 128 | scores.append("%.2f" % (results[task]['all']['spearman']['all'] * 100)) 129 | else: 130 | scores.append("%.2f" % (results[task]['test']['spearman'].correlation * 100)) 131 | else: 132 | scores.append("0.00") 133 | task_names.append("Avg.") 134 | scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores))) 135 | print_table(task_names, scores) 136 | 137 | task_names = [] 138 | scores = [] 139 | for task in ['MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC']: 140 | task_names.append(task) 141 | if task in results: 142 | scores.append("%.2f" % (results[task]['acc'])) 143 | else: 144 | scores.append("0.00") 145 | task_names.append("Avg.") 146 | scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores))) 147 | print_table(task_names, scores) 
148 | 149 | 150 | if __name__ == "__main__": 151 | main() -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/EmbeddingSimilarityEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator, SimilarityFunction 2 | import logging 3 | import os 4 | import csv 5 | from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances 6 | from scipy.stats import pearsonr, spearmanr 7 | import numpy as np 8 | from typing import List 9 | from ..readers import InputExample 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class EmbeddingSimilarityEvaluator(SentenceEvaluator): 15 | """ 16 | Evaluate a model based on the similarity of the embeddings by calculating the Spearman and Pearson rank correlation 17 | in comparison to the gold standard labels. 18 | The metrics are the cosine similarity as well as euclidean and Manhattan distance 19 | The returned score is the Spearman correlation with a specified metric. 20 | 21 | The results are written in a CSV. If a CSV already exists, then values are appended. 22 | """ 23 | def __init__(self, sentences1: List[str], sentences2: List[str], scores: List[float], batch_size: int = 16, main_similarity: SimilarityFunction = None, name: str = '', show_progress_bar: bool = False, write_csv: bool = True): 24 | """ 25 | Constructs an evaluator based for the dataset 26 | 27 | The labels need to indicate the similarity between the sentences. 28 | 29 | :param sentences1: List with the first sentence in a pair 30 | :param sentences2: List with the second sentence in a pair 31 | :param scores: Similarity score between sentences1[i] and sentences2[i] 32 | :param write_csv: Write results to a CSV file 33 | """ 34 | self.sentences1 = sentences1 35 | self.sentences2 = sentences2 36 | self.scores = scores 37 | self.write_csv = write_csv 38 | 39 | assert len(self.sentences1) == len(self.sentences2) 40 | assert len(self.sentences1) == len(self.scores) 41 | 42 | self.main_similarity = main_similarity 43 | self.name = name 44 | 45 | self.batch_size = batch_size 46 | if show_progress_bar is None: 47 | show_progress_bar = (logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG) 48 | self.show_progress_bar = show_progress_bar 49 | 50 | self.csv_file = "similarity_evaluation"+("_"+name if name else '')+"_results.csv" 51 | self.csv_headers = ["epoch", "steps", "cosine_pearson", "cosine_spearman", "euclidean_pearson", "euclidean_spearman", "manhattan_pearson", "manhattan_spearman", "dot_pearson", "dot_spearman"] 52 | 53 | @classmethod 54 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 55 | sentences1 = [] 56 | sentences2 = [] 57 | scores = [] 58 | 59 | for example in examples: 60 | sentences1.append(example.texts[0]) 61 | sentences2.append(example.texts[1]) 62 | scores.append(example.label) 63 | return cls(sentences1, sentences2, scores, **kwargs) 64 | 65 | 66 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 67 | if epoch != -1: 68 | if steps == -1: 69 | out_txt = " after epoch {}:".format(epoch) 70 | else: 71 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 72 | else: 73 | out_txt = ":" 74 | 75 | logger.info("EmbeddingSimilarityEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 76 | 77 | embeddings1 = model.encode(self.sentences1, 
batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True) 78 | embeddings2 = model.encode(self.sentences2, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True) 79 | labels = self.scores 80 | 81 | cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2)) 82 | manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2) 83 | euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2) 84 | dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)] 85 | 86 | 87 | eval_pearson_cosine, _ = pearsonr(labels, cosine_scores) 88 | eval_spearman_cosine, _ = spearmanr(labels, cosine_scores) 89 | 90 | eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances) 91 | eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances) 92 | 93 | eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances) 94 | eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances) 95 | 96 | eval_pearson_dot, _ = pearsonr(labels, dot_products) 97 | eval_spearman_dot, _ = spearmanr(labels, dot_products) 98 | 99 | logger.info("Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format( 100 | eval_pearson_cosine, eval_spearman_cosine)) 101 | # logger.info("Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format( 102 | # eval_pearson_manhattan, eval_spearman_manhattan)) 103 | # logger.info("Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format( 104 | # eval_pearson_euclidean, eval_spearman_euclidean)) 105 | # logger.info("Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format( 106 | # eval_pearson_dot, eval_spearman_dot)) 107 | 108 | if output_path is not None and self.write_csv: 109 | csv_path = os.path.join(output_path, self.csv_file) 110 | output_file_exists = os.path.isfile(csv_path) 111 | with open(csv_path, newline='', mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 112 | writer = csv.writer(f) 113 | if not output_file_exists: 114 | writer.writerow(self.csv_headers) 115 | 116 | writer.writerow([epoch, steps, eval_pearson_cosine, eval_spearman_cosine, eval_pearson_euclidean, 117 | eval_spearman_euclidean, eval_pearson_manhattan, eval_spearman_manhattan, eval_pearson_dot, eval_spearman_dot]) 118 | 119 | 120 | if self.main_similarity == SimilarityFunction.COSINE: 121 | return eval_spearman_cosine 122 | elif self.main_similarity == SimilarityFunction.EUCLIDEAN: 123 | return eval_spearman_euclidean 124 | elif self.main_similarity == SimilarityFunction.MANHATTAN: 125 | return eval_spearman_manhattan 126 | elif self.main_similarity == SimilarityFunction.DOT_PRODUCT: 127 | return eval_spearman_dot 128 | elif self.main_similarity is None: 129 | return max(eval_spearman_cosine, eval_spearman_manhattan, eval_spearman_euclidean, eval_spearman_dot) 130 | else: 131 | raise ValueError("Unknown main_similarity value") 132 | -------------------------------------------------------------------------------- /SentEval/senteval/engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | # 7 | 8 | ''' 9 | 10 | Generic sentence evaluation scripts wrapper 11 | 12 | ''' 13 | from __future__ import absolute_import, division, unicode_literals 14 | 15 | from senteval import utils 16 | from senteval.binary import CREval, MREval, MPQAEval, SUBJEval 17 | from senteval.snli import SNLIEval 18 | from senteval.trec import TRECEval 19 | from senteval.sick import SICKEntailmentEval, SICKEval 20 | from senteval.mrpc import MRPCEval 21 | from senteval.sts import STS12Eval, STS13Eval, STS14Eval, STS15Eval, STS16Eval, STSBenchmarkEval, SICKRelatednessEval, STSBenchmarkFinetune 22 | from senteval.sst import SSTEval 23 | from senteval.rank import ImageCaptionRetrievalEval 24 | from senteval.probing import * 25 | 26 | class SE(object): 27 | def __init__(self, params, batcher, prepare=None): 28 | # parameters 29 | params = utils.dotdict(params) 30 | params.usepytorch = True if 'usepytorch' not in params else params.usepytorch 31 | params.seed = 1111 if 'seed' not in params else params.seed 32 | 33 | params.batch_size = 128 if 'batch_size' not in params else params.batch_size 34 | params.nhid = 0 if 'nhid' not in params else params.nhid 35 | params.kfold = 5 if 'kfold' not in params else params.kfold 36 | 37 | if 'classifier' not in params or not params['classifier']: 38 | params.classifier = {'nhid': 0} 39 | 40 | assert 'nhid' in params.classifier, 'Set number of hidden units in classifier config!!' 41 | 42 | self.params = params 43 | 44 | # batcher and prepare 45 | self.batcher = batcher 46 | self.prepare = prepare if prepare else lambda x, y: None 47 | 48 | self.list_tasks = ['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 49 | 'SICKRelatedness', 'SICKEntailment', 'STSBenchmark', 50 | 'SNLI', 'ImageCaptionRetrieval', 'STS12', 'STS13', 51 | 'STS14', 'STS15', 'STS16', 52 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 53 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 54 | 'OddManOut', 'CoordinationInversion', 'SICKRelatedness-finetune', 'STSBenchmark-finetune', 'STSBenchmark-fix'] 55 | 56 | def eval(self, name): 57 | # evaluate on evaluation [name], either takes string or list of strings 58 | if (isinstance(name, list)): 59 | self.results = {x: self.eval(x) for x in name} 60 | return self.results 61 | 62 | tpath = self.params.task_path 63 | assert name in self.list_tasks, str(name) + ' not in ' + str(self.list_tasks) 64 | 65 | # Original SentEval tasks 66 | if name == 'CR': 67 | self.evaluation = CREval(tpath + '/downstream/CR', seed=self.params.seed) 68 | elif name == 'MR': 69 | self.evaluation = MREval(tpath + '/downstream/MR', seed=self.params.seed) 70 | elif name == 'MPQA': 71 | self.evaluation = MPQAEval(tpath + '/downstream/MPQA', seed=self.params.seed) 72 | elif name == 'SUBJ': 73 | self.evaluation = SUBJEval(tpath + '/downstream/SUBJ', seed=self.params.seed) 74 | elif name == 'SST2': 75 | self.evaluation = SSTEval(tpath + '/downstream/SST/binary', nclasses=2, seed=self.params.seed) 76 | elif name == 'SST5': 77 | self.evaluation = SSTEval(tpath + '/downstream/SST/fine', nclasses=5, seed=self.params.seed) 78 | elif name == 'TREC': 79 | self.evaluation = TRECEval(tpath + '/downstream/TREC', seed=self.params.seed) 80 | elif name == 'MRPC': 81 | self.evaluation = MRPCEval(tpath + '/downstream/MRPC', seed=self.params.seed) 82 | elif name == 'SICKRelatedness': 83 | self.evaluation = SICKRelatednessEval(tpath + '/downstream/SICK', seed=self.params.seed) 84 | elif name == 'STSBenchmark': 85 | self.evaluation = STSBenchmarkEval(tpath + '/downstream/STS/STSBenchmark', 
seed=self.params.seed) 86 | elif name == 'STSBenchmark-fix': 87 | self.evaluation = STSBenchmarkEval(tpath + '/downstream/STS/STSBenchmark-fix', seed=self.params.seed) 88 | elif name == 'STSBenchmark-finetune': 89 | self.evaluation = STSBenchmarkFinetune(tpath + '/downstream/STS/STSBenchmark', seed=self.params.seed) 90 | elif name == 'SICKRelatedness-finetune': 91 | self.evaluation = SICKEval(tpath + '/downstream/SICK', seed=self.params.seed) 92 | elif name == 'SICKEntailment': 93 | self.evaluation = SICKEntailmentEval(tpath + '/downstream/SICK', seed=self.params.seed) 94 | elif name == 'SNLI': 95 | self.evaluation = SNLIEval(tpath + '/downstream/SNLI', seed=self.params.seed) 96 | elif name in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']: 97 | fpath = name + '-en-test' 98 | self.evaluation = eval(name + 'Eval')(tpath + '/downstream/STS/' + fpath, seed=self.params.seed) 99 | elif name == 'ImageCaptionRetrieval': 100 | self.evaluation = ImageCaptionRetrievalEval(tpath + '/downstream/COCO', seed=self.params.seed) 101 | 102 | # Probing Tasks 103 | elif name == 'Length': 104 | self.evaluation = LengthEval(tpath + '/probing', seed=self.params.seed) 105 | elif name == 'WordContent': 106 | self.evaluation = WordContentEval(tpath + '/probing', seed=self.params.seed) 107 | elif name == 'Depth': 108 | self.evaluation = DepthEval(tpath + '/probing', seed=self.params.seed) 109 | elif name == 'TopConstituents': 110 | self.evaluation = TopConstituentsEval(tpath + '/probing', seed=self.params.seed) 111 | elif name == 'BigramShift': 112 | self.evaluation = BigramShiftEval(tpath + '/probing', seed=self.params.seed) 113 | elif name == 'Tense': 114 | self.evaluation = TenseEval(tpath + '/probing', seed=self.params.seed) 115 | elif name == 'SubjNumber': 116 | self.evaluation = SubjNumberEval(tpath + '/probing', seed=self.params.seed) 117 | elif name == 'ObjNumber': 118 | self.evaluation = ObjNumberEval(tpath + '/probing', seed=self.params.seed) 119 | elif name == 'OddManOut': 120 | self.evaluation = OddManOutEval(tpath + '/probing', seed=self.params.seed) 121 | elif name == 'CoordinationInversion': 122 | self.evaluation = CoordinationInversionEval(tpath + '/probing', seed=self.params.seed) 123 | 124 | self.params.current_task = name 125 | self.evaluation.do_prepare(self.params, self.prepare) 126 | 127 | self.results = self.evaluation.run(self.params, self.batcher) 128 | 129 | return self.results 130 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/TripletEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator, SimilarityFunction 2 | import logging 3 | import os 4 | import csv 5 | from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances 6 | from typing import List 7 | from ..readers import InputExample 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class TripletEvaluator(SentenceEvaluator): 14 | """ 15 | Evaluate a model based on a triplet: (sentence, positive_example, negative_example). 16 | Checks if distance(sentence, positive_example) < distance(sentence, negative_example). 
17 | """ 18 | 19 | def __init__( 20 | self, 21 | anchors: List[str], 22 | positives: List[str], 23 | negatives: List[str], 24 | main_distance_function: SimilarityFunction = None, 25 | name: str = "", 26 | batch_size: int = 16, 27 | show_progress_bar: bool = False, 28 | write_csv: bool = True, 29 | ): 30 | """ 31 | :param anchors: Sentences to check similarity to. (e.g. a query) 32 | :param positives: List of positive sentences 33 | :param negatives: List of negative sentences 34 | :param main_distance_function: One of 0 (Cosine), 1 (Euclidean) or 2 (Manhattan). Defaults to None, returning all 3. 35 | :param name: Name for the output 36 | :param batch_size: Batch size used to compute embeddings 37 | :param show_progress_bar: If true, prints a progress bar 38 | :param write_csv: Write results to a CSV file 39 | """ 40 | self.anchors = anchors 41 | self.positives = positives 42 | self.negatives = negatives 43 | self.name = name 44 | 45 | assert len(self.anchors) == len(self.positives) 46 | assert len(self.anchors) == len(self.negatives) 47 | 48 | self.main_distance_function = main_distance_function 49 | 50 | self.batch_size = batch_size 51 | if show_progress_bar is None: 52 | show_progress_bar = ( 53 | logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG 54 | ) 55 | self.show_progress_bar = show_progress_bar 56 | 57 | self.csv_file: str = "triplet_evaluation" + ("_" + name if name else "") + "_results.csv" 58 | self.csv_headers = ["epoch", "steps", "accuracy_cosinus", "accuracy_manhatten", "accuracy_euclidean"] 59 | self.write_csv = write_csv 60 | 61 | @classmethod 62 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 63 | anchors = [] 64 | positives = [] 65 | negatives = [] 66 | 67 | for example in examples: 68 | anchors.append(example.texts[0]) 69 | positives.append(example.texts[1]) 70 | negatives.append(example.texts[2]) 71 | return cls(anchors, positives, negatives, **kwargs) 72 | 73 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 74 | if epoch != -1: 75 | if steps == -1: 76 | out_txt = " after epoch {}:".format(epoch) 77 | else: 78 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 79 | else: 80 | out_txt = ":" 81 | 82 | logger.info("TripletEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 83 | 84 | num_triplets = 0 85 | num_correct_cos_triplets, num_correct_manhatten_triplets, num_correct_euclidean_triplets = 0, 0, 0 86 | 87 | embeddings_anchors = model.encode( 88 | self.anchors, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True 89 | ) 90 | embeddings_positives = model.encode( 91 | self.positives, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True 92 | ) 93 | embeddings_negatives = model.encode( 94 | self.negatives, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True 95 | ) 96 | 97 | # Cosine distance 98 | pos_cos_distance = paired_cosine_distances(embeddings_anchors, embeddings_positives) 99 | neg_cos_distances = paired_cosine_distances(embeddings_anchors, embeddings_negatives) 100 | 101 | # Manhattan 102 | pos_manhattan_distance = paired_manhattan_distances(embeddings_anchors, embeddings_positives) 103 | neg_manhattan_distances = paired_manhattan_distances(embeddings_anchors, embeddings_negatives) 104 | 105 | # Euclidean 106 | pos_euclidean_distance = paired_euclidean_distances(embeddings_anchors, embeddings_positives) 107 
| neg_euclidean_distances = paired_euclidean_distances(embeddings_anchors, embeddings_negatives) 108 | 109 | for idx in range(len(pos_cos_distance)): 110 | num_triplets += 1 111 | 112 | if pos_cos_distance[idx] < neg_cos_distances[idx]: 113 | num_correct_cos_triplets += 1 114 | 115 | if pos_manhattan_distance[idx] < neg_manhattan_distances[idx]: 116 | num_correct_manhatten_triplets += 1 117 | 118 | if pos_euclidean_distance[idx] < neg_euclidean_distances[idx]: 119 | num_correct_euclidean_triplets += 1 120 | 121 | accuracy_cos = num_correct_cos_triplets / num_triplets 122 | accuracy_manhattan = num_correct_manhatten_triplets / num_triplets 123 | accuracy_euclidean = num_correct_euclidean_triplets / num_triplets 124 | 125 | logger.info("Accuracy Cosine Distance: \t{:.2f}".format(accuracy_cos * 100)) 126 | logger.info("Accuracy Manhattan Distance:\t{:.2f}".format(accuracy_manhattan * 100)) 127 | logger.info("Accuracy Euclidean Distance:\t{:.2f}\n".format(accuracy_euclidean * 100)) 128 | 129 | if output_path is not None and self.write_csv: 130 | csv_path = os.path.join(output_path, self.csv_file) 131 | if not os.path.isfile(csv_path): 132 | with open(csv_path, newline="", mode="w", encoding="utf-8") as f: 133 | writer = csv.writer(f) 134 | writer.writerow(self.csv_headers) 135 | writer.writerow([epoch, steps, accuracy_cos, accuracy_manhattan, accuracy_euclidean]) 136 | 137 | else: 138 | with open(csv_path, newline="", mode="a", encoding="utf-8") as f: 139 | writer = csv.writer(f) 140 | writer.writerow([epoch, steps, accuracy_cos, accuracy_manhattan, accuracy_euclidean]) 141 | 142 | if self.main_distance_function == SimilarityFunction.COSINE: 143 | return accuracy_cos 144 | if self.main_distance_function == SimilarityFunction.MANHATTAN: 145 | return accuracy_manhattan 146 | if self.main_distance_function == SimilarityFunction.EUCLIDEAN: 147 | return accuracy_euclidean 148 | 149 | return max(accuracy_cos, accuracy_manhattan, accuracy_euclidean) 150 | --------------------------------------------------------------------------------
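Taken together, these modules follow the standard sentence-transformers layout: a Transformer module produces token embeddings, a Pooling module collapses them into a fixed-size sentence embedding, and the evaluator classes score the resulting model. The snippet below is an illustrative sketch, not a file from this repository: it assumes the `sentence_transformers_congen.models` and `.evaluation` sub-packages re-export these classes as in upstream sentence-transformers, that the forked `SentenceTransformer` keeps the upstream `modules=` constructor, and it uses `bert-base-uncased` plus toy sentence pairs as placeholders.

```python
# Illustrative sketch only (not part of the repository); model name and data are placeholders.
from sentence_transformers_congen import SentenceTransformer
from sentence_transformers_congen.models import Transformer, Pooling
from sentence_transformers_congen.evaluation import EmbeddingSimilarityEvaluator

# Student model: token embeddings from a Transformer, mean-pooled into one sentence vector.
word_embedding_model = Transformer('bert-base-uncased', max_seq_length=128)
pooling_model = Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='mean')
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Score the model with EmbeddingSimilarityEvaluator; gold scores are toy values in [0, 1].
sentences1 = ["A man is playing a guitar.", "A woman is cooking.", "Two dogs run in the park."]
sentences2 = ["Someone plays an instrument.", "A person prepares food.", "The stock market fell today."]
gold_scores = [0.8, 0.7, 0.1]
evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, gold_scores, name='toy-sts')
best_spearman = evaluator(model)  # with main_similarity=None, returns the best Spearman across metrics
print("Spearman: {:.4f}".format(best_spearman))
```

evaluation.py above wires the same kind of model into SentEval instead, through the `batcher` function that calls `model.encode` on raw sentences and the per-task `senteval.engine.SE` runs.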