├── SentEval ├── senteval │ ├── tools │ │ ├── __init__.py │ │ └── relatedness.py │ ├── __init__.py │ ├── utils.py │ ├── trec.py │ ├── binary.py │ ├── sst.py │ ├── mrpc.py │ ├── snli.py │ ├── rank.py │ └── engine.py ├── data │ └── downstream │ │ └── download_dataset.sh ├── .gitignore ├── setup.py ├── LICENSE └── examples │ ├── skipthought.py │ ├── googleuse.py │ ├── gensen.py │ ├── infersent.py │ └── bow.py ├── sentence_transformers_congen ├── losses │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── ConGenLoss.cpython-36.pyc │ └── ConGenLoss.py ├── __init__.py ├── __pycache__ │ ├── util.cpython-36.pyc │ ├── __init__.cpython-36.pyc │ ├── SentenceTransformer.cpython-36.pyc │ └── model_card_templates.cpython-36.pyc ├── models │ ├── __pycache__ │ │ ├── BoW.cpython-36.pyc │ │ ├── BoW.cpython-39.pyc │ │ ├── CNN.cpython-36.pyc │ │ ├── CNN.cpython-39.pyc │ │ ├── Asym.cpython-36.pyc │ │ ├── Asym.cpython-39.pyc │ │ ├── Dense.cpython-36.pyc │ │ ├── Dense.cpython-39.pyc │ │ ├── LSTM.cpython-36.pyc │ │ ├── LSTM.cpython-39.pyc │ │ ├── Dropout.cpython-36.pyc │ │ ├── Dropout.cpython-39.pyc │ │ ├── Pooling.cpython-36.pyc │ │ ├── Pooling.cpython-39.pyc │ │ ├── __init__.cpython-36.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── CLIPModel.cpython-36.pyc │ │ ├── CLIPModel.cpython-39.pyc │ │ ├── LayerNorm.cpython-36.pyc │ │ ├── LayerNorm.cpython-39.pyc │ │ ├── Normalize.cpython-36.pyc │ │ ├── Normalize.cpython-39.pyc │ │ ├── Transformer.cpython-36.pyc │ │ ├── Transformer.cpython-39.pyc │ │ ├── WordWeights.cpython-36.pyc │ │ ├── WordWeights.cpython-39.pyc │ │ ├── WordEmbeddings.cpython-36.pyc │ │ ├── WordEmbeddings.cpython-39.pyc │ │ ├── WeightedLayerPooling.cpython-36.pyc │ │ └── WeightedLayerPooling.cpython-39.pyc │ ├── tokenizer │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── __init__.cpython-39.pyc │ │ │ ├── WordTokenizer.cpython-36.pyc │ │ │ ├── WordTokenizer.cpython-39.pyc │ │ │ ├── PhraseTokenizer.cpython-36.pyc │ │ │ ├── PhraseTokenizer.cpython-39.pyc │ │ │ ├── WhitespaceTokenizer.cpython-36.pyc │ │ │ └── WhitespaceTokenizer.cpython-39.pyc │ │ ├── __init__.py │ │ ├── WhitespaceTokenizer.py │ │ ├── WordTokenizer.py │ │ └── PhraseTokenizer.py │ ├── __init__.py │ ├── Normalize.py │ ├── Dropout.py │ ├── LayerNorm.py │ ├── WeightedLayerPooling.py │ ├── LSTM.py │ ├── Dense.py │ ├── CNN.py │ ├── CLIPModel.py │ ├── WordWeights.py │ ├── BoW.py │ ├── T5.py │ ├── Asym.py │ ├── Pooling.py │ ├── WordEmbeddings.py │ └── Transformer.py ├── readers │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── InputExample.cpython-36.pyc │ │ ├── InputExample.cpython-37.pyc │ │ ├── InputExample.cpython-39.pyc │ │ ├── NLIDataReader.cpython-36.pyc │ │ ├── NLIDataReader.cpython-37.pyc │ │ ├── NLIDataReader.cpython-39.pyc │ │ ├── STSDataReader.cpython-36.pyc │ │ ├── STSDataReader.cpython-37.pyc │ │ ├── STSDataReader.cpython-39.pyc │ │ ├── TripletReader.cpython-36.pyc │ │ ├── TripletReader.cpython-37.pyc │ │ ├── TripletReader.cpython-39.pyc │ │ ├── LabelSentenceReader.cpython-36.pyc │ │ ├── LabelSentenceReader.cpython-37.pyc │ │ └── LabelSentenceReader.cpython-39.pyc │ ├── __init__.py │ ├── InputExample.py │ ├── PairedFilesReader.py │ ├── LabelSentenceReader.py │ ├── TripletReader.py │ ├── NLIDataReader.py │ └── STSDataReader.py ├── evaluation │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── MSEEvaluator.cpython-36.pyc │ │ ├── MSEEvaluator.cpython-39.pyc │ │ 
├── SentenceEvaluator.cpython-36.pyc │ │ ├── SentenceEvaluator.cpython-37.pyc │ │ ├── SentenceEvaluator.cpython-39.pyc │ │ ├── TripletEvaluator.cpython-36.pyc │ │ ├── TripletEvaluator.cpython-39.pyc │ │ ├── RerankingEvaluator.cpython-36.pyc │ │ ├── RerankingEvaluator.cpython-39.pyc │ │ ├── SequentialEvaluator.cpython-36.pyc │ │ ├── SequentialEvaluator.cpython-39.pyc │ │ ├── SimilarityFunction.cpython-36.pyc │ │ ├── SimilarityFunction.cpython-37.pyc │ │ ├── SimilarityFunction.cpython-39.pyc │ │ ├── LabelAccuracyEvaluator.cpython-36.pyc │ │ ├── LabelAccuracyEvaluator.cpython-39.pyc │ │ ├── TranslationEvaluator.cpython-36.pyc │ │ ├── TranslationEvaluator.cpython-39.pyc │ │ ├── MSEEvaluatorFromDataFrame.cpython-36.pyc │ │ ├── MSEEvaluatorFromDataFrame.cpython-39.pyc │ │ ├── ParaphraseMiningEvaluator.cpython-36.pyc │ │ ├── ParaphraseMiningEvaluator.cpython-39.pyc │ │ ├── BinaryClassificationEvaluator.cpython-36.pyc │ │ ├── BinaryClassificationEvaluator.cpython-37.pyc │ │ ├── BinaryClassificationEvaluator.cpython-39.pyc │ │ ├── EmbeddingSimilarityEvaluator.cpython-36.pyc │ │ ├── EmbeddingSimilarityEvaluator.cpython-39.pyc │ │ ├── InformationRetrievalEvaluator.cpython-36.pyc │ │ └── InformationRetrievalEvaluator.cpython-39.pyc │ ├── SimilarityFunction.py │ ├── __init__.py │ ├── SequentialEvaluator.py │ ├── SentenceEvaluator.py │ ├── LabelAccuracyEvaluator.py │ ├── MSEEvaluator.py │ ├── MSEEvaluatorFromDataFrame.py │ ├── TranslationEvaluator.py │ ├── RerankingEvaluator.py │ ├── EmbeddingSimilarityEvaluator.py │ └── TripletEvaluator.py └── model_card_templates.py ├── requirements.txt ├── ConGen__Unsupervised_Control_and_Generalization_Distillation_For_Sentence_Representation.pdf ├── setup.py ├── train_congen.sh └── evaluation.py /SentEval/senteval/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sentence_transformers_congen/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .ConGenLoss import * -------------------------------------------------------------------------------- /sentence_transformers_congen/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" 2 | from .SentenceTransformer import SentenceTransformer -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.8.1 2 | transformers==4.9.0 3 | sentence-transformers==2.0.0 4 | tensorflow==2.5.0 5 | protobuf==3.20.* 6 | -------------------------------------------------------------------------------- /SentEval/data/downstream/download_dataset.sh: -------------------------------------------------------------------------------- 1 | wget https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/senteval.tar 2 | tar xvf senteval.tar 3 | -------------------------------------------------------------------------------- /sentence_transformers_congen/__pycache__/util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/__pycache__/util.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/__pycache__/__init__.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/BoW.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/BoW.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/BoW.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/BoW.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/CNN.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/CNN.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/CNN.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/CNN.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Asym.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Asym.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Asym.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Asym.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Dense.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Dense.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Dense.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Dense.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/LSTM.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/LSTM.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/LSTM.cpython-39.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/LSTM.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/losses/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/losses/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Dropout.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Dropout.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Dropout.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Dropout.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Pooling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Pooling.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Pooling.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Pooling.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/losses/__pycache__/ConGenLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/losses/__pycache__/ConGenLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/CLIPModel.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/CLIPModel.cpython-36.pyc -------------------------------------------------------------------------------- 
/sentence_transformers_congen/models/__pycache__/CLIPModel.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/CLIPModel.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/LayerNorm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/LayerNorm.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/LayerNorm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/LayerNorm.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Normalize.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Normalize.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Normalize.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Normalize.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Transformer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Transformer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/Transformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/Transformer.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/WordWeights.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/WordWeights.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/WordWeights.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/WordWeights.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/__pycache__/SentenceTransformer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/__pycache__/SentenceTransformer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/__pycache__/model_card_templates.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/__pycache__/model_card_templates.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/InputExample.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/InputExample.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/InputExample.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/InputExample.cpython-37.pyc -------------------------------------------------------------------------------- 
/sentence_transformers_congen/readers/__pycache__/InputExample.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/InputExample.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/MSEEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/MSEEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/MSEEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/MSEEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/WordEmbeddings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/WordEmbeddings.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/WordEmbeddings.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/WordEmbeddings.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/NLIDataReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/NLIDataReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/NLIDataReader.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/NLIDataReader.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/NLIDataReader.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/NLIDataReader.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/STSDataReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/STSDataReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/STSDataReader.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/STSDataReader.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/STSDataReader.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/STSDataReader.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/TripletReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/TripletReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/TripletReader.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/TripletReader.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/TripletReader.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/TripletReader.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SentenceEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SentenceEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SentenceEvaluator.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SentenceEvaluator.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SentenceEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SentenceEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/TripletEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/TripletEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/TripletEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/TripletEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/WeightedLayerPooling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/WeightedLayerPooling.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__pycache__/WeightedLayerPooling.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/__pycache__/WeightedLayerPooling.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/LabelSentenceReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/LabelSentenceReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/LabelSentenceReader.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/LabelSentenceReader.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__pycache__/LabelSentenceReader.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/readers/__pycache__/LabelSentenceReader.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/SimilarityFunction.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class SimilarityFunction(Enum): 4 | COSINE = 0 5 | EUCLIDEAN = 1 6 | MANHATTAN = 2 7 | DOT_PRODUCT = 3 8 | 9 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/RerankingEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/RerankingEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/RerankingEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/RerankingEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SequentialEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SequentialEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SequentialEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SequentialEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SimilarityFunction.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SimilarityFunction.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SimilarityFunction.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SimilarityFunction.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/SimilarityFunction.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/SimilarityFunction.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/WordTokenizer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/WordTokenizer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/WordTokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/WordTokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/TranslationEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/TranslationEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/TranslationEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/TranslationEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 2 | from .WhitespaceTokenizer import WhitespaceTokenizer 3 | from .PhraseTokenizer import PhraseTokenizer 4 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/PhraseTokenizer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/PhraseTokenizer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/PhraseTokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/PhraseTokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /ConGen__Unsupervised_Control_and_Generalization_Distillation_For_Sentence_Representation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/ConGen__Unsupervised_Control_and_Generalization_Distillation_For_Sentence_Representation.pdf -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-39.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-37.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-36.pyc 
-------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KornWtp/ConGen/HEAD/sentence_transformers_congen/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-39.pyc -------------------------------------------------------------------------------- /SentEval/.gitignore: -------------------------------------------------------------------------------- 1 | # SentEval data and .pyc files 2 | 3 | 4 | 5 | # python 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # log files 11 | *.log 12 | *.txt 13 | 14 | # data files 15 | data/senteval_data* 16 | data/downstream/ 17 | -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/__init__.py: -------------------------------------------------------------------------------- 1 | from .InputExample import InputExample 2 | from .LabelSentenceReader import LabelSentenceReader 3 | from .NLIDataReader import NLIDataReader 4 | from .STSDataReader import STSDataReader, STSBenchmarkDataReader 5 | from .TripletReader import TripletReader -------------------------------------------------------------------------------- /SentEval/senteval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | from __future__ import absolute_import 9 | 10 | from senteval.engine import SE 11 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .Transformer import Transformer 2 | from .Asym import Asym 3 | from .BoW import BoW 4 | from .CNN import CNN 5 | from .Dense import Dense 6 | from .Dropout import Dropout 7 | from .LayerNorm import LayerNorm 8 | from .LSTM import LSTM 9 | from .Normalize import Normalize 10 | from .Pooling import Pooling 11 | from .WeightedLayerPooling import WeightedLayerPooling 12 | from .WordEmbeddings import WordEmbeddings 13 | from .WordWeights import WordWeights 14 | from .CLIPModel import CLIPModel 15 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", mode="r", encoding="utf-8") as readme_file: 4 | readme = readme_file.read() 5 | 6 | 7 | 8 | setup( 9 | name="congen-sbert", 10 | version="1.0.0", 11 | author=" ", 12 | author_email=" ", 13 | description="Sentence representation with SBERT", 14 | long_description=readme, 15 | long_description_content_type="text/markdown", 16 | packages=find_packages(), 17 | install_requires=[ 18 | "torch==1.8.1", 19 | "transformers==4.9.0", 20 | "sentence-transformers==2.0.0", 21 | ], 22 | ) 23 | -------------------------------------------------------------------------------- /SentEval/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | import io 9 | from setuptools import setup, find_packages 10 | 11 | with io.open('./README.md', encoding='utf-8') as f: 12 | readme = f.read() 13 | 14 | setup( 15 | name='SentEval', 16 | version='0.1.0', 17 | url='https://github.com/facebookresearch/SentEval', 18 | packages=find_packages(exclude=['examples']), 19 | license='Attribution-NonCommercial 4.0 International', 20 | long_description=readme, 21 | ) 22 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/Normalize.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from typing import Dict 4 | import torch.nn.functional as F 5 | 6 | class Normalize(nn.Module): 7 | """ 8 | This layer normalizes embeddings to unit length 9 | """ 10 | def __init__(self): 11 | super(Normalize, self).__init__() 12 | 13 | def forward(self, features: Dict[str, Tensor]): 14 | features.update({'sentence_embedding': F.normalize(features['sentence_embedding'], p=2, dim=1)}) 15 | return features 16 | 17 | def save(self, output_path): 18 | pass 19 | 20 | @staticmethod 21 | def load(input_path): 22 | return Normalize() 23 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .SentenceEvaluator import SentenceEvaluator 2 | from .SimilarityFunction import SimilarityFunction 3 | from .BinaryClassificationEvaluator import BinaryClassificationEvaluator 4 | from .EmbeddingSimilarityEvaluator import EmbeddingSimilarityEvaluator 5 | from .InformationRetrievalEvaluator import InformationRetrievalEvaluator 6 | from .LabelAccuracyEvaluator import LabelAccuracyEvaluator 7 | from .MSEEvaluator import MSEEvaluator 8 | from .MSEEvaluatorFromDataFrame import MSEEvaluatorFromDataFrame 9 | from .ParaphraseMiningEvaluator import ParaphraseMiningEvaluator 10 | from .SequentialEvaluator import SequentialEvaluator 11 | from .TranslationEvaluator import TranslationEvaluator 12 | from .TripletEvaluator import TripletEvaluator 13 | from .RerankingEvaluator import RerankingEvaluator -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/InputExample.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | 4 | class InputExample: 5 | """ 6 | Structure for one input example with texts, the label and a unique id 7 | """ 8 | def __init__(self, guid: str = '', texts: List[str] = None, label: Union[int, float] = 0): 9 | """ 10 | Creates one InputExample with the given texts, guid and label 11 | 12 | 13 | :param guid 14 | id for the example 15 | :param texts 16 | the texts for the example. Note, str.strip() is called on the texts 17 | :param label 18 | the label for the example 19 | """ 20 | self.guid = guid 21 | self.texts = texts 22 | self.label = label 23 | 24 | def __str__(self): 25 | return " label: {}, texts: {}".format(str(self.label), "; ".join(self.texts)) -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/SequentialEvaluator.py: -------------------------------------------------------------------------------- 1 | from . 
import SentenceEvaluator 2 | from typing import Iterable 3 | 4 | class SequentialEvaluator(SentenceEvaluator): 5 | """ 6 | This evaluator allows multiple sub-evaluators to be passed. When the model is evaluated, 7 | the data is passed sequentially to all sub-evaluators. 8 | 9 | All scores are passed to 'main_score_function', which derives one final score value 10 | """ 11 | def __init__(self, evaluators: Iterable[SentenceEvaluator], main_score_function = lambda scores: scores[-1]): 12 | self.evaluators = evaluators 13 | self.main_score_function = main_score_function 14 | 15 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 16 | scores = [] 17 | for evaluator in self.evaluators: 18 | scores.append(evaluator(model, output_path, epoch, steps)) 19 | 20 | return self.main_score_function(scores) 21 | -------------------------------------------------------------------------------- /train_congen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # please see Appendix A.1 in our paper or https://github.com/KornWtp/ConGen#parameters for the full setup 4 | # GPU device number, passed as the first argument of this script (e.g., bash train_congen.sh 0). 5 | export CUDA_VISIBLE_DEVICES=$1 6 | # Teacher: princeton-nlp/unsup-simcse-roberta-large (default); student: a compressed or large model, e.g., nreimers/BERT-Tiny_L-2_H-128_A-2. 7 | # Train data: https://drive.google.com/file/d/19O2NArJz_RlVNNGRbBnnWxNMW-7HaFZ8/view?usp=sharing 8 | # Dev data: STS-B dev set. For learning_rate, teacher_temp, student_temp and queue_size, see https://github.com/KornWtp/ConGen#parameters 9 | python main.py \ 10 | --model_save_path "your-output-model-path" \ 11 | --teacher_model_name_or_path princeton-nlp/unsup-simcse-roberta-large \ 12 | --student_model_name_or_path nreimers/BERT-Tiny_L-2_H-128_A-2 \ 13 | --train_data_path "your-train-data-path" \ 14 | --dev_data_path "your-validation-data-path" \ 15 | --train_batch_size 128 \ 16 | --eval_batch_size 128 \ 17 | --max_seq_length 128 \ 18 | --num_epochs 20 \ 19 | --learning_rate 5e-4 \ 20 | --teacher_temp 0.05 \ 21 | --student_temp 0.07 \ 22 | --queue_size 65536 23 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/Dropout.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Dict 5 | import os 6 | import json 7 | 8 | 9 | class Dropout(nn.Module): 10 | """Dropout layer. 11 | 12 | :param dropout: Dropout probability applied to the sentence embedding.
13 | """ 14 | def __init__(self, dropout: float = 0.2): 15 | super(Dropout, self).__init__() 16 | self.dropout = dropout 17 | self.dropout_layer = nn.Dropout(self.dropout) 18 | 19 | def forward(self, features: Dict[str, Tensor]): 20 | features.update({'sentence_embedding': self.dropout_layer(features['sentence_embedding'])}) 21 | return features 22 | 23 | def save(self, output_path): 24 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 25 | json.dump({'dropout': self.dropout}, fOut) 26 | 27 | 28 | 29 | @staticmethod 30 | def load(input_path): 31 | with open(os.path.join(input_path, 'config.json')) as fIn: 32 | config = json.load(fIn) 33 | 34 | model = Dropout(**config) 35 | return model 36 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/SentenceEvaluator.py: -------------------------------------------------------------------------------- 1 | class SentenceEvaluator: 2 | """ 3 | Base class for all evaluators 4 | 5 | Extend this class and implement __call__ for custom evaluators. 6 | """ 7 | 8 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 9 | """ 10 | This is called during training to evaluate the model. 11 | It returns a score for the evaluation with a higher score indicating a better result. 12 | 13 | :param model: 14 | the model to evaluate 15 | :param output_path: 16 | path where predictions and metrics are written to 17 | :param epoch 18 | the epoch where the evaluation takes place. 19 | This is used for the file prefixes. 20 | If this is -1, then we assume evaluation on test data. 21 | :param steps 22 | the steps in the current epoch at time of the evaluation. 23 | This is used for the file prefixes. 24 | If this is -1, then we assume evaluation at the end of the epoch. 25 | :return: a score for the evaluation with a higher score indicating a better result 26 | """ 27 | pass 28 | -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/PairedFilesReader.py: -------------------------------------------------------------------------------- 1 | from . 
import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | 7 | class PairedFilesReader(object): 8 | """ 9 | Reads in a pair dataset that is split across two files 10 | """ 11 | def __init__(self, filepaths): 12 | self.filepaths = filepaths 13 | 14 | 15 | def get_examples(self, max_examples=0): 16 | """Read the files line by line in parallel and pair the aligned lines into InputExamples.""" 17 | 18 | fIns = [] 19 | for filepath in self.filepaths: 20 | fIn = gzip.open(filepath, 'rt', encoding='utf-8') if filepath.endswith('.gz') else open(filepath, encoding='utf-8') 21 | fIns.append(fIn) 22 | 23 | examples = [] 24 | 25 | eof = False 26 | while not eof: 27 | texts = [] 28 | for fIn in fIns: 29 | text = fIn.readline() 30 | 31 | if text == '': 32 | eof = True 33 | break 34 | 35 | texts.append(text) 36 | 37 | if eof: 38 | break 39 | 40 | examples.append(InputExample(guid=str(len(examples)), texts=texts, label=1)) 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples -------------------------------------------------------------------------------- /sentence_transformers_congen/models/LayerNorm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class LayerNorm(nn.Module): 10 | def __init__(self, dimension: int): 11 | super(LayerNorm, self).__init__() 12 | self.dimension = dimension 13 | self.norm = nn.LayerNorm(dimension) 14 | 15 | 16 | def forward(self, features: Dict[str, Tensor]): 17 | features['sentence_embedding'] = self.norm(features['sentence_embedding']) 18 | return features 19 | 20 | 21 | def get_sentence_embedding_dimension(self): 22 | return self.dimension 23 | 24 | def save(self, output_path): 25 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 26 | json.dump({'dimension': self.dimension}, fOut, indent=2) 27 | 28 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 29 | 30 | @staticmethod 31 | def load(input_path): 32 | with open(os.path.join(input_path, 'config.json')) as fIn: 33 | config = json.load(fIn) 34 | 35 | model = LayerNorm(**config) 36 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 37 | return model -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/LabelSentenceReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class LabelSentenceReader: 7 | """Reads in a file that has at least two columns: a label and a sentence. 8 | This reader can for example be used with the BatchHardTripletLoss.
9 | Maps labels automatically to integers""" 10 | def __init__(self, folder, label_col_idx=0, sentence_col_idx=1, separator='\t'): 11 | self.folder = folder 12 | self.label_map = {} 13 | self.label_col_idx = label_col_idx 14 | self.sentence_col_idx = sentence_col_idx 15 | self.separator = separator 16 | 17 | def get_examples(self, filename, max_examples=0): 18 | examples = [] 19 | 20 | id = 0 21 | for line in open(os.path.join(self.folder, filename), encoding="utf-8"): 22 | splits = line.strip().split(self.separator) 23 | label = splits[self.label_col_idx] 24 | sentence = splits[self.sentence_col_idx] 25 | 26 | if label not in self.label_map: 27 | self.label_map[label] = len(self.label_map) 28 | 29 | label_id = self.label_map[label] 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence], label=label_id)) 33 | 34 | if 0 < max_examples <= id: 35 | break 36 | 37 | return examples 38 | -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/TripletReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class TripletReader(object): 7 | """ 8 | Reads in the a Triplet Dataset: Each line contains (at least) 3 columns, one anchor column (s1), 9 | one positive example (s2) and one negative example (s3) 10 | """ 11 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, s3_col_idx=2, has_header=False, delimiter="\t", 12 | quoting=csv.QUOTE_NONE): 13 | self.dataset_folder = dataset_folder 14 | self.s1_col_idx = s1_col_idx 15 | self.s2_col_idx = s2_col_idx 16 | self.s3_col_idx = s3_col_idx 17 | self.has_header = has_header 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | 21 | def get_examples(self, filename, max_examples=0): 22 | """ 23 | 24 | """ 25 | data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"), delimiter=self.delimiter, 26 | quoting=self.quoting) 27 | examples = [] 28 | if self.has_header: 29 | next(data) 30 | 31 | for id, row in enumerate(data): 32 | s1 = row[self.s1_col_idx] 33 | s2 = row[self.s2_col_idx] 34 | s3 = row[self.s3_col_idx] 35 | 36 | examples.append(InputExample(texts=[s1, s2, s3])) 37 | if max_examples > 0 and len(examples) >= max_examples: 38 | break 39 | 40 | return examples -------------------------------------------------------------------------------- /SentEval/LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For SentEval software 4 | 5 | Copyright (c) 2017-present, Facebook, Inc. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 
20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/NLIDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | 7 | class NLIDataReader(object): 8 | """ 9 | Reads in the Stanford NLI dataset and the MultiGenre NLI dataset 10 | """ 11 | def __init__(self, dataset_folder): 12 | self.dataset_folder = dataset_folder 13 | 14 | def get_examples(self, filename, max_examples=0): 15 | """ 16 | data_splits specified which data split to use (train, dev, test). 17 | Expects that self.dataset_folder contains the files s1.$data_split.gz, s2.$data_split.gz, 18 | labels.$data_split.gz, e.g., for the train split, s1.train.gz, s2.train.gz, labels.train.gz 19 | """ 20 | s1 = gzip.open(os.path.join(self.dataset_folder, 's1.' + filename), 21 | mode="rt", encoding="utf-8").readlines() 22 | s2 = gzip.open(os.path.join(self.dataset_folder, 's2.' + filename), 23 | mode="rt", encoding="utf-8").readlines() 24 | labels = gzip.open(os.path.join(self.dataset_folder, 'labels.' + filename), 25 | mode="rt", encoding="utf-8").readlines() 26 | 27 | examples = [] 28 | id = 0 29 | for sentence_a, sentence_b, label in zip(s1, s2, labels): 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=self.map_label(label))) 33 | 34 | if 0 < max_examples <= len(examples): 35 | break 36 | 37 | return examples 38 | 39 | @staticmethod 40 | def get_labels(): 41 | return {"contradiction": 0, "entailment": 1, "neutral": 2} 42 | 43 | def get_num_labels(self): 44 | return len(self.get_labels()) 45 | 46 | def map_label(self, label): 47 | return self.get_labels()[label.strip().lower()] -------------------------------------------------------------------------------- /SentEval/examples/skipthought.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | from __future__ import absolute_import, division, unicode_literals 9 | 10 | """ 11 | Example of file for SkipThought in SentEval 12 | """ 13 | import logging 14 | import sys 15 | sys.setdefaultencoding('utf8') 16 | 17 | 18 | # Set PATHs 19 | PATH_TO_SENTEVAL = '../' 20 | PATH_TO_DATA = '../data/senteval_data/' 21 | PATH_TO_SKIPTHOUGHT = '' 22 | 23 | assert PATH_TO_SKIPTHOUGHT != '', 'Download skipthought and set correct PATH' 24 | 25 | # import skipthought and Senteval 26 | sys.path.insert(0, PATH_TO_SKIPTHOUGHT) 27 | import skipthoughts 28 | sys.path.insert(0, PATH_TO_SENTEVAL) 29 | import senteval 30 | 31 | 32 | def prepare(params, samples): 33 | return 34 | 35 | def batcher(params, batch): 36 | batch = [str(' '.join(sent), errors="ignore") if sent != [] else '.' for sent in batch] 37 | embeddings = skipthoughts.encode(params['encoder'], batch, 38 | verbose=False, use_eos=True) 39 | return embeddings 40 | 41 | 42 | # Set params for SentEval 43 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10, 'batch_size': 512} 44 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64, 45 | 'tenacity': 5, 'epoch_size': 4} 46 | # Set up logger 47 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 48 | 49 | if __name__ == "__main__": 50 | # Load SkipThought model 51 | params_senteval['encoder'] = skipthoughts.load_model() 52 | 53 | se = senteval.engine.SE(params_senteval, batcher, prepare) 54 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 55 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 56 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 57 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 58 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 59 | 'OddManOut', 'CoordinationInversion'] 60 | results = se.eval(transfer_tasks) 61 | print(results) 62 | -------------------------------------------------------------------------------- /sentence_transformers_congen/losses/ConGenLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from typing import Iterable, Dict 6 | 7 | class ConGenLoss(nn.Module): 8 | def __init__(self, instanceQ_encoded, model, teacher_temp=0.1, student_temp=0.09): 9 | """ 10 | param model: SentenceTransformerModel 11 | teacher_temp: distillation temperature for teacher model 12 | student_temp: distillation temperature for student model 13 | """ 14 | super(ConGenLoss, self).__init__() 15 | self.instanceQ_encoded = instanceQ_encoded 16 | self.model = model 17 | self.teacher_temp = teacher_temp 18 | self.student_temp = student_temp 19 | 20 | def forward(self, 21 | sents1_features: Iterable[Dict[str, Tensor]], 22 | sents2_features: Iterable[Dict[str, Tensor]], 23 | Z_ref: Tensor): 24 | 25 | # Batch-size 26 | batch_size = Z_ref.shape[0] 27 | 28 | Z_con = F.normalize(self.model(sents1_features)['sentence_embedding'], p=2, dim=1) 29 | Z_gen = F.normalize(self.model(sents2_features)['sentence_embedding'], p=2, dim=1) 30 | 31 | # insert the current batch embedding from T 32 | instanceQ_encoded = self.instanceQ_encoded 33 | Q = torch.cat((instanceQ_encoded, Z_ref)) 34 | 35 | # probability scores distribution for T, S: B X (N + 1) 36 | T_ref = torch.einsum('nc,ck->nk', Z_ref, Q.t().clone().detach()) 37 | S_con = torch.einsum('nc,ck->nk', Z_con, Q.t().clone().detach()) 38 | S_gen = torch.einsum('nc,ck->nk', Z_gen, 
Q.t().clone().detach()) 39 | 40 | 41 | # Apply temperatures for soft-labels 42 | T_ref = F.softmax(T_ref/self.teacher_temp, dim=1) 43 | S_con = S_con / self.student_temp 44 | S_gen = S_gen / self.student_temp 45 | 46 | 47 | # loss computation, use log_softmax for stable computation 48 | loss_Con = -torch.mul(T_ref, F.log_softmax(S_con, dim=1)).sum() / batch_size 49 | loss_Gen = -torch.mul(T_ref, F.log_softmax(S_gen, dim=1)).sum() / batch_size 50 | 51 | # update the random sample queue 52 | self.instanceQ_encoded = Q[batch_size:] 53 | 54 | return (loss_Con + loss_Gen) / 2 -------------------------------------------------------------------------------- /SentEval/examples/googleuse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | from __future__ import absolute_import, division 9 | 10 | import os 11 | import sys 12 | import logging 13 | import tensorflow as tf 14 | import tensorflow_hub as hub 15 | tf.logging.set_verbosity(0) 16 | 17 | # Set PATHs 18 | PATH_TO_SENTEVAL = '../' 19 | PATH_TO_DATA = '../data' 20 | 21 | # import SentEval 22 | sys.path.insert(0, PATH_TO_SENTEVAL) 23 | import senteval 24 | 25 | # tensorflow session 26 | session = tf.Session() 27 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 28 | 29 | # SentEval prepare and batcher 30 | def prepare(params, samples): 31 | return 32 | 33 | def batcher(params, batch): 34 | batch = [' '.join(sent) if sent != [] else '.' for sent in batch] 35 | embeddings = params['google_use'](batch) 36 | return embeddings 37 | 38 | def make_embed_fn(module): 39 | with tf.Graph().as_default(): 40 | sentences = tf.placeholder(tf.string) 41 | embed = hub.Module(module) 42 | embeddings = embed(sentences) 43 | session = tf.train.MonitoredSession() 44 | return lambda x: session.run(embeddings, {sentences: x}) 45 | 46 | # Start TF session and load Google Universal Sentence Encoder 47 | encoder = make_embed_fn("https://tfhub.dev/google/universal-sentence-encoder-large/2") 48 | 49 | # Set params for SentEval 50 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 51 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 52 | 'tenacity': 3, 'epoch_size': 2} 53 | params_senteval['google_use'] = encoder 54 | 55 | # Set up logger 56 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 57 | 58 | if __name__ == "__main__": 59 | se = senteval.engine.SE(params_senteval, batcher, prepare) 60 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 61 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 62 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 63 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 64 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 65 | 'OddManOut', 'CoordinationInversion'] 66 | results = se.eval(transfer_tasks) 67 | print(results) 68 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/WeightedLayerPooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class WeightedLayerPooling(nn.Module): 10 | """ 11 | 
Token embeddings are weighted mean of their different hidden layer representations 12 | """ 13 | def __init__(self, word_embedding_dimension, num_hidden_layers: int = 12, layer_start: int = 4, layer_weights = None): 14 | super(WeightedLayerPooling, self).__init__() 15 | self.config_keys = ['word_embedding_dimension', 'layer_start', 'num_hidden_layers'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.layer_start = layer_start 18 | self.num_hidden_layers = num_hidden_layers 19 | self.layer_weights = layer_weights if layer_weights is not None else nn.Parameter(torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)) 20 | 21 | def forward(self, features: Dict[str, Tensor]): 22 | ft_all_layers = features['all_layer_embeddings'] 23 | 24 | all_layer_embedding = torch.stack(ft_all_layers) 25 | all_layer_embedding = all_layer_embedding[self.layer_start:, :, :, :] # Start from 4th layers output 26 | 27 | weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size()) 28 | weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum() 29 | 30 | features.update({'token_embeddings': weighted_average}) 31 | return features 32 | 33 | def get_word_embedding_dimension(self): 34 | return self.word_embedding_dimension 35 | 36 | def get_config_dict(self): 37 | return {key: self.__dict__[key] for key in self.config_keys} 38 | 39 | def save(self, output_path): 40 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 41 | json.dump(self.get_config_dict(), fOut, indent=2) 42 | 43 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 44 | 45 | 46 | @staticmethod 47 | def load(input_path): 48 | with open(os.path.join(input_path, 'config.json')) as fIn: 49 | config = json.load(fIn) 50 | 51 | model = WeightedLayerPooling(**config) 52 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 53 | return model 54 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/WhitespaceTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 7 | 8 | class WhitespaceTokenizer(WordTokenizer): 9 | """ 10 | Simple and fast white-space tokenizer. Splits sentence based on white spaces. 11 | Punctuation are stripped from tokens. 
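A minimal usage sketch (vocabulary and import path assumed for illustration only):
        from sentence_transformers_congen.models.tokenizer import WhitespaceTokenizer
        tokenizer = WhitespaceTokenizer(vocab=['hello', 'world'], stop_words=[], do_lower_case=True)
        tokenizer.tokenize('Hello world !')   # -> [0, 1]; tokens not found in the vocab are dropped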
12 | """ 13 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False): 14 | self.stop_words = set(stop_words) 15 | self.do_lower_case = do_lower_case 16 | self.set_vocab(vocab) 17 | 18 | def get_vocab(self): 19 | return self.vocab 20 | 21 | def set_vocab(self, vocab: Iterable[str]): 22 | self.vocab = vocab 23 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 24 | 25 | def tokenize(self, text: str) -> List[int]: 26 | if self.do_lower_case: 27 | text = text.lower() 28 | 29 | tokens = text.split() 30 | 31 | tokens_filtered = [] 32 | for token in tokens: 33 | if token in self.stop_words: 34 | continue 35 | elif token in self.word2idx: 36 | tokens_filtered.append(self.word2idx[token]) 37 | continue 38 | 39 | token = token.strip(string.punctuation) 40 | if token in self.stop_words: 41 | continue 42 | elif len(token) > 0 and token in self.word2idx: 43 | tokens_filtered.append(self.word2idx[token]) 44 | continue 45 | 46 | token = token.lower() 47 | if token in self.stop_words: 48 | continue 49 | elif token in self.word2idx: 50 | tokens_filtered.append(self.word2idx[token]) 51 | continue 52 | 53 | return tokens_filtered 54 | 55 | def save(self, output_path: str): 56 | with open(os.path.join(output_path, 'whitespacetokenizer_config.json'), 'w') as fOut: 57 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case}, fOut) 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'whitespacetokenizer_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | return WhitespaceTokenizer(**config) 65 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/LSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from typing import List 4 | import os 5 | import json 6 | 7 | 8 | 9 | class LSTM(nn.Module): 10 | """ 11 | Bidirectional LSTM running over word embeddings. 
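For example (sizes illustrative), with the default bidirectional=True the produced token embeddings have twice the hidden size:
        lstm = LSTM(word_embedding_dimension=300, hidden_dim=512)
        lstm.get_word_embedding_dimension()   # -> 1024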
12 | """ 13 | def __init__(self, word_embedding_dimension: int, hidden_dim: int, num_layers: int = 1, dropout: float = 0, bidirectional: bool = True): 14 | nn.Module.__init__(self) 15 | self.config_keys = ['word_embedding_dimension', 'hidden_dim', 'num_layers', 'dropout', 'bidirectional'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.hidden_dim = hidden_dim 18 | self.num_layers = num_layers 19 | self.dropout = dropout 20 | self.bidirectional = bidirectional 21 | 22 | self.embeddings_dimension = hidden_dim 23 | if self.bidirectional: 24 | self.embeddings_dimension *= 2 25 | 26 | self.encoder = nn.LSTM(word_embedding_dimension, hidden_dim, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True) 27 | 28 | def forward(self, features): 29 | token_embeddings = features['token_embeddings'] 30 | sentence_lengths = torch.clamp(features['sentence_lengths'], min=1) 31 | 32 | packed = nn.utils.rnn.pack_padded_sequence(token_embeddings, sentence_lengths, batch_first=True, enforce_sorted=False) 33 | packed = self.encoder(packed) 34 | unpack = nn.utils.rnn.pad_packed_sequence(packed[0], batch_first=True)[0] 35 | features.update({'token_embeddings': unpack}) 36 | return features 37 | 38 | def get_word_embedding_dimension(self) -> int: 39 | return self.embeddings_dimension 40 | 41 | def tokenize(self, text: str) -> List[int]: 42 | raise NotImplementedError() 43 | 44 | def save(self, output_path: str): 45 | with open(os.path.join(output_path, 'lstm_config.json'), 'w') as fOut: 46 | json.dump(self.get_config_dict(), fOut, indent=2) 47 | 48 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 49 | 50 | def get_config_dict(self): 51 | return {key: self.__dict__[key] for key in self.config_keys} 52 | 53 | @staticmethod 54 | def load(input_path: str): 55 | with open(os.path.join(input_path, 'lstm_config.json'), 'r') as fIn: 56 | config = json.load(fIn) 57 | 58 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 59 | model = LSTM(**config) 60 | model.load_state_dict(weights) 61 | return model 62 | 63 | -------------------------------------------------------------------------------- /SentEval/examples/gensen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | """ 9 | Clone GenSen repo here: https://github.com/Maluuba/gensen.git 10 | And follow instructions for loading the model used in batcher 11 | """ 12 | 13 | from __future__ import absolute_import, division, unicode_literals 14 | 15 | import sys 16 | import logging 17 | # import GenSen package 18 | from gensen import GenSen, GenSenSingle 19 | 20 | # Set PATHs 21 | PATH_TO_SENTEVAL = '../' 22 | PATH_TO_DATA = '../data' 23 | 24 | # import SentEval 25 | sys.path.insert(0, PATH_TO_SENTEVAL) 26 | import senteval 27 | 28 | # SentEval prepare and batcher 29 | def prepare(params, samples): 30 | return 31 | 32 | def batcher(params, batch): 33 | batch = [' '.join(sent) if sent != [] else '.' 
for sent in batch] 34 | _, reps_h_t = gensen.get_representation( 35 | sentences, pool='last', return_numpy=True, tokenize=True 36 | ) 37 | embeddings = reps_h_t 38 | return embeddings 39 | 40 | # Load GenSen model 41 | gensen_1 = GenSenSingle( 42 | model_folder='../data/models', 43 | filename_prefix='nli_large_bothskip', 44 | pretrained_emb='../data/embedding/glove.840B.300d.h5' 45 | ) 46 | gensen_2 = GenSenSingle( 47 | model_folder='../data/models', 48 | filename_prefix='nli_large_bothskip_parse', 49 | pretrained_emb='../data/embedding/glove.840B.300d.h5' 50 | ) 51 | gensen_encoder = GenSen(gensen_1, gensen_2) 52 | reps_h, reps_h_t = gensen.get_representation( 53 | sentences, pool='last', return_numpy=True, tokenize=True 54 | ) 55 | 56 | # Set params for SentEval 57 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 58 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 59 | 'tenacity': 3, 'epoch_size': 2} 60 | params_senteval['gensen'] = gensen_encoder 61 | 62 | # Set up logger 63 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 64 | 65 | if __name__ == "__main__": 66 | se = senteval.engine.SE(params_senteval, batcher, prepare) 67 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 68 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 69 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 70 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 71 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 72 | 'OddManOut', 'CoordinationInversion'] 73 | results = se.eval(transfer_tasks) 74 | print(results) 75 | -------------------------------------------------------------------------------- /SentEval/examples/infersent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | """ 9 | InferSent models. See https://github.com/facebookresearch/InferSent. 
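Before running, download an InferSent checkpoint (infersent1.pkl for V1 or infersent2.pkl for V2) together with the matching word vectors (GloVe for V1, fastText crawl-300d-2M.vec for V2), then point MODEL_PATH and PATH_TO_W2V below at those files; the script asserts that both exist before evaluating.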
10 | """ 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import sys 15 | import os 16 | import torch 17 | import logging 18 | 19 | # get models.py from InferSent repo 20 | from models import InferSent 21 | 22 | # Set PATHs 23 | PATH_SENTEVAL = '../' 24 | PATH_TO_DATA = '../data' 25 | PATH_TO_W2V = 'PATH/TO/glove.840B.300d.txt' # or crawl-300d-2M.vec for V2 26 | MODEL_PATH = 'infersent1.pkl' 27 | V = 1 # version of InferSent 28 | 29 | assert os.path.isfile(MODEL_PATH) and os.path.isfile(PATH_TO_W2V), \ 30 | 'Set MODEL and GloVe PATHs' 31 | 32 | # import senteval 33 | sys.path.insert(0, PATH_SENTEVAL) 34 | import senteval 35 | 36 | 37 | def prepare(params, samples): 38 | params.infersent.build_vocab([' '.join(s) for s in samples], tokenize=False) 39 | 40 | 41 | def batcher(params, batch): 42 | sentences = [' '.join(s) for s in batch] 43 | embeddings = params.infersent.encode(sentences, bsize=params.batch_size, tokenize=False) 44 | return embeddings 45 | 46 | 47 | """ 48 | Evaluation of trained model on Transfer Tasks (SentEval) 49 | """ 50 | 51 | # define senteval params 52 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 53 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 54 | 'tenacity': 3, 'epoch_size': 2} 55 | # Set up logger 56 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 57 | 58 | if __name__ == "__main__": 59 | # Load InferSent model 60 | params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 61 | 'pool_type': 'max', 'dpout_model': 0.0, 'version': V} 62 | model = InferSent(params_model) 63 | model.load_state_dict(torch.load(MODEL_PATH)) 64 | model.set_w2v_path(PATH_TO_W2V) 65 | 66 | params_senteval['infersent'] = model.cuda() 67 | 68 | se = senteval.engine.SE(params_senteval, batcher, prepare) 69 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 70 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 71 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 72 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 73 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 74 | 'OddManOut', 'CoordinationInversion'] 75 | results = se.eval(transfer_tasks) 76 | print(results) 77 | -------------------------------------------------------------------------------- /sentence_transformers_congen/readers/STSDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class STSDataReader: 7 | """ 8 | Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx) 9 | 10 | Default values expects a tab seperated file with the first & second column the sentence pair and third column the score (0...1). 
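In the raw files the third column typically holds the 0...5 similarity score, e.g. an illustrative line 'A man plays guitar.\tA man is playing a guitar.\t4.2'.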
Default config normalizes scores from 0...5 to 0...1 11 | """ 12 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, score_col_idx=2, delimiter="\t", 13 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 14 | self.dataset_folder = dataset_folder 15 | self.score_col_idx = score_col_idx 16 | self.s1_col_idx = s1_col_idx 17 | self.s2_col_idx = s2_col_idx 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | self.normalize_scores = normalize_scores 21 | self.min_score = min_score 22 | self.max_score = max_score 23 | 24 | def get_examples(self, filename, max_examples=0): 25 | """ 26 | filename specified which data split to use (train.csv, dev.csv, test.csv). 27 | """ 28 | filepath = os.path.join(self.dataset_folder, filename) 29 | with gzip.open(filepath, 'rt', encoding='utf8') if filename.endswith('.gz') else open(filepath, encoding="utf-8") as fIn: 30 | data = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting) 31 | examples = [] 32 | for id, row in enumerate(data): 33 | score = float(row[self.score_col_idx]) 34 | if self.normalize_scores: # Normalize to a 0...1 value 35 | score = (score - self.min_score) / (self.max_score - self.min_score) 36 | 37 | s1 = row[self.s1_col_idx] 38 | s2 = row[self.s2_col_idx] 39 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2], label=score)) 40 | 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples 45 | 46 | class STSBenchmarkDataReader(STSDataReader): 47 | """ 48 | Reader especially for the STS benchmark dataset. There, the sentences are in column 5 and 6, the score is in column 4. 49 | Scores are normalized from 0...5 to 0...1 50 | """ 51 | def __init__(self, dataset_folder, s1_col_idx=5, s2_col_idx=6, score_col_idx=4, delimiter="\t", 52 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 53 | super().__init__(dataset_folder=dataset_folder, s1_col_idx=s1_col_idx, s2_col_idx=s2_col_idx, score_col_idx=score_col_idx, delimiter=delimiter, 54 | quoting=quoting, normalize_scores=normalize_scores, min_score=min_score, max_score=max_score) -------------------------------------------------------------------------------- /sentence_transformers_congen/models/Dense.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from torch import functional as F 5 | from typing import Union, Tuple, List, Iterable, Dict 6 | import os 7 | import json 8 | from ..util import fullname, import_from_string 9 | 10 | 11 | class Dense(nn.Module): 12 | """Feed-forward function with activiation function. 13 | 14 | This layer takes a fixed-sized sentence embedding and passes it through a feed-forward layer. Can be used to generate deep averaging networs (DAN). 
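A minimal sketch (sizes illustrative): Dense(in_features=768, out_features=128) projects a 768-dimensional sentence embedding down to 128 dimensions through a Tanh-activated linear layer (nn.Tanh is the default activation_function).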
15 | 16 | :param in_features: Size of the input dimension 17 | :param out_features: Output size 18 | :param bias: Add a bias vector 19 | :param activation_function: Pytorch activation function applied on output 20 | :param init_weight: Initial value for the matrix of the linear layer 21 | :param init_bias: Initial value for the bias of the linear layer 22 | """ 23 | def __init__(self, in_features: int, out_features: int, bias: bool = True, activation_function=nn.Tanh(), init_weight: Tensor = None, init_bias: Tensor = None): 24 | super(Dense, self).__init__() 25 | self.in_features = in_features 26 | self.out_features = out_features 27 | self.bias = bias 28 | self.activation_function = activation_function 29 | self.linear = nn.Linear(in_features, out_features, bias=bias) 30 | 31 | if init_weight is not None: 32 | self.linear.weight = nn.Parameter(init_weight) 33 | 34 | if init_bias is not None: 35 | self.linear.bias = nn.Parameter(init_bias) 36 | 37 | def forward(self, features: Dict[str, Tensor]): 38 | features.update({'sentence_embedding': self.activation_function(self.linear(features['sentence_embedding']))}) 39 | return features 40 | 41 | def get_sentence_embedding_dimension(self) -> int: 42 | return self.out_features 43 | 44 | def get_config_dict(self): 45 | return {'in_features': self.in_features, 'out_features': self.out_features, 'bias': self.bias, 'activation_function': fullname(self.activation_function)} 46 | 47 | def save(self, output_path): 48 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 49 | json.dump(self.get_config_dict(), fOut) 50 | 51 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 52 | 53 | def __repr__(self): 54 | return "Dense({})".format(self.get_config_dict()) 55 | @staticmethod 56 | def load(input_path): 57 | with open(os.path.join(input_path, 'config.json')) as fIn: 58 | config = json.load(fIn) 59 | 60 | config['activation_function'] = import_from_string(config['activation_function'])() 61 | model = Dense(**config) 62 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 63 | return model 64 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class CNN(nn.Module): 15 | """CNN-layer with multiple kernel-sizes over the word embeddings""" 16 | 17 | def __init__(self, in_word_embedding_dimension: int, out_channels: int = 256, kernel_sizes: List[int] = [1, 3, 5], stride_sizes: List[int] = None): 18 | nn.Module.__init__(self) 19 | self.config_keys = ['in_word_embedding_dimension', 'out_channels', 'kernel_sizes'] 20 | self.in_word_embedding_dimension = in_word_embedding_dimension 21 | self.out_channels = out_channels 22 | self.kernel_sizes = kernel_sizes 23 | 24 | self.embeddings_dimension = out_channels*len(kernel_sizes) 25 | self.convs = nn.ModuleList() 26 | 27 | in_channels = in_word_embedding_dimension 28 | if stride_sizes is None: 29 | stride_sizes = [1] * len(kernel_sizes) 30 | 31 | for kernel_size, stride in zip(kernel_sizes, stride_sizes): 
32 | padding_size = int((kernel_size - 1) / 2) 33 | conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, 34 | kernel_size=kernel_size, 35 | stride=stride, 36 | padding=padding_size) 37 | self.convs.append(conv) 38 | 39 | def forward(self, features): 40 | token_embeddings = features['token_embeddings'] 41 | 42 | token_embeddings = token_embeddings.transpose(1, -1) 43 | vectors = [conv(token_embeddings) for conv in self.convs] 44 | out = torch.cat(vectors, 1).transpose(1, -1) 45 | 46 | features.update({'token_embeddings': out}) 47 | return features 48 | 49 | def get_word_embedding_dimension(self) -> int: 50 | return self.embeddings_dimension 51 | 52 | def tokenize(self, text: str) -> List[int]: 53 | raise NotImplementedError() 54 | 55 | def save(self, output_path: str): 56 | with open(os.path.join(output_path, 'cnn_config.json'), 'w') as fOut: 57 | json.dump(self.get_config_dict(), fOut, indent=2) 58 | 59 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 60 | 61 | def get_config_dict(self): 62 | return {key: self.__dict__[key] for key in self.config_keys} 63 | 64 | @staticmethod 65 | def load(input_path: str): 66 | with open(os.path.join(input_path, 'cnn_config.json'), 'r') as fIn: 67 | config = json.load(fIn) 68 | 69 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu')) 70 | model = CNN(**config) 71 | model.load_state_dict(weights) 72 | return model 73 | 74 | -------------------------------------------------------------------------------- /SentEval/senteval/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | from __future__ import absolute_import, division, unicode_literals 9 | 10 | import numpy as np 11 | import re 12 | import inspect 13 | from torch import optim 14 | 15 | 16 | def create_dictionary(sentences): 17 | words = {} 18 | for s in sentences: 19 | for word in s: 20 | if word in words: 21 | words[word] += 1 22 | else: 23 | words[word] = 1 24 | words[''] = 1e9 + 4 25 | words[''] = 1e9 + 3 26 | words['
<p>
'] = 1e9 + 2 27 | # words[''] = 1e9 + 1 28 | sorted_words = sorted(words.items(), key=lambda x: -x[1]) # inverse sort 29 | id2word = [] 30 | word2id = {} 31 | for i, (w, _) in enumerate(sorted_words): 32 | id2word.append(w) 33 | word2id[w] = i 34 | 35 | return id2word, word2id 36 | 37 | 38 | def cosine(u, v): 39 | return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)) 40 | 41 | 42 | class dotdict(dict): 43 | """ dot.notation access to dictionary attributes """ 44 | __getattr__ = dict.get 45 | __setattr__ = dict.__setitem__ 46 | __delattr__ = dict.__delitem__ 47 | 48 | 49 | def get_optimizer(s): 50 | """ 51 | Parse optimizer parameters. 52 | Input should be of the form: 53 | - "sgd,lr=0.01" 54 | - "adagrad,lr=0.1,lr_decay=0.05" 55 | """ 56 | if "," in s: 57 | method = s[:s.find(',')] 58 | optim_params = {} 59 | for x in s[s.find(',') + 1:].split(','): 60 | split = x.split('=') 61 | assert len(split) == 2 62 | assert re.match("^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None 63 | optim_params[split[0]] = float(split[1]) 64 | else: 65 | method = s 66 | optim_params = {} 67 | 68 | if method == 'adadelta': 69 | optim_fn = optim.Adadelta 70 | elif method == 'adagrad': 71 | optim_fn = optim.Adagrad 72 | elif method == 'adam': 73 | optim_fn = optim.Adam 74 | elif method == 'adamax': 75 | optim_fn = optim.Adamax 76 | elif method == 'asgd': 77 | optim_fn = optim.ASGD 78 | elif method == 'rmsprop': 79 | optim_fn = optim.RMSprop 80 | elif method == 'rprop': 81 | optim_fn = optim.Rprop 82 | elif method == 'sgd': 83 | optim_fn = optim.SGD 84 | assert 'lr' in optim_params 85 | else: 86 | raise Exception('Unknown optimization method: "%s"' % method) 87 | 88 | # check that we give good parameters to the optimizer 89 | expected_args = inspect.getargspec(optim_fn.__init__)[0] 90 | assert expected_args[:2] == ['self', 'params'] 91 | if not all(k in expected_args[2:] for k in optim_params.keys()): 92 | raise Exception('Unexpected parameters: expected "%s", got "%s"' % ( 93 | str(expected_args[2:]), str(optim_params.keys()))) 94 | 95 | return optim_fn, optim_params 96 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/CLIPModel.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import transformers 3 | import torch 4 | from PIL import Image 5 | 6 | 7 | class CLIPModel(nn.Module): 8 | def __init__(self, model_name: str = "openai/clip-vit-base-patch32", processor_name = None): 9 | super(CLIPModel, self).__init__() 10 | 11 | if processor_name is None: 12 | processor_name = model_name 13 | 14 | self.model = transformers.CLIPModel.from_pretrained(model_name) 15 | self.processor = transformers.CLIPProcessor.from_pretrained(processor_name) 16 | 17 | def __repr__(self): 18 | return "CLIPModel()" 19 | 20 | def forward(self, features): 21 | image_embeds = [] 22 | text_embeds = [] 23 | 24 | if 'pixel_values' in features: 25 | vision_outputs = self.model.vision_model(pixel_values=features['pixel_values']) 26 | image_embeds = self.model.visual_projection(vision_outputs[1]) 27 | 28 | if 'input_ids' in features: 29 | text_outputs = self.model.text_model( 30 | input_ids=features.get('input_ids'), 31 | attention_mask=features.get('attention_mask', None), 32 | position_ids=features.get('position_ids', None), 33 | output_attentions=features.get('output_attentions', None), 34 | output_hidden_states=features.get('output_hidden_states', None), 35 | ) 36 | text_embeds = 
self.model.text_projection(text_outputs[1]) 37 | 38 | sentence_embedding = [] 39 | image_features = iter(image_embeds) 40 | text_features = iter(text_embeds) 41 | 42 | for idx, input_type in enumerate(features['image_text_info']): 43 | if input_type == 0: 44 | sentence_embedding.append(next(image_features)) 45 | else: 46 | sentence_embedding.append(next(text_features)) 47 | 48 | features['sentence_embedding'] = torch.stack(sentence_embedding).float() 49 | 50 | return features 51 | 52 | 53 | def tokenize(self, texts): 54 | images = [] 55 | texts_values = [] 56 | image_text_info = [] 57 | 58 | for idx, data in enumerate(texts): 59 | if isinstance(data, Image.Image): # An Image 60 | images.append(data) 61 | image_text_info.append(0) 62 | else: # A text 63 | texts_values.append(data) 64 | image_text_info.append(1) 65 | 66 | if len(texts_values) == 0: 67 | texts_values = None 68 | if len(images) == 0: 69 | images = None 70 | 71 | inputs = self.processor(text=texts_values, images=images, return_tensors="pt", padding=True) 72 | inputs['image_text_info'] = image_text_info 73 | return inputs 74 | 75 | 76 | def save(self, output_path: str): 77 | self.model.save_pretrained(output_path) 78 | self.processor.save_pretrained(output_path) 79 | 80 | @staticmethod 81 | def load(input_path: str): 82 | return CLIPModel(model_name=input_path) 83 | 84 | 85 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/LabelAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import torch 3 | from torch.utils.data import DataLoader 4 | import logging 5 | from ..util import batch_to_device 6 | import os 7 | import csv 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class LabelAccuracyEvaluator(SentenceEvaluator): 13 | """ 14 | Evaluate a model based on its accuracy on a labeled dataset 15 | 16 | This requires a model with LossFunction.SOFTMAX 17 | 18 | The results are written in a CSV. If a CSV already exists, then values are appended. 
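A usage sketch (names illustrative; assumes a DataLoader of labeled InputExamples and the softmax classification head used during training):
        evaluator = LabelAccuracyEvaluator(dev_dataloader, name='dev', softmax_model=train_loss)
        accuracy = evaluator(model, output_path='output/')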
19 | """ 20 | 21 | def __init__(self, dataloader: DataLoader, name: str = "", softmax_model = None, write_csv: bool = True): 22 | """ 23 | Constructs an evaluator for the given dataset 24 | 25 | :param dataloader: 26 | the data for the evaluation 27 | """ 28 | self.dataloader = dataloader 29 | self.name = name 30 | self.softmax_model = softmax_model 31 | 32 | if name: 33 | name = "_"+name 34 | 35 | self.write_csv = write_csv 36 | self.csv_file = "accuracy_evaluation"+name+"_results.csv" 37 | self.csv_headers = ["epoch", "steps", "accuracy"] 38 | 39 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 40 | model.eval() 41 | total = 0 42 | correct = 0 43 | 44 | if epoch != -1: 45 | if steps == -1: 46 | out_txt = " after epoch {}:".format(epoch) 47 | else: 48 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 49 | else: 50 | out_txt = ":" 51 | 52 | logger.info("Evaluation on the "+self.name+" dataset"+out_txt) 53 | self.dataloader.collate_fn = model.smart_batching_collate 54 | for step, batch in enumerate(self.dataloader): 55 | features, label_ids = batch 56 | for idx in range(len(features)): 57 | features[idx] = batch_to_device(features[idx], model.device) 58 | label_ids = label_ids.to(model.device) 59 | with torch.no_grad(): 60 | _, prediction = self.softmax_model(features, labels=None) 61 | 62 | total += prediction.size(0) 63 | correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item() 64 | accuracy = correct/total 65 | 66 | logger.info("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total)) 67 | 68 | if output_path is not None and self.write_csv: 69 | csv_path = os.path.join(output_path, self.csv_file) 70 | if not os.path.isfile(csv_path): 71 | with open(csv_path, newline='', mode="w", encoding="utf-8") as f: 72 | writer = csv.writer(f) 73 | writer.writerow(self.csv_headers) 74 | writer.writerow([epoch, steps, accuracy]) 75 | else: 76 | with open(csv_path, newline='', mode="a", encoding="utf-8") as f: 77 | writer = csv.writer(f) 78 | writer.writerow([epoch, steps, accuracy]) 79 | 80 | return accuracy 81 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/MSEEvaluator.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers.evaluation import SentenceEvaluator 2 | import numpy as np 3 | import logging 4 | import os 5 | import csv 6 | from typing import List 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class MSEEvaluator(SentenceEvaluator): 12 | """ 13 | Computes the mean squared error (x100) between the computed sentence embedding 14 | and some target sentence embedding. 15 | 16 | The MSE is computed between ||teacher.encode(source_sentences) - student.encode(target_sentences)||. 17 | 18 | For multilingual knowledge distillation (https://arxiv.org/abs/2004.09813), source_sentences are in English 19 | and target_sentences are in a different language like German, Chinese, Spanish... 20 | 21 | :param source_sentences: Source sentences are embedded with the teacher model 22 | :param target_sentences: Target sentences are ambedding with the student model. 
23 | :param show_progress_bar: Show progress bar when computing embeddings 24 | :param batch_size: Batch size to compute sentence embeddings 25 | :param name: Name of the evaluator 26 | :param write_csv: Write results to CSV file 27 | """ 28 | def __init__(self, source_sentences: List[str], target_sentences: List[str], teacher_model = None, show_progress_bar: bool = False, batch_size: int = 32, name: str = '', write_csv: bool = True): 29 | self.source_embeddings = teacher_model.encode(source_sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_numpy=True) 30 | 31 | self.target_sentences = target_sentences 32 | self.show_progress_bar = show_progress_bar 33 | self.batch_size = batch_size 34 | self.name = name 35 | 36 | self.csv_file = "mse_evaluation_" + name + "_results.csv" 37 | self.csv_headers = ["epoch", "steps", "MSE"] 38 | self.write_csv = write_csv 39 | 40 | def __call__(self, model, output_path, epoch = -1, steps = -1): 41 | if epoch != -1: 42 | if steps == -1: 43 | out_txt = " after epoch {}:".format(epoch) 44 | else: 45 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 46 | else: 47 | out_txt = ":" 48 | 49 | target_embeddings = model.encode(self.target_sentences, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_numpy=True) 50 | 51 | mse = ((self.source_embeddings - target_embeddings)**2).mean() 52 | mse *= 100 53 | 54 | logger.info("MSE evaluation (lower = better) on "+self.name+" dataset"+out_txt) 55 | logger.info("MSE (*100):\t{:4f}".format(mse)) 56 | 57 | if output_path is not None and self.write_csv: 58 | csv_path = os.path.join(output_path, self.csv_file) 59 | output_file_exists = os.path.isfile(csv_path) 60 | with open(csv_path, newline='', mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 61 | writer = csv.writer(f) 62 | if not output_file_exists: 63 | writer.writerow(self.csv_headers) 64 | 65 | writer.writerow([epoch, steps, mse]) 66 | 67 | return -mse #Return negative score as SentenceTransformers maximizes the performance 68 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/WordWeights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class WordWeights(nn.Module): 13 | """This model can weight word embeddings, for example, with idf-values.""" 14 | 15 | def __init__(self, vocab: List[str], word_weights: Dict[str, float], unknown_word_weight: float = 1): 16 | """ 17 | 18 | :param vocab: 19 | Vocabulary of the tokenizer 20 | :param word_weights: 21 | Mapping of tokens to a float weight value. Words embeddings are multiplied by this float value. Tokens in word_weights must not be equal to the vocab (can contain more or less values) 22 | :param unknown_word_weight: 23 | Weight for words in vocab, that do not appear in the word_weights lookup. These can be for example rare words in the vocab, where no weight exists. 
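A small sketch (weights illustrative, idf-style): given word_weights = {'the': 0.2, 'movie': 1.7}, WordWeights(vocab=embedding_vocab, word_weights=word_weights, unknown_word_weight=1.0) scales each token embedding by its weight before pooling; embedding_vocab here stands for the word list of the preceding embedding layer and is assumed to exist in the calling code.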
24 | """ 25 | super(WordWeights, self).__init__() 26 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight'] 27 | self.vocab = vocab 28 | self.word_weights = word_weights 29 | self.unknown_word_weight = unknown_word_weight 30 | 31 | weights = [] 32 | num_unknown_words = 0 33 | for word in vocab: 34 | weight = unknown_word_weight 35 | if word in word_weights: 36 | weight = word_weights[word] 37 | elif word.lower() in word_weights: 38 | weight = word_weights[word.lower()] 39 | else: 40 | num_unknown_words += 1 41 | weights.append(weight) 42 | 43 | logger.info("{} of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 44 | 45 | self.emb_layer = nn.Embedding(len(vocab), 1) 46 | self.emb_layer.load_state_dict({'weight': torch.FloatTensor(weights).unsqueeze(1)}) 47 | 48 | 49 | def forward(self, features: Dict[str, Tensor]): 50 | attention_mask = features['attention_mask'] 51 | token_embeddings = features['token_embeddings'] 52 | 53 | #Compute a weight value for each token 54 | token_weights_raw = self.emb_layer(features['input_ids']).squeeze(-1) 55 | token_weights = token_weights_raw * attention_mask.float() 56 | token_weights_sum = torch.sum(token_weights, 1) 57 | 58 | #Multiply embedding by token weight value 59 | token_weights_expanded = token_weights.unsqueeze(-1).expand(token_embeddings.size()) 60 | token_embeddings = token_embeddings * token_weights_expanded 61 | 62 | features.update({'token_embeddings': token_embeddings, 'token_weights_sum': token_weights_sum}) 63 | return features 64 | 65 | def get_config_dict(self): 66 | return {key: self.__dict__[key] for key in self.config_keys} 67 | 68 | def save(self, output_path): 69 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 70 | json.dump(self.get_config_dict(), fOut, indent=2) 71 | 72 | @staticmethod 73 | def load(input_path): 74 | with open(os.path.join(input_path, 'config.json')) as fIn: 75 | config = json.load(fIn) 76 | 77 | return WordWeights(**config) 78 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/BoW.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | import numpy as np 9 | from .tokenizer import WhitespaceTokenizer 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class BoW(nn.Module): 15 | """Implements a Bag-of-Words (BoW) model to derive sentence embeddings. 16 | 17 | A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab. 
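A toy sketch: BoW(vocab=['good', 'bad', 'movie']).tokenize(['good movie movie']) returns {'sentence_embedding': ...} with a 3-dimensional count vector holding 1 at the index of 'good' and 2 at the index of 'movie' (index order follows the internally de-duplicated vocab).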
18 | """ 19 | 20 | def __init__(self, vocab: List[str], word_weights: Dict[str, float] = {}, unknown_word_weight: float = 1, cumulative_term_frequency: bool = True): 21 | super(BoW, self).__init__() 22 | vocab = list(set(vocab)) #Ensure vocab is unique 23 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight', 'cumulative_term_frequency'] 24 | self.vocab = vocab 25 | self.word_weights = word_weights 26 | self.unknown_word_weight = unknown_word_weight 27 | self.cumulative_term_frequency = cumulative_term_frequency 28 | 29 | #Maps wordIdx -> word weight 30 | self.weights = [] 31 | num_unknown_words = 0 32 | for word in vocab: 33 | weight = unknown_word_weight 34 | if word in word_weights: 35 | weight = word_weights[word] 36 | elif word.lower() in word_weights: 37 | weight = word_weights[word.lower()] 38 | else: 39 | num_unknown_words += 1 40 | self.weights.append(weight) 41 | 42 | logger.info("{} out of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 43 | 44 | self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False) 45 | self.sentence_embedding_dimension = len(vocab) 46 | 47 | 48 | def forward(self, features: Dict[str, Tensor]): 49 | #Nothing to do, everything is done in get_sentence_features 50 | return features 51 | 52 | def tokenize(self, texts: List[str]) -> List[int]: 53 | tokenized = [self.tokenizer.tokenize(text) for text in texts] 54 | return self.get_sentence_features(tokenized) 55 | 56 | def get_sentence_embedding_dimension(self): 57 | return self.sentence_embedding_dimension 58 | 59 | def get_sentence_features(self, tokenized_texts: List[List[int]], pad_seq_length: int = 0): 60 | vectors = [] 61 | 62 | for tokens in tokenized_texts: 63 | vector = np.zeros(self.get_sentence_embedding_dimension(), dtype=np.float32) 64 | for token in tokens: 65 | if self.cumulative_term_frequency: 66 | vector[token] += self.weights[token] 67 | else: 68 | vector[token] = self.weights[token] 69 | vectors.append(vector) 70 | 71 | return {'sentence_embedding': torch.tensor(vectors, dtype=torch.float)} 72 | 73 | def get_config_dict(self): 74 | return {key: self.__dict__[key] for key in self.config_keys} 75 | 76 | def save(self, output_path): 77 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 78 | json.dump(self.get_config_dict(), fOut, indent=2) 79 | 80 | @staticmethod 81 | def load(input_path): 82 | with open(os.path.join(input_path, 'config.json')) as fIn: 83 | config = json.load(fIn) 84 | 85 | return BoW(**config) 86 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/T5.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import T5Model, T5Tokenizer 3 | import json 4 | from typing import List, Dict, Optional 5 | import os 6 | import numpy as np 7 | import logging 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class T5(nn.Module): 13 | """DEPRECATED: Please use models.Transformer instead. 14 | 15 | T5 model to generate token embeddings. 16 | 17 | Each token is mapped to an output vector from BERT. 
18 | """ 19 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, task_identifier: str = 'stsb sentence1: ', model_args: Dict = {}, tokenizer_args: Dict = {}): 20 | super(T5, self).__init__() 21 | self.config_keys = ['max_seq_length', 'do_lower_case', 'task_identifier'] 22 | self.do_lower_case = do_lower_case 23 | 24 | if max_seq_length > 512: 25 | logger.warning("T5 only allows a max_seq_length of 512. Value will be set to 512") 26 | max_seq_length = 512 27 | self.max_seq_length = max_seq_length 28 | 29 | if self.do_lower_case is not None: 30 | tokenizer_args['do_lower_case'] = do_lower_case 31 | 32 | self.t5model = T5Model.from_pretrained(model_name_or_path, **model_args) 33 | self.tokenizer = T5Tokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 34 | self.task_identifier = task_identifier 35 | 36 | def forward(self, features): 37 | """Returns token_embeddings, cls_token""" 38 | output_states = self.t5model.encoder(input_ids=features['input_ids'], attention_mask=features['attention_mask']) 39 | output_tokens = output_states[0] 40 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 41 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens}) 42 | 43 | if len(output_states) > 1: 44 | features.update({'all_layer_embeddings': output_states[1]}) 45 | 46 | return features 47 | 48 | def get_word_embedding_dimension(self) -> int: 49 | return self.t5model.config.hidden_size 50 | 51 | def tokenize(self, text: str) -> List[int]: 52 | """ 53 | Tokenizes a text and maps tokens to token-ids 54 | """ 55 | return self.tokenizer.encode(self.task_identifier+text) 56 | 57 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 58 | """ 59 | Convert tokenized sentence in its embedding ids, segment ids and mask 60 | 61 | :param tokens: 62 | a tokenized sentence 63 | :param pad_seq_length: 64 | the maximal length of the sequence. 
Cannot be greater than self.sentence_transformer_config.max_seq_length 65 | :return: embedding ids, segment ids and mask for the sentence 66 | """ 67 | 68 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 69 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, padding='max_length', return_tensors='pt', truncation=True, prepend_batch_axis=True) 70 | 71 | def get_config_dict(self): 72 | return {key: self.__dict__[key] for key in self.config_keys} 73 | 74 | def save(self, output_path: str): 75 | self.t5model.save_pretrained(output_path) 76 | self.tokenizer.save_pretrained(output_path) 77 | 78 | with open(os.path.join(output_path, 'sentence_T5_config.json'), 'w') as fOut: 79 | json.dump(self.get_config_dict(), fOut, indent=2) 80 | 81 | @staticmethod 82 | def load(input_path: str): 83 | with open(os.path.join(input_path, 'sentence_T5_config.json')) as fIn: 84 | config = json.load(fIn) 85 | return T5(model_name_or_path=input_path, **config) 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/WordTokenizer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Union, Tuple, List, Iterable, Dict 3 | 4 | ENGLISH_STOP_WORDS = ['!', '"', "''", "``", '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', 'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'aren', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldn', 'couldnt', 'cry', 'd', 'de', 'describe', 'detail', 'did', 'didn', 'do', 'does', 'doesn', 'doing', 'don', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'hadn', 'has', 'hasn', 'hasnt', 'have', 'haven', 'having', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'isn', 'it', 'its', 'itself', 'just', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'll', 'ltd', 'm', 'ma', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mightn', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'mustn', 'my', 'myself', 'name', 'namely', 'needn', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'o', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'rather', 
're', 's', 'same', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'shan', 'she', 'should', 'shouldn', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'system', 't', 'take', 'ten', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until', 'up', 'upon', 'us', 've', 'very', 'via', 'was', 'wasn', 'we', 'well', 'were', 'weren', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'won', 'would', 'wouldn', 'y', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves'] 5 | 6 | 7 | class WordTokenizer(ABC): 8 | @abstractmethod 9 | def set_vocab(self, vocab: Iterable[str]): 10 | pass 11 | 12 | @abstractmethod 13 | def get_vocab(self, vocab: Iterable[str]): 14 | pass 15 | 16 | @abstractmethod 17 | def tokenize(self, text: str) -> List[int]: 18 | pass 19 | 20 | @abstractmethod 21 | def save(self, output_path: str): 22 | pass 23 | 24 | @staticmethod 25 | @abstractmethod 26 | def load(input_path: str): 27 | pass -------------------------------------------------------------------------------- /SentEval/senteval/trec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | TREC question-type classification 10 | ''' 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import os 15 | import io 16 | import logging 17 | import numpy as np 18 | 19 | from senteval.tools.validation import KFoldClassifier 20 | 21 | 22 | class TRECEval(object): 23 | def __init__(self, task_path, seed=1111): 24 | logging.info('***** Transfer task : TREC *****\n\n') 25 | self.seed = seed 26 | self.train = self.loadFile(os.path.join(task_path, 'train_5500.label')) 27 | self.test = self.loadFile(os.path.join(task_path, 'TREC_10.label')) 28 | 29 | def do_prepare(self, params, prepare): 30 | samples = self.train['X'] + self.test['X'] 31 | return prepare(params, samples) 32 | 33 | def loadFile(self, fpath): 34 | trec_data = {'X': [], 'y': []} 35 | tgt2idx = {'ABBR': 0, 'DESC': 1, 'ENTY': 2, 36 | 'HUM': 3, 'LOC': 4, 'NUM': 5} 37 | with io.open(fpath, 'r', encoding='latin-1') as f: 38 | for line in f: 39 | target, sample = line.strip().split(':', 1) 40 | sample = sample.split(' ', 1)[1].split() 41 | assert target in tgt2idx, target 42 | trec_data['X'].append(sample) 43 | trec_data['y'].append(tgt2idx[target]) 44 | return trec_data 45 | 46 | def run(self, params, batcher): 47 | train_embeddings, test_embeddings = [], [] 48 | 49 | # Sort to reduce padding 50 | sorted_corpus_train = sorted(zip(self.train['X'], self.train['y']), 51 | key=lambda z: (len(z[0]), z[1])) 52 | train_samples = [x for (x, y) in sorted_corpus_train] 53 | train_labels = [y for (x, y) in sorted_corpus_train] 54 | 55 | sorted_corpus_test = sorted(zip(self.test['X'], self.test['y']), 56 | key=lambda z: (len(z[0]), z[1])) 57 | test_samples = [x for (x, y) in sorted_corpus_test] 58 | test_labels = [y for (x, y) in sorted_corpus_test] 59 | 60 | # Get train embeddings 61 | for ii in range(0, len(train_labels), params.batch_size): 62 | batch = train_samples[ii:ii + params.batch_size] 63 | embeddings = batcher(params, batch) 64 | train_embeddings.append(embeddings) 65 | train_embeddings = np.vstack(train_embeddings) 66 | logging.info('Computed train embeddings') 67 | 68 | # Get test embeddings 69 | for ii in range(0, len(test_labels), params.batch_size): 70 | batch = test_samples[ii:ii + params.batch_size] 71 | embeddings = batcher(params, batch) 72 | test_embeddings.append(embeddings) 73 | test_embeddings = np.vstack(test_embeddings) 74 | logging.info('Computed test embeddings') 75 | 76 | config_classifier = {'nclasses': 6, 'seed': self.seed, 77 | 'usepytorch': params.usepytorch, 78 | 'classifier': params.classifier, 79 | 'kfold': params.kfold} 80 | clf = KFoldClassifier({'X': train_embeddings, 81 | 'y': np.array(train_labels)}, 82 | {'X': test_embeddings, 83 | 'y': np.array(test_labels)}, 84 | config_classifier) 85 | devacc, testacc, _ = clf.run() 86 | logging.debug('\nDev acc : {0} Test acc : {1} \ 87 | for TREC\n'.format(devacc, testacc)) 88 | return {'devacc': devacc, 'acc': testacc, 89 | 'ndev': len(self.train['X']), 'ntest': len(self.test['X'])} 90 | -------------------------------------------------------------------------------- /SentEval/examples/bow.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | from __future__ import absolute_import, division, unicode_literals 9 | 10 | import sys 11 | import io 12 | import numpy as np 13 | import logging 14 | 15 | 16 | # Set PATHs 17 | PATH_TO_SENTEVAL = '../' 18 | PATH_TO_DATA = '../data' 19 | # PATH_TO_VEC = 'glove/glove.840B.300d.txt' 20 | PATH_TO_VEC = 'fasttext/crawl-300d-2M.vec' 21 | 22 | # import SentEval 23 | sys.path.insert(0, PATH_TO_SENTEVAL) 24 | import senteval 25 | 26 | 27 | # Create dictionary 28 | def create_dictionary(sentences, threshold=0): 29 | words = {} 30 | for s in sentences: 31 | for word in s: 32 | words[word] = words.get(word, 0) + 1 33 | 34 | if threshold > 0: 35 | newwords = {} 36 | for word in words: 37 | if words[word] >= threshold: 38 | newwords[word] = words[word] 39 | words = newwords 40 | words['<s>'] = 1e9 + 4 41 | words['</s>'] = 1e9 + 3 42 | words['<p>
'] = 1e9 + 2 43 | 44 | sorted_words = sorted(words.items(), key=lambda x: -x[1]) # inverse sort 45 | id2word = [] 46 | word2id = {} 47 | for i, (w, _) in enumerate(sorted_words): 48 | id2word.append(w) 49 | word2id[w] = i 50 | 51 | return id2word, word2id 52 | 53 | # Get word vectors from vocabulary (glove, word2vec, fasttext ..) 54 | def get_wordvec(path_to_vec, word2id): 55 | word_vec = {} 56 | 57 | with io.open(path_to_vec, 'r', encoding='utf-8') as f: 58 | # if word2vec or fasttext file : skip first line "next(f)" 59 | for line in f: 60 | word, vec = line.split(' ', 1) 61 | if word in word2id: 62 | word_vec[word] = np.fromstring(vec, sep=' ') 63 | 64 | logging.info('Found {0} words with word vectors, out of \ 65 | {1} words'.format(len(word_vec), len(word2id))) 66 | return word_vec 67 | 68 | 69 | # SentEval prepare and batcher 70 | def prepare(params, samples): 71 | _, params.word2id = create_dictionary(samples) 72 | params.word_vec = get_wordvec(PATH_TO_VEC, params.word2id) 73 | params.wvec_dim = 300 74 | return 75 | 76 | def batcher(params, batch): 77 | batch = [sent if sent != [] else ['.'] for sent in batch] 78 | embeddings = [] 79 | 80 | for sent in batch: 81 | sentvec = [] 82 | for word in sent: 83 | if word in params.word_vec: 84 | sentvec.append(params.word_vec[word]) 85 | if not sentvec: 86 | vec = np.zeros(params.wvec_dim) 87 | sentvec.append(vec) 88 | sentvec = np.mean(sentvec, 0) 89 | embeddings.append(sentvec) 90 | 91 | embeddings = np.vstack(embeddings) 92 | return embeddings 93 | 94 | 95 | # Set params for SentEval 96 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 97 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 98 | 'tenacity': 3, 'epoch_size': 2} 99 | 100 | # Set up logger 101 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 102 | 103 | if __name__ == "__main__": 104 | se = senteval.engine.SE(params_senteval, batcher, prepare) 105 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 106 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 107 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 108 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 109 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 110 | 'OddManOut', 'CoordinationInversion'] 111 | results = se.eval(transfer_tasks) 112 | print(results) 113 | -------------------------------------------------------------------------------- /SentEval/senteval/binary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | Binary classifier and corresponding datasets : MR, CR, SUBJ, MPQA 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import io 14 | import os 15 | import numpy as np 16 | import logging 17 | 18 | from senteval.tools.validation import InnerKFoldClassifier 19 | 20 | 21 | class BinaryClassifierEval(object): 22 | def __init__(self, pos, neg, seed=1111): 23 | self.seed = seed 24 | self.samples, self.labels = pos + neg, [1] * len(pos) + [0] * len(neg) 25 | self.n_samples = len(self.samples) 26 | 27 | def do_prepare(self, params, prepare): 28 | # prepare is given the whole text 29 | return prepare(params, self.samples) 30 | # prepare puts everything it outputs in "params" : params.word2id etc 31 | # Those output will be further used by "batcher". 32 | 33 | def loadFile(self, fpath): 34 | with io.open(fpath, 'r', encoding='latin-1') as f: 35 | return [line.split() for line in f.read().splitlines()] 36 | 37 | def run(self, params, batcher): 38 | enc_input = [] 39 | # Sort to reduce padding 40 | sorted_corpus = sorted(zip(self.samples, self.labels), 41 | key=lambda z: (len(z[0]), z[1])) 42 | sorted_samples = [x for (x, y) in sorted_corpus] 43 | sorted_labels = [y for (x, y) in sorted_corpus] 44 | logging.info('Generating sentence embeddings') 45 | for ii in range(0, self.n_samples, params.batch_size): 46 | batch = sorted_samples[ii:ii + params.batch_size] 47 | embeddings = batcher(params, batch) 48 | enc_input.append(embeddings) 49 | enc_input = np.vstack(enc_input) 50 | logging.info('Generated sentence embeddings') 51 | 52 | config = {'nclasses': 2, 'seed': self.seed, 53 | 'usepytorch': params.usepytorch, 54 | 'classifier': params.classifier, 55 | 'nhid': params.nhid, 'kfold': params.kfold} 56 | clf = InnerKFoldClassifier(enc_input, np.array(sorted_labels), config) 57 | devacc, testacc = clf.run() 58 | logging.debug('Dev acc : {0} Test acc : {1}\n'.format(devacc, testacc)) 59 | return {'devacc': devacc, 'acc': testacc, 'ndev': self.n_samples, 60 | 'ntest': self.n_samples} 61 | 62 | 63 | class CREval(BinaryClassifierEval): 64 | def __init__(self, task_path, seed=1111): 65 | logging.debug('***** Transfer task : CR *****\n\n') 66 | pos = self.loadFile(os.path.join(task_path, 'custrev.pos')) 67 | neg = self.loadFile(os.path.join(task_path, 'custrev.neg')) 68 | super(self.__class__, self).__init__(pos, neg, seed) 69 | 70 | 71 | class MREval(BinaryClassifierEval): 72 | def __init__(self, task_path, seed=1111): 73 | logging.debug('***** Transfer task : MR *****\n\n') 74 | pos = self.loadFile(os.path.join(task_path, 'rt-polarity.pos')) 75 | neg = self.loadFile(os.path.join(task_path, 'rt-polarity.neg')) 76 | super(self.__class__, self).__init__(pos, neg, seed) 77 | 78 | 79 | class SUBJEval(BinaryClassifierEval): 80 | def __init__(self, task_path, seed=1111): 81 | logging.debug('***** Transfer task : SUBJ *****\n\n') 82 | obj = self.loadFile(os.path.join(task_path, 'subj.objective')) 83 | subj = self.loadFile(os.path.join(task_path, 'subj.subjective')) 84 | super(self.__class__, self).__init__(obj, subj, seed) 85 | 86 | 87 | class MPQAEval(BinaryClassifierEval): 88 | def __init__(self, task_path, seed=1111): 89 | logging.debug('***** Transfer task : MPQA *****\n\n') 90 | pos = self.loadFile(os.path.join(task_path, 'mpqa.pos')) 91 | neg = self.loadFile(os.path.join(task_path, 'mpqa.neg')) 92 | super(self.__class__, self).__init__(pos, neg, seed) 93 | -------------------------------------------------------------------------------- 
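A minimal sketch of how the binary transfer tasks above (MR, CR, SUBJ, MPQA) are typically driven through the SentEval engine with a sentence-encoder batcher, in the spirit of the examples/bow.py script above. The model name, data path, and classifier settings here are illustrative assumptions rather than values taken from this repository, and SentEval itself must be importable (installed via its setup.py or added to sys.path) with the downstream data fetched by download_dataset.sh.

import numpy as np
import senteval
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer('all-MiniLM-L6-v2')  # illustrative encoder choice

def prepare(params, samples):
    return  # nothing to precompute for a pretrained sentence encoder

def batcher(params, batch):
    # SentEval hands over tokenized sentences (lists of words); re-join them.
    sentences = [' '.join(sent) if sent != [] else '.' for sent in batch]
    return np.asarray(encoder.encode(sentences))

params = {'task_path': 'SentEval/data/downstream',  # assumed location of the downloaded data
          'usepytorch': True, 'kfold': 5,
          'classifier': {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
                         'tenacity': 5, 'epoch_size': 4}}
se = senteval.engine.SE(params, batcher, prepare)
print(se.eval(['MR', 'CR', 'SUBJ', 'MPQA']))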
/SentEval/senteval/sst.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | SST - binary classification 10 | ''' 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import os 15 | import io 16 | import logging 17 | import numpy as np 18 | 19 | from senteval.tools.validation import SplitClassifier 20 | 21 | 22 | class SSTEval(object): 23 | def __init__(self, task_path, nclasses=2, seed=1111): 24 | self.seed = seed 25 | 26 | # binary of fine-grained 27 | assert nclasses in [2, 5] 28 | self.nclasses = nclasses 29 | self.task_name = 'Binary' if self.nclasses == 2 else 'Fine-Grained' 30 | logging.debug('***** Transfer task : SST %s classification *****\n\n', self.task_name) 31 | 32 | train = self.loadFile(os.path.join(task_path, 'sentiment-train')) 33 | dev = self.loadFile(os.path.join(task_path, 'sentiment-dev')) 34 | test = self.loadFile(os.path.join(task_path, 'sentiment-test')) 35 | self.sst_data = {'train': train, 'dev': dev, 'test': test} 36 | 37 | def do_prepare(self, params, prepare): 38 | samples = self.sst_data['train']['X'] + self.sst_data['dev']['X'] + \ 39 | self.sst_data['test']['X'] 40 | return prepare(params, samples) 41 | 42 | def loadFile(self, fpath): 43 | sst_data = {'X': [], 'y': []} 44 | with io.open(fpath, 'r', encoding='utf-8') as f: 45 | for line in f: 46 | if self.nclasses == 2: 47 | sample = line.strip().split('\t') 48 | sst_data['y'].append(int(sample[1])) 49 | sst_data['X'].append(sample[0].split()) 50 | elif self.nclasses == 5: 51 | sample = line.strip().split(' ', 1) 52 | sst_data['y'].append(int(sample[0])) 53 | sst_data['X'].append(sample[1].split()) 54 | assert max(sst_data['y']) == self.nclasses - 1 55 | return sst_data 56 | 57 | def run(self, params, batcher): 58 | sst_embed = {'train': {}, 'dev': {}, 'test': {}} 59 | bsize = params.batch_size 60 | 61 | for key in self.sst_data: 62 | logging.info('Computing embedding for {0}'.format(key)) 63 | # Sort to reduce padding 64 | sorted_data = sorted(zip(self.sst_data[key]['X'], 65 | self.sst_data[key]['y']), 66 | key=lambda z: (len(z[0]), z[1])) 67 | self.sst_data[key]['X'], self.sst_data[key]['y'] = map(list, zip(*sorted_data)) 68 | 69 | sst_embed[key]['X'] = [] 70 | for ii in range(0, len(self.sst_data[key]['y']), bsize): 71 | batch = self.sst_data[key]['X'][ii:ii + bsize] 72 | embeddings = batcher(params, batch) 73 | sst_embed[key]['X'].append(embeddings) 74 | sst_embed[key]['X'] = np.vstack(sst_embed[key]['X']) 75 | sst_embed[key]['y'] = np.array(self.sst_data[key]['y']) 76 | logging.info('Computed {0} embeddings'.format(key)) 77 | 78 | config_classifier = {'nclasses': self.nclasses, 'seed': self.seed, 79 | 'usepytorch': params.usepytorch, 80 | 'classifier': params.classifier} 81 | 82 | clf = SplitClassifier(X={'train': sst_embed['train']['X'], 83 | 'valid': sst_embed['dev']['X'], 84 | 'test': sst_embed['test']['X']}, 85 | y={'train': sst_embed['train']['y'], 86 | 'valid': sst_embed['dev']['y'], 87 | 'test': sst_embed['test']['y']}, 88 | config=config_classifier) 89 | 90 | devacc, testacc = clf.run() 91 | logging.debug('\nDev acc : {0} Test acc : {1} for \ 92 | SST {2} classification\n'.format(devacc, testacc, self.task_name)) 93 | 94 | return {'devacc': devacc, 'acc': testacc, 95 | 'ndev': len(sst_embed['dev']['X']), 
96 | 'ntest': len(sst_embed['test']['X'])} 97 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/MSEEvaluatorFromDataFrame.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers.evaluation import SentenceEvaluator 2 | from sentence_transformers.util import batch_to_device 3 | from sentence_transformers import SentenceTransformer 4 | from typing import List, Tuple, Dict 5 | import torch 6 | import numpy as np 7 | import logging 8 | import os 9 | import csv 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class MSEEvaluatorFromDataFrame(SentenceEvaluator): 16 | """ 17 | Computes the mean squared error (x100) between the computed sentence embedding 18 | and some target sentence embedding. 19 | :param dataframe: 20 | It must have the following format. Rows contain different, parallel sentences. Columns are the respective language codes 21 | [{'en': 'My sentence', 'es': 'Sentence in Spanish', 'fr': 'Sentence in French'...}, 22 | {'en': 'My second sentence', ....] 23 | :param combinations: 24 | Must be of the format [('en', 'es'), ('en', 'fr'), ...] 25 | First entry in a tuple is the source language. The sentence in the respective language will be fetched from the dataframe and passed to the teacher model. 26 | Second entry in a tuple is the target language. The sentence will be fetched from the dataframe and passed to the student model 27 | """ 28 | def __init__(self, dataframe: List[Dict[str, str]], teacher_model: SentenceTransformer, combinations: List[Tuple[str, str]], batch_size: int = 8, name='', write_csv: bool = True): 29 | 30 | self.combinations = combinations 31 | self.name = name 32 | self.batch_size = batch_size 33 | 34 | 35 | if name: 36 | name = "_"+name 37 | 38 | self.csv_file = "mse_evaluation" + name + "_results.csv" 39 | self.csv_headers = ["epoch", "steps"] 40 | self.write_csv = write_csv 41 | self.data = {} 42 | 43 | logger.info("Compute teacher embeddings") 44 | all_source_sentences = set() 45 | for src_lang, trg_lang in self.combinations: 46 | src_sentences = [] 47 | trg_sentences = [] 48 | 49 | for row in dataframe: 50 | if row[src_lang].strip() != "" and row[trg_lang].strip() != "": 51 | all_source_sentences.add(row[src_lang]) 52 | src_sentences.append(row[src_lang]) 53 | trg_sentences.append(row[trg_lang]) 54 | 55 | self.data[(src_lang, trg_lang)] = (src_sentences, trg_sentences) 56 | self.csv_headers.append("{}-{}".format(src_lang, trg_lang)) 57 | 58 | all_source_sentences = list(all_source_sentences) 59 | all_src_embeddings = teacher_model.encode(all_source_sentences, batch_size=self.batch_size) 60 | self.teacher_embeddings = {sent: emb for sent, emb in zip(all_source_sentences, all_src_embeddings)} 61 | 62 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1): 63 | model.eval() 64 | 65 | mse_scores = [] 66 | for src_lang, trg_lang in self.combinations: 67 | src_sentences, trg_sentences = self.data[(src_lang, trg_lang)] 68 | 69 | src_embeddings = np.asarray([self.teacher_embeddings[sent] for sent in src_sentences]) 70 | trg_embeddings = np.asarray(model.encode(trg_sentences, batch_size=self.batch_size)) 71 | 72 | mse = ((src_embeddings - trg_embeddings) ** 2).mean() 73 | mse *= 100 74 | mse_scores.append(mse) 75 | 76 | logger.info("MSE evaluation on {} dataset - {}-{}:".format(self.name, src_lang, trg_lang)) 77 | logger.info("MSE (*100):\t{:4f}".format(mse)) 78 | 79 | if output_path is
not None and self.write_csv: 80 | csv_path = os.path.join(output_path, self.csv_file) 81 | output_file_exists = os.path.isfile(csv_path) 82 | with open(csv_path, newline='', mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 83 | writer = csv.writer(f) 84 | if not output_file_exists: 85 | writer.writerow(self.csv_headers) 86 | 87 | writer.writerow([epoch, steps]+mse_scores) 88 | 89 | return -np.mean(mse_scores) #Return negative score as SentenceTransformers maximizes the performance 90 | 91 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/tokenizer/PhraseTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | import logging 7 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 8 | import nltk 9 | 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class PhraseTokenizer(WordTokenizer): 14 | """Tokenizes the text with respect to existent phrases in the vocab. 15 | 16 | This tokenizers respects phrases that are in the vocab. Phrases are separated with 'ngram_separator', for example, 17 | in Google News word2vec file, ngrams are separated with a _ like New_York. These phrases are detected in text and merged as one special token. (New York is the ... => [New_York, is, the]) 18 | """ 19 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False, ngram_separator: str = "_", max_ngram_length: int = 5): 20 | self.stop_words = set(stop_words) 21 | self.do_lower_case = do_lower_case 22 | self.ngram_separator = ngram_separator 23 | self.max_ngram_length = max_ngram_length 24 | self.set_vocab(vocab) 25 | 26 | def get_vocab(self): 27 | return self.vocab 28 | 29 | def set_vocab(self, vocab: Iterable[str]): 30 | self.vocab = vocab 31 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 32 | 33 | # Check for ngram in vocab 34 | self.ngram_lookup = set() 35 | self.ngram_lengths = set() 36 | for word in vocab: 37 | 38 | if self.ngram_separator is not None and self.ngram_separator in word: 39 | # Sum words might me malformed in e.g. 
google news word2vec, containing two or more _ after each other 40 | ngram_count = word.count(self.ngram_separator) + 1 41 | if self.ngram_separator + self.ngram_separator not in word and ngram_count <= self.max_ngram_length: 42 | self.ngram_lookup.add(word) 43 | self.ngram_lengths.add(ngram_count) 44 | 45 | if len(vocab) > 0: 46 | logger.info("PhraseTokenizer - Phrase ngram lengths: {}".format(self.ngram_lengths)) 47 | logger.info("PhraseTokenizer - Num phrases: {}".format(len(self.ngram_lookup))) 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | tokens = nltk.word_tokenize(text, preserve_line=True) 51 | 52 | #phrase detection 53 | for ngram_len in sorted(self.ngram_lengths, reverse=True): 54 | idx = 0 55 | while idx <= len(tokens) - ngram_len: 56 | ngram = self.ngram_separator.join(tokens[idx:idx + ngram_len]) 57 | if ngram in self.ngram_lookup: 58 | tokens[idx:idx + ngram_len] = [ngram] 59 | elif ngram.lower() in self.ngram_lookup: 60 | tokens[idx:idx + ngram_len] = [ngram.lower()] 61 | idx += 1 62 | 63 | #Map tokens to idx, filter stop words 64 | tokens_filtered = [] 65 | for token in tokens: 66 | if token in self.stop_words: 67 | continue 68 | elif token in self.word2idx: 69 | tokens_filtered.append(self.word2idx[token]) 70 | continue 71 | 72 | token = token.lower() 73 | if token in self.stop_words: 74 | continue 75 | elif token in self.word2idx: 76 | tokens_filtered.append(self.word2idx[token]) 77 | continue 78 | 79 | token = token.strip(string.punctuation) 80 | if token in self.stop_words: 81 | continue 82 | elif len(token) > 0 and token in self.word2idx: 83 | tokens_filtered.append(self.word2idx[token]) 84 | continue 85 | 86 | return tokens_filtered 87 | 88 | def save(self, output_path: str): 89 | with open(os.path.join(output_path, 'phrasetokenizer_config.json'), 'w') as fOut: 90 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case, 'ngram_separator': self.ngram_separator, 'max_ngram_length': self.max_ngram_length}, fOut) 91 | 92 | @staticmethod 93 | def load(input_path: str): 94 | with open(os.path.join(input_path, 'phrasetokenizer_config.json'), 'r') as fIn: 95 | config = json.load(fIn) 96 | 97 | return PhraseTokenizer(**config) 98 | -------------------------------------------------------------------------------- /SentEval/senteval/mrpc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | MRPC : Microsoft Research Paraphrase (detection) Corpus 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import os 14 | import logging 15 | import numpy as np 16 | import io 17 | 18 | from senteval.tools.validation import KFoldClassifier 19 | 20 | from sklearn.metrics import f1_score 21 | 22 | 23 | class MRPCEval(object): 24 | def __init__(self, task_path, seed=1111): 25 | logging.info('***** Transfer task : MRPC *****\n\n') 26 | self.seed = seed 27 | train = self.loadFile(os.path.join(task_path, 28 | 'msr_paraphrase_train.txt')) 29 | test = self.loadFile(os.path.join(task_path, 30 | 'msr_paraphrase_test.txt')) 31 | self.mrpc_data = {'train': train, 'test': test} 32 | 33 | def do_prepare(self, params, prepare): 34 | # TODO : Should we separate samples in "train, test"? 
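# Note: prepare() is handed every sentence from both splits and both sides of
# each pair, so anything it caches on params (a vocabulary, word vectors, ...)
# is guaranteed to cover the sentences later passed to batcher().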
35 | samples = self.mrpc_data['train']['X_A'] + \ 36 | self.mrpc_data['train']['X_B'] + \ 37 | self.mrpc_data['test']['X_A'] + self.mrpc_data['test']['X_B'] 38 | return prepare(params, samples) 39 | 40 | def loadFile(self, fpath): 41 | mrpc_data = {'X_A': [], 'X_B': [], 'y': []} 42 | with io.open(fpath, 'r', encoding='utf-8') as f: 43 | for line in f: 44 | text = line.strip().split('\t') 45 | mrpc_data['X_A'].append(text[3].split()) 46 | mrpc_data['X_B'].append(text[4].split()) 47 | mrpc_data['y'].append(text[0]) 48 | 49 | mrpc_data['X_A'] = mrpc_data['X_A'][1:] 50 | mrpc_data['X_B'] = mrpc_data['X_B'][1:] 51 | mrpc_data['y'] = [int(s) for s in mrpc_data['y'][1:]] 52 | return mrpc_data 53 | 54 | def run(self, params, batcher): 55 | mrpc_embed = {'train': {}, 'test': {}} 56 | 57 | for key in self.mrpc_data: 58 | logging.info('Computing embedding for {0}'.format(key)) 59 | # Sort to reduce padding 60 | text_data = {} 61 | sorted_corpus = sorted(zip(self.mrpc_data[key]['X_A'], 62 | self.mrpc_data[key]['X_B'], 63 | self.mrpc_data[key]['y']), 64 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 65 | 66 | text_data['A'] = [x for (x, y, z) in sorted_corpus] 67 | text_data['B'] = [y for (x, y, z) in sorted_corpus] 68 | text_data['y'] = [z for (x, y, z) in sorted_corpus] 69 | 70 | for txt_type in ['A', 'B']: 71 | mrpc_embed[key][txt_type] = [] 72 | for ii in range(0, len(text_data['y']), params.batch_size): 73 | batch = text_data[txt_type][ii:ii + params.batch_size] 74 | embeddings = batcher(params, batch) 75 | mrpc_embed[key][txt_type].append(embeddings) 76 | mrpc_embed[key][txt_type] = np.vstack(mrpc_embed[key][txt_type]) 77 | mrpc_embed[key]['y'] = np.array(text_data['y']) 78 | logging.info('Computed {0} embeddings'.format(key)) 79 | 80 | # Train 81 | trainA = mrpc_embed['train']['A'] 82 | trainB = mrpc_embed['train']['B'] 83 | trainF = np.c_[np.abs(trainA - trainB), trainA * trainB] 84 | trainY = mrpc_embed['train']['y'] 85 | 86 | # Test 87 | testA = mrpc_embed['test']['A'] 88 | testB = mrpc_embed['test']['B'] 89 | testF = np.c_[np.abs(testA - testB), testA * testB] 90 | testY = mrpc_embed['test']['y'] 91 | 92 | config = {'nclasses': 2, 'seed': self.seed, 93 | 'usepytorch': params.usepytorch, 94 | 'classifier': params.classifier, 95 | 'nhid': params.nhid, 'kfold': params.kfold} 96 | clf = KFoldClassifier(train={'X': trainF, 'y': trainY}, 97 | test={'X': testF, 'y': testY}, config=config) 98 | 99 | devacc, testacc, yhat = clf.run() 100 | testf1 = round(100*f1_score(testY, yhat), 2) 101 | logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\n' 102 | .format(devacc, testacc, testf1)) 103 | return {'devacc': devacc, 'acc': testacc, 'f1': testf1, 104 | 'ndev': len(trainA), 'ntest': len(testA)} 105 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/TranslationEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import logging 3 | from ..util import pytorch_cos_sim 4 | import os 5 | import csv 6 | import numpy as np 7 | import scipy.spatial 8 | from typing import List 9 | import torch 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class TranslationEvaluator(SentenceEvaluator): 15 | """ 16 | Given two sets of sentences in different languages, e.g. (en_1, en_2, en_3...) and (fr_1, fr_2, fr_3, ...), 17 | and assuming that fr_i is the translation of en_i. 18 | Checks if vec(en_i) has the highest similarity to vec(fr_i). 
Computes the accurarcy in both directions 19 | """ 20 | def __init__(self, source_sentences: List[str], target_sentences: List[str], show_progress_bar: bool = False, batch_size: int = 16, name: str = '', print_wrong_matches: bool = False, write_csv: bool = True): 21 | """ 22 | Constructs an evaluator based for the dataset 23 | 24 | The labels need to indicate the similarity between the sentences. 25 | 26 | :param source_sentences: 27 | List of sentences in source language 28 | :param target_sentences: 29 | List of sentences in target language 30 | :param print_wrong_matches: 31 | Prints incorrect matches 32 | :param write_csv: 33 | Write results to CSV file 34 | """ 35 | self.source_sentences = source_sentences 36 | self.target_sentences = target_sentences 37 | self.name = name 38 | self.batch_size = batch_size 39 | self.show_progress_bar = show_progress_bar 40 | self.print_wrong_matches = print_wrong_matches 41 | 42 | assert len(self.source_sentences) == len(self.target_sentences) 43 | 44 | if name: 45 | name = "_"+name 46 | 47 | self.csv_file = "translation_evaluation"+name+"_results.csv" 48 | self.csv_headers = ["epoch", "steps", "src2trg", "trg2src"] 49 | self.write_csv = write_csv 50 | 51 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 52 | if epoch != -1: 53 | if steps == -1: 54 | out_txt = " after epoch {}:".format(epoch) 55 | else: 56 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 57 | else: 58 | out_txt = ":" 59 | 60 | logger.info("Evaluating translation matching Accuracy on "+self.name+" dataset"+out_txt) 61 | 62 | embeddings1 = torch.stack(model.encode(self.source_sentences, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_numpy=False)) 63 | embeddings2 = torch.stack(model.encode(self.target_sentences, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_numpy=False)) 64 | 65 | 66 | cos_sims = pytorch_cos_sim(embeddings1, embeddings2).detach().cpu().numpy() 67 | 68 | correct_src2trg = 0 69 | correct_trg2src = 0 70 | 71 | for i in range(len(cos_sims)): 72 | max_idx = np.argmax(cos_sims[i]) 73 | 74 | if i == max_idx: 75 | correct_src2trg += 1 76 | elif self.print_wrong_matches: 77 | print("i:", i, "j:", max_idx, "INCORRECT" if i != max_idx else "CORRECT") 78 | print("Src:", self.source_sentences[i]) 79 | print("Trg:", self.target_sentences[max_idx]) 80 | print("Argmax score:", cos_sims[i][max_idx], "vs. 
correct score:", cos_sims[i][i]) 81 | 82 | results = zip(range(len(cos_sims[i])), cos_sims[i]) 83 | results = sorted(results, key=lambda x: x[1], reverse=True) 84 | for idx, score in results[0:5]: 85 | print("\t", idx, "(Score: %.4f)" % (score), self.target_sentences[idx]) 86 | 87 | 88 | 89 | cos_sims = cos_sims.T 90 | for i in range(len(cos_sims)): 91 | max_idx = np.argmax(cos_sims[i]) 92 | if i == max_idx: 93 | correct_trg2src += 1 94 | 95 | acc_src2trg = correct_src2trg / len(cos_sims) 96 | acc_trg2src = correct_trg2src / len(cos_sims) 97 | 98 | logger.info("Accuracy src2trg: {:.2f}".format(acc_src2trg*100)) 99 | logger.info("Accuracy trg2src: {:.2f}".format(acc_trg2src*100)) 100 | 101 | if output_path is not None and self.write_csv: 102 | csv_path = os.path.join(output_path, self.csv_file) 103 | output_file_exists = os.path.isfile(csv_path) 104 | with open(csv_path, newline='', mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 105 | writer = csv.writer(f) 106 | if not output_file_exists: 107 | writer.writerow(self.csv_headers) 108 | 109 | writer.writerow([epoch, steps, acc_src2trg, acc_trg2src]) 110 | 111 | return (acc_src2trg+acc_trg2src)/2 112 | -------------------------------------------------------------------------------- /SentEval/senteval/snli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | SNLI - Entailment 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import codecs 14 | import os 15 | import io 16 | import copy 17 | import logging 18 | import numpy as np 19 | 20 | from senteval.tools.validation import SplitClassifier 21 | 22 | 23 | class SNLIEval(object): 24 | def __init__(self, taskpath, seed=1111): 25 | logging.debug('***** Transfer task : SNLI Entailment*****\n\n') 26 | self.seed = seed 27 | train1 = self.loadFile(os.path.join(taskpath, 's1.train')) 28 | train2 = self.loadFile(os.path.join(taskpath, 's2.train')) 29 | 30 | trainlabels = io.open(os.path.join(taskpath, 'labels.train'), 31 | encoding='utf-8').read().splitlines() 32 | 33 | valid1 = self.loadFile(os.path.join(taskpath, 's1.dev')) 34 | valid2 = self.loadFile(os.path.join(taskpath, 's2.dev')) 35 | validlabels = io.open(os.path.join(taskpath, 'labels.dev'), 36 | encoding='utf-8').read().splitlines() 37 | 38 | test1 = self.loadFile(os.path.join(taskpath, 's1.test')) 39 | test2 = self.loadFile(os.path.join(taskpath, 's2.test')) 40 | testlabels = io.open(os.path.join(taskpath, 'labels.test'), 41 | encoding='utf-8').read().splitlines() 42 | 43 | # sort data (by s2 first) to reduce padding 44 | sorted_train = sorted(zip(train2, train1, trainlabels), 45 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 46 | train2, train1, trainlabels = map(list, zip(*sorted_train)) 47 | 48 | sorted_valid = sorted(zip(valid2, valid1, validlabels), 49 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 50 | valid2, valid1, validlabels = map(list, zip(*sorted_valid)) 51 | 52 | sorted_test = sorted(zip(test2, test1, testlabels), 53 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 54 | test2, test1, testlabels = map(list, zip(*sorted_test)) 55 | 56 | self.samples = train1 + train2 + valid1 + valid2 + test1 + test2 57 | self.data = {'train': (train1, train2, trainlabels), 58 | 'valid': (valid1, valid2, validlabels), 59 | 'test': 
(test1, test2, testlabels) 60 | } 61 | 62 | def do_prepare(self, params, prepare): 63 | return prepare(params, self.samples) 64 | 65 | def loadFile(self, fpath): 66 | with codecs.open(fpath, 'rb', 'latin-1') as f: 67 | return [line.split() for line in 68 | f.read().splitlines()] 69 | 70 | def run(self, params, batcher): 71 | self.X, self.y = {}, {} 72 | dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2} 73 | for key in self.data: 74 | if key not in self.X: 75 | self.X[key] = [] 76 | if key not in self.y: 77 | self.y[key] = [] 78 | 79 | input1, input2, mylabels = self.data[key] 80 | enc_input = [] 81 | n_labels = len(mylabels) 82 | for ii in range(0, n_labels, params.batch_size): 83 | batch1 = input1[ii:ii + params.batch_size] 84 | batch2 = input2[ii:ii + params.batch_size] 85 | 86 | if len(batch1) == len(batch2) and len(batch1) > 0: 87 | enc1 = batcher(params, batch1) 88 | enc2 = batcher(params, batch2) 89 | enc_input.append(np.hstack((enc1, enc2, enc1 * enc2, 90 | np.abs(enc1 - enc2)))) 91 | if (ii*params.batch_size) % (20000*params.batch_size) == 0: 92 | logging.info("PROGRESS (encoding): %.2f%%" % 93 | (100 * ii / n_labels)) 94 | self.X[key] = np.vstack(enc_input) 95 | self.y[key] = [dico_label[y] for y in mylabels] 96 | 97 | config = {'nclasses': 3, 'seed': self.seed, 98 | 'usepytorch': params.usepytorch, 99 | 'cudaEfficient': True, 100 | 'nhid': params.nhid, 'noreg': True} 101 | 102 | config_classifier = copy.deepcopy(params.classifier) 103 | config_classifier['max_epoch'] = 15 104 | config_classifier['epoch_size'] = 1 105 | config['classifier'] = config_classifier 106 | 107 | clf = SplitClassifier(self.X, self.y, config) 108 | devacc, testacc = clf.run() 109 | logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n' 110 | .format(devacc, testacc)) 111 | return {'devacc': devacc, 'acc': testacc, 112 | 'ndev': len(self.data['valid'][0]), 113 | 'ntest': len(self.data['test'][0])} 114 | -------------------------------------------------------------------------------- /SentEval/senteval/rank.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | Image-Caption Retrieval with COCO dataset 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import os 14 | import sys 15 | import logging 16 | import numpy as np 17 | 18 | try: 19 | import cPickle as pickle 20 | except ImportError: 21 | import pickle 22 | 23 | from senteval.tools.ranking import ImageSentenceRankingPytorch 24 | 25 | 26 | class ImageCaptionRetrievalEval(object): 27 | def __init__(self, task_path, seed=1111): 28 | logging.debug('***** Transfer task: Image Caption Retrieval *****\n\n') 29 | 30 | # Get captions and image features 31 | self.seed = seed 32 | train, dev, test = self.loadFile(task_path) 33 | self.coco_data = {'train': train, 'dev': dev, 'test': test} 34 | 35 | def do_prepare(self, params, prepare): 36 | samples = self.coco_data['train']['sent'] + \ 37 | self.coco_data['dev']['sent'] + \ 38 | self.coco_data['test']['sent'] 39 | prepare(params, samples) 40 | 41 | def loadFile(self, fpath): 42 | coco = {} 43 | 44 | for split in ['train', 'valid', 'test']: 45 | list_sent = [] 46 | list_img_feat = [] 47 | if sys.version_info < (3, 0): 48 | with open(os.path.join(fpath, split + '.pkl')) as f: 49 | cocodata = pickle.load(f) 50 | else: 51 | with open(os.path.join(fpath, split + '.pkl'), 'rb') as f: 52 | cocodata = pickle.load(f, encoding='latin1') 53 | 54 | for imgkey in range(len(cocodata['features'])): 55 | assert len(cocodata['image_to_caption_ids'][imgkey]) >= 5, \ 56 | cocodata['image_to_caption_ids'][imgkey] 57 | for captkey in cocodata['image_to_caption_ids'][imgkey][0:5]: 58 | sent = cocodata['captions'][captkey]['cleaned_caption'] 59 | sent += ' .' # add punctuation to end of sentence in COCO 60 | list_sent.append(sent.encode('utf-8').split()) 61 | list_img_feat.append(cocodata['features'][imgkey]) 62 | assert len(list_sent) == len(list_img_feat) and \ 63 | len(list_sent) % 5 == 0 64 | list_img_feat = np.array(list_img_feat).astype('float32') 65 | coco[split] = {'sent': list_sent, 'imgfeat': list_img_feat} 66 | return coco['train'], coco['valid'], coco['test'] 67 | 68 | def run(self, params, batcher): 69 | coco_embed = {'train': {'sentfeat': [], 'imgfeat': []}, 70 | 'dev': {'sentfeat': [], 'imgfeat': []}, 71 | 'test': {'sentfeat': [], 'imgfeat': []}} 72 | 73 | for key in self.coco_data: 74 | logging.info('Computing embedding for {0}'.format(key)) 75 | # Sort to reduce padding 76 | self.coco_data[key]['sent'] = np.array(self.coco_data[key]['sent']) 77 | self.coco_data[key]['sent'], idx_sort = np.sort(self.coco_data[key]['sent']), np.argsort(self.coco_data[key]['sent']) 78 | idx_unsort = np.argsort(idx_sort) 79 | 80 | coco_embed[key]['X'] = [] 81 | nsent = len(self.coco_data[key]['sent']) 82 | for ii in range(0, nsent, params.batch_size): 83 | batch = self.coco_data[key]['sent'][ii:ii + params.batch_size] 84 | embeddings = batcher(params, batch) 85 | coco_embed[key]['sentfeat'].append(embeddings) 86 | coco_embed[key]['sentfeat'] = np.vstack(coco_embed[key]['sentfeat'])[idx_unsort] 87 | coco_embed[key]['imgfeat'] = np.array(self.coco_data[key]['imgfeat']) 88 | logging.info('Computed {0} embeddings'.format(key)) 89 | 90 | config = {'seed': self.seed, 'projdim': 1000, 'margin': 0.2} 91 | clf = ImageSentenceRankingPytorch(train=coco_embed['train'], 92 | valid=coco_embed['dev'], 93 | test=coco_embed['test'], 94 | config=config) 95 | 96 | bestdevscore, r1_i2t, r5_i2t, r10_i2t, medr_i2t, \ 97 | r1_t2i, r5_t2i, r10_t2i, medr_t2i = clf.run() 98 | 99 | logging.debug("\nTest scores | Image to text: \ 100 | {0}, 
{1}, {2}, {3}".format(r1_i2t, r5_i2t, r10_i2t, medr_i2t)) 101 | logging.debug("Test scores | Text to image: \ 102 | {0}, {1}, {2}, {3}\n".format(r1_t2i, r5_t2i, r10_t2i, medr_t2i)) 103 | 104 | return {'devacc': bestdevscore, 105 | 'acc': [(r1_i2t, r5_i2t, r10_i2t, medr_i2t), 106 | (r1_t2i, r5_t2i, r10_t2i, medr_t2i)], 107 | 'ndev': len(coco_embed['dev']['sentfeat']), 108 | 'ntest': len(coco_embed['test']['sentfeat'])} 109 | -------------------------------------------------------------------------------- /SentEval/senteval/tools/relatedness.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | """ 9 | Semantic Relatedness (supervised) with Pytorch 10 | """ 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import copy 14 | import numpy as np 15 | 16 | import torch 17 | from torch import nn 18 | import torch.optim as optim 19 | 20 | from scipy.stats import pearsonr, spearmanr 21 | 22 | 23 | class RelatednessPytorch(object): 24 | # Can be used for SICK-Relatedness, and STS14 25 | def __init__(self, train, valid, test, devscores, config): 26 | # fix seed 27 | np.random.seed(config['seed']) 28 | torch.manual_seed(config['seed']) 29 | assert torch.cuda.is_available(), 'torch.cuda required for Relatedness' 30 | torch.cuda.manual_seed(config['seed']) 31 | 32 | self.train = train 33 | self.valid = valid 34 | self.test = test 35 | self.devscores = devscores 36 | 37 | self.inputdim = train['X'].shape[1] 38 | self.nclasses = config['nclasses'] 39 | self.seed = config['seed'] 40 | self.l2reg = 0. 
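# Note: the head below is a single linear layer with a softmax over the
# nclasses gold-score bins; run() converts the predicted class probabilities
# back to a scalar score via a dot product with r = [1, ..., 5] and
# early-stops on the dev Spearman correlation.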
41 | self.batch_size = 64 42 | self.maxepoch = 1000 43 | self.early_stop = True 44 | 45 | self.model = nn.Sequential( 46 | nn.Linear(self.inputdim, self.nclasses), 47 | nn.Softmax(dim=-1), 48 | ) 49 | self.loss_fn = nn.MSELoss() 50 | 51 | if torch.cuda.is_available(): 52 | self.model = self.model.cuda() 53 | self.loss_fn = self.loss_fn.cuda() 54 | 55 | self.loss_fn.size_average = False 56 | self.optimizer = optim.Adam(self.model.parameters(), 57 | weight_decay=self.l2reg) 58 | 59 | def prepare_data(self, trainX, trainy, devX, devy, testX, testy): 60 | # Transform probs to log-probs for KL-divergence 61 | trainX = torch.from_numpy(trainX).float().cuda() 62 | trainy = torch.from_numpy(trainy).float().cuda() 63 | devX = torch.from_numpy(devX).float().cuda() 64 | devy = torch.from_numpy(devy).float().cuda() 65 | testX = torch.from_numpy(testX).float().cuda() 66 | testY = torch.from_numpy(testy).float().cuda() 67 | 68 | return trainX, trainy, devX, devy, testX, testy 69 | 70 | def run(self): 71 | self.nepoch = 0 72 | bestpr = -1 73 | early_stop_count = 0 74 | r = np.arange(1, 6) 75 | stop_train = False 76 | 77 | # Preparing data 78 | trainX, trainy, devX, devy, testX, testy = self.prepare_data( 79 | self.train['X'], self.train['y'], 80 | self.valid['X'], self.valid['y'], 81 | self.test['X'], self.test['y']) 82 | 83 | # Training 84 | while not stop_train and self.nepoch <= self.maxepoch: 85 | self.trainepoch(trainX, trainy, nepoches=50) 86 | yhat = np.dot(self.predict_proba(devX), r) 87 | pr = spearmanr(yhat, self.devscores)[0] 88 | pr = 0 if pr != pr else pr # if NaN bc std=0 89 | # early stop on Pearson 90 | if pr > bestpr: 91 | bestpr = pr 92 | bestmodel = copy.deepcopy(self.model) 93 | elif self.early_stop: 94 | if early_stop_count >= 3: 95 | stop_train = True 96 | early_stop_count += 1 97 | self.model = bestmodel 98 | 99 | yhat = np.dot(self.predict_proba(testX), r) 100 | 101 | return bestpr, yhat 102 | 103 | def trainepoch(self, X, y, nepoches=1): 104 | self.model.train() 105 | for _ in range(self.nepoch, self.nepoch + nepoches): 106 | permutation = np.random.permutation(len(X)) 107 | all_costs = [] 108 | for i in range(0, len(X), self.batch_size): 109 | # forward 110 | idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().cuda() 111 | Xbatch = X[idx] 112 | ybatch = y[idx] 113 | output = self.model(Xbatch) 114 | # loss 115 | loss = self.loss_fn(output, ybatch) 116 | all_costs.append(loss.item()) 117 | # backward 118 | self.optimizer.zero_grad() 119 | loss.backward() 120 | # Update parameters 121 | self.optimizer.step() 122 | self.nepoch += nepoches 123 | 124 | def predict_proba(self, devX): 125 | self.model.eval() 126 | probas = [] 127 | with torch.no_grad(): 128 | for i in range(0, len(devX), self.batch_size): 129 | Xbatch = devX[i:i + self.batch_size] 130 | if len(probas) == 0: 131 | probas = self.model(Xbatch).data.cpu().numpy() 132 | else: 133 | probas = np.concatenate((probas, self.model(Xbatch).data.cpu().numpy()), axis=0) 134 | return probas 135 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/Asym.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from typing import List, Dict 4 | import os 5 | import json 6 | from ..util import import_from_string 7 | from collections import OrderedDict 8 | from typing import List, Dict, Optional, Union, Tuple 9 | 10 | class Asym(nn.Sequential): 11 | def __init__(self, 
sub_modules: Dict[str, List[nn.Module]], allow_empty_key: bool = True): 12 | """ 13 | This model allows you to create asymmetric SentenceTransformer models that apply different models depending on the specified input key. 14 | 15 | In the below example, we create two different Dense models for 'query' and 'doc'. Text that is passed as {'query': 'My query'} will 16 | be passed through the first Dense model, and text that is passed as {'doc': 'My document'} will use the other Dense model. 17 | 18 | Note that when you call encode(), only inputs of the same type can be encoded. Mixed types cannot be encoded in the same batch. 19 | 20 | Example:: 21 | word_embedding_model = models.Transformer(model_name) 22 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) 23 | asym_model = models.Asym({'query': [models.Dense(word_embedding_model.get_word_embedding_dimension(), 128)], 'doc': [models.Dense(word_embedding_model.get_word_embedding_dimension(), 128)]}) 24 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model, asym_model]) 25 | 26 | model.encode([{'query': 'Q1'}, {'query': 'Q2'}]) 27 | model.encode([{'doc': 'Doc1'}, {'doc': 'Doc2'}]) 28 | 29 | # You can train it with InputExample like this. Note that the order must always be the same: 30 | train_example = InputExample(texts=[{'query': 'Train query', 'doc': 'Doc query'}], label=1) 31 | 32 | 33 | :param sub_modules: Dict in the format str -> List[models]. The models in the specified list will be applied for input marked with the respective key. 34 | :param allow_empty_key: If true, inputs without a key can be processed. If false, an exception will be thrown if no key is specified. 35 | """ 36 | self.sub_modules = sub_modules 37 | self.allow_empty_key = allow_empty_key 38 | 39 | ordered_dict = OrderedDict() 40 | for name, models in sub_modules.items(): 41 | if not isinstance(models, List): 42 | models = [models] 43 | 44 | for idx, model in enumerate(models): 45 | ordered_dict[name+"-"+str(idx)] = model 46 | super(Asym, self).__init__(ordered_dict) 47 | 48 | 49 | def forward(self, features: Dict[str, Tensor]): 50 | if 'text_keys' in features and len(features['text_keys']) > 0: 51 | text_key = features['text_keys'][0] 52 | for model in self.sub_modules[text_key]: 53 | features = model(features) 54 | elif not self.allow_empty_key: 55 | raise ValueError('Input did not specify any keys and allow_empty_key is False') 56 | 57 | return features 58 | 59 | def get_sentence_embedding_dimension(self) -> int: 60 | raise NotImplementedError() 61 | 62 | def save(self, output_path): 63 | model_lookup = {} 64 | model_types = {} 65 | model_structure = {} 66 | 67 | for name, models in self.sub_modules.items(): 68 | model_structure[name] = [] 69 | for model in models: 70 | model_id = str(id(model))+'_'+type(model).__name__ 71 | model_lookup[model_id] = model 72 | model_types[model_id] = type(model).__module__ 73 | model_structure[name].append(model_id) 74 | 75 | for model_id, model in model_lookup.items(): 76 | model_path = os.path.join(output_path, str(model_id)) 77 | os.makedirs(model_path, exist_ok=True) 78 | model.save(model_path) 79 | 80 | with open(os.path.join(output_path, 'config.json'), 'w', encoding='utf8') as fOut: 81 | json.dump({'types': model_types, 'structure': model_structure, 82 | 'parameters': {'allow_empty_key': self.allow_empty_key}}, 83 | fOut, indent=2) 84 | 85 | def tokenize(self, texts: Union[List[str], List[Tuple[str, str]]]): 86 | """ 87 | Tokenizes a text and maps tokens to token-ids 88 | """ 89 |
if not isinstance(texts[0], dict): 90 | raise AttributeError("Asym. model requires that texts are passed as dicts: {'key': 'text'}") 91 | 92 | 93 | module_key = None 94 | 95 | for lookup in texts: 96 | text_key, text = next(iter(lookup.items())) 97 | if module_key is None: 98 | module_key = text_key 99 | 100 | assert text_key == module_key #Mixed batches are not allowed 101 | return self.sub_modules[module_key][0].tokenize(texts) 102 | 103 | 104 | @staticmethod 105 | def load(input_path): 106 | with open(os.path.join(input_path, 'config.json')) as fIn: 107 | config = json.load(fIn) 108 | 109 | modules = {} 110 | for model_id, model_type in config['types'].items(): 111 | module_class = import_from_string(model_type) 112 | module = module_class.load(os.path.join(input_path, model_id)) 113 | modules[model_id] = module 114 | 115 | model_structure = {} 116 | for key_name, models_list in config['structure'].items(): 117 | model_structure[key_name] = [] 118 | for model_id in models_list: 119 | model_structure[key_name].append(modules[model_id]) 120 | 121 | model = Asym(model_structure, **config['parameters']) 122 | return model -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/RerankingEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import logging 3 | import numpy as np 4 | import os 5 | import csv 6 | from ..util import cos_sim, dot_score 7 | import torch 8 | from sklearn.metrics import average_precision_score 9 | import tqdm 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class RerankingEvaluator(SentenceEvaluator): 14 | """ 15 | This class evaluates a SentenceTransformer model for the task of re-ranking. 16 | 17 | Given a query and a list of documents, it computes the score [query, doc_i] for all possible 18 | documents and sorts them in decreasing order. Then, MRR@10 and MAP is compute to measure the quality of the ranking. 19 | 20 | :param samples: Must be a list and each element is of the form: {'query': '', 'positive': [], 'negative': []}. Query is the search query, 21 | positive is a list of positive (relevant) documents, negative is a list of negative (irrelevant) documents. 
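An illustrative (hypothetical) sample: {'query': 'capital of France', 'positive': ['Paris is the capital of France.'], 'negative': ['France borders Spain.']}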
22 | """ 23 | def __init__(self, samples, mrr_at_k: int = 10, name: str = '', write_csv: bool = True, similarity_fct=cos_sim, batch_size: int = 64, show_progress_bar: bool = False): 24 | self.samples = samples 25 | self.name = name 26 | self.mrr_at_k = mrr_at_k 27 | self.similarity_fct = similarity_fct 28 | self.batch_size = batch_size 29 | self.show_progress_bar = show_progress_bar 30 | 31 | if isinstance(self.samples, dict): 32 | self.samples = list(self.samples.values()) 33 | 34 | ### Remove sample with empty positive / negative set 35 | self.samples = [sample for sample in self.samples if len(sample['positive']) > 0 and len(sample['negative']) > 0] 36 | 37 | 38 | self.csv_file = "RerankingEvaluator" + ("_" + name if name else '') + "_results.csv" 39 | self.csv_headers = ["epoch", "steps", "MAP", "MRR@{}".format(mrr_at_k)] 40 | self.write_csv = write_csv 41 | 42 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 43 | if epoch != -1: 44 | if steps == -1: 45 | out_txt = " after epoch {}:".format(epoch) 46 | else: 47 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 48 | else: 49 | out_txt = ":" 50 | 51 | logger.info("RerankingEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 52 | 53 | 54 | scores = self.compute_metrices(model) 55 | mean_ap = scores['map'] 56 | mean_mrr = scores['mrr'] 57 | 58 | #### Some stats about the dataset 59 | num_positives = [len(sample['positive']) for sample in self.samples] 60 | num_negatives = [len(sample['negative']) for sample in self.samples] 61 | 62 | logger.info("Queries: {} \t Positives: Min {:.1f}, Mean {:.1f}, Max {:.1f} \t Negatives: Min {:.1f}, Mean {:.1f}, Max {:.1f}".format(len(self.samples), np.min(num_positives), np.mean(num_positives), 63 | np.max(num_positives), np.min(num_negatives), 64 | np.mean(num_negatives), np.max(num_negatives))) 65 | logger.info("MAP: {:.2f}".format(mean_ap * 100)) 66 | logger.info("MRR@{}: {:.2f}".format(self.mrr_at_k, mean_mrr * 100)) 67 | 68 | #### Write results to disc 69 | if output_path is not None and self.write_csv: 70 | csv_path = os.path.join(output_path, self.csv_file) 71 | output_file_exists = os.path.isfile(csv_path) 72 | with open(csv_path, newline='', mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 73 | writer = csv.writer(f) 74 | if not output_file_exists: 75 | writer.writerow(self.csv_headers) 76 | 77 | writer.writerow([epoch, steps, mean_ap, mean_mrr]) 78 | 79 | return mean_ap 80 | 81 | def compute_metrices(self, model): 82 | all_mrr_scores = [] 83 | all_ap_scores = [] 84 | 85 | 86 | for instance in tqdm.tqdm(self.samples, disable=not self.show_progress_bar, desc="Samples"): 87 | query = instance['query'] 88 | positive = list(instance['positive']) 89 | negative = list(instance['negative']) 90 | 91 | if len(positive) == 0 or len(negative) == 0: 92 | continue 93 | 94 | docs = positive + negative 95 | is_relevant = [True]*len(positive) + [False]*len(negative) 96 | 97 | query_emb = model.encode([query], convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=False) 98 | docs_emb = model.encode(docs, convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=False) 99 | 100 | pred_scores = self.similarity_fct(query_emb, docs_emb) 101 | if len(pred_scores.shape) > 1: 102 | pred_scores = pred_scores[0] 103 | 104 | pred_scores_argsort = torch.argsort(-pred_scores) #Sort in decreasing order 105 | 106 | #Compute MRR score 107 | mrr_score = 0 108 | for rank, index in 
enumerate(pred_scores_argsort[0:self.mrr_at_k]): 109 | if is_relevant[index]: 110 | mrr_score = 1 / (rank+1) 111 | break 112 | all_mrr_scores.append(mrr_score) 113 | 114 | # Compute AP 115 | all_ap_scores.append(average_precision_score(is_relevant, pred_scores.cpu().tolist())) 116 | 117 | mean_ap = np.mean(all_ap_scores) 118 | mean_mrr = np.mean(all_mrr_scores) 119 | 120 | return {'map': mean_ap, 'mrr': mean_mrr} 121 | 122 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/Pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class Pooling(nn.Module): 10 | """Performs pooling (max or mean) on the token embeddings. 11 | 12 | Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows to use the CLS token if it is returned by the underlying word embedding model. 13 | You can concatenate multiple poolings together. 14 | 15 | :param word_embedding_dimension: Dimensions for the word embeddings 16 | :param pooling_mode: Can be a string: mean/max/cls. If set, overwrites the other pooling_mode_* settings 17 | :param pooling_mode_cls_token: Use the first token (CLS token) as text representations 18 | :param pooling_mode_max_tokens: Use max in each dimension over all tokens. 19 | :param pooling_mode_mean_tokens: Perform mean-pooling 20 | :param pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but devide by sqrt(input_length). 21 | """ 22 | def __init__(self, 23 | word_embedding_dimension: int, 24 | pooling_mode: str = None, 25 | pooling_mode_cls_token: bool = False, 26 | pooling_mode_max_tokens: bool = False, 27 | pooling_mode_mean_tokens: bool = True, 28 | pooling_mode_mean_sqrt_len_tokens: bool = False, 29 | ): 30 | super(Pooling, self).__init__() 31 | 32 | self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens', 'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens'] 33 | 34 | if pooling_mode is not None: #Set pooling mode by string 35 | pooling_mode = pooling_mode.lower() 36 | assert pooling_mode in ['mean', 'max', 'cls'] 37 | pooling_mode_cls_token = (pooling_mode == 'cls') 38 | pooling_mode_max_tokens = (pooling_mode == 'max') 39 | pooling_mode_mean_tokens = (pooling_mode == 'mean') 40 | 41 | self.word_embedding_dimension = word_embedding_dimension 42 | self.pooling_mode_cls_token = pooling_mode_cls_token 43 | self.pooling_mode_mean_tokens = pooling_mode_mean_tokens 44 | self.pooling_mode_max_tokens = pooling_mode_max_tokens 45 | self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens 46 | 47 | pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens, pooling_mode_mean_sqrt_len_tokens]) 48 | self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension) 49 | 50 | 51 | def __repr__(self): 52 | return "Pooling({})".format(self.get_config_dict()) 53 | 54 | def get_pooling_mode_str(self) -> str: 55 | """ 56 | Returns the pooling mode as string 57 | """ 58 | modes = [] 59 | if self.pooling_mode_cls_token: 60 | modes.append('cls') 61 | if self.pooling_mode_mean_tokens: 62 | modes.append('mean') 63 | if self.pooling_mode_max_tokens: 64 | modes.append('max') 65 | if self.pooling_mode_mean_sqrt_len_tokens: 66 | 
modes.append('mean_sqrt_len_tokens') 67 | 68 | return "+".join(modes) 69 | 70 | def forward(self, features: Dict[str, Tensor]): 71 | token_embeddings = features['token_embeddings'] 72 | attention_mask = features['attention_mask'] 73 | 74 | ## Pooling strategy 75 | output_vectors = [] 76 | if self.pooling_mode_cls_token: 77 | cls_token = features.get('cls_token_embeddings', token_embeddings[:, 0]) # Take first token by default 78 | output_vectors.append(cls_token) 79 | if self.pooling_mode_max_tokens: 80 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 81 | token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value 82 | max_over_time = torch.max(token_embeddings, 1)[0] 83 | output_vectors.append(max_over_time) 84 | if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens: 85 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 86 | sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) 87 | 88 | #If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present 89 | if 'token_weights_sum' in features: 90 | sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size()) 91 | else: 92 | sum_mask = input_mask_expanded.sum(1) 93 | 94 | sum_mask = torch.clamp(sum_mask, min=1e-9) 95 | 96 | if self.pooling_mode_mean_tokens: 97 | output_vectors.append(sum_embeddings / sum_mask) 98 | if self.pooling_mode_mean_sqrt_len_tokens: 99 | output_vectors.append(sum_embeddings / torch.sqrt(sum_mask)) 100 | 101 | output_vector = torch.cat(output_vectors, 1) 102 | features.update({'sentence_embedding': output_vector}) 103 | return features 104 | 105 | def get_sentence_embedding_dimension(self): 106 | return self.pooling_output_dimension 107 | 108 | def get_config_dict(self): 109 | return {key: self.__dict__[key] for key in self.config_keys} 110 | 111 | def save(self, output_path): 112 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 113 | json.dump(self.get_config_dict(), fOut, indent=2) 114 | 115 | @staticmethod 116 | def load(input_path): 117 | with open(os.path.join(input_path, 'config.json')) as fIn: 118 | config = json.load(fIn) 119 | 120 | return Pooling(**config) 121 | -------------------------------------------------------------------------------- /sentence_transformers_congen/model_card_templates.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .util import fullname 4 | 5 | class ModelCardTemplate: 6 | __TAGS__ = ["sentence-transformers", "feature-extraction", "sentence-similarity"] 7 | __DEFAULT_VARS__ = { 8 | "{PIPELINE_TAG}": "sentence-similarity", 9 | "{MODEL_DESCRIPTION}": "", 10 | "{TRAINING_SECTION}": "", 11 | "{USAGE_TRANSFORMERS_SECTION}": "", 12 | "{EVALUATION}": "", 13 | "{CITING}": "" 14 | } 15 | 16 | __MODEL_CARD__ = """ 17 | --- 18 | pipeline_tag: {PIPELINE_TAG} 19 | tags: 20 | {TAGS} 21 | --- 22 | 23 | # {MODEL_NAME} 24 | 25 | This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a {NUM_DIMENSIONS} dimensional dense vector space and can be used for tasks like clustering or semantic search. 
26 | 27 | {MODEL_DESCRIPTION} 28 | 29 | ## Usage (Sentence-Transformers) 30 | 31 | Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed: 32 | 33 | ``` 34 | pip install -U sentence-transformers 35 | ``` 36 | 37 | Then you can use the model like this: 38 | 39 | ```python 40 | from sentence_transformers import SentenceTransformer 41 | sentences = ["This is an example sentence", "Each sentence is converted"] 42 | 43 | model = SentenceTransformer('{MODEL_NAME}') 44 | embeddings = model.encode(sentences) 45 | print(embeddings) 46 | ``` 47 | 48 | {USAGE_TRANSFORMERS_SECTION} 49 | 50 | ## Evaluation Results 51 | 52 | {EVALUATION} 53 | 54 | For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME}) 55 | 56 | {TRAINING_SECTION} 57 | 58 | ## Full Model Architecture 59 | ``` 60 | {FULL_MODEL_STR} 61 | ``` 62 | 63 | ## Citing & Authors 64 | 65 | {CITING} 66 | 67 | """ 68 | 69 | 70 | 71 | __TRAINING_SECTION__ = """ 72 | ## Training 73 | The model was trained with the parameters: 74 | 75 | {LOSS_FUNCTIONS} 76 | 77 | Parameters of the fit()-Method: 78 | ``` 79 | {FIT_PARAMETERS} 80 | ``` 81 | """ 82 | 83 | 84 | __USAGE_TRANSFORMERS__ = """\n 85 | ## Usage (HuggingFace Transformers) 86 | Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings. 87 | 88 | ```python 89 | from transformers import AutoTokenizer, AutoModel 90 | import torch 91 | 92 | {POOLING_FUNCTION} 93 | 94 | # Sentences we want sentence embeddings for 95 | sentences = ['This is an example sentence', 'Each sentence is converted'] 96 | 97 | # Load model from HuggingFace Hub 98 | tokenizer = AutoTokenizer.from_pretrained('{MODEL_NAME}') 99 | model = AutoModel.from_pretrained('{MODEL_NAME}') 100 | 101 | # Tokenize sentences 102 | encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt') 103 | 104 | # Compute token embeddings 105 | with torch.no_grad(): 106 | model_output = model(**encoded_input) 107 | 108 | # Perform pooling. In this case, {POOLING_MODE} pooling. 109 | sentence_embeddings = {POOLING_FUNCTION_NAME}(model_output, encoded_input['attention_mask']) 110 | 111 | print("Sentence embeddings:") 112 | print(sentence_embeddings) 113 | ``` 114 | 115 | """ 116 | 117 | 118 | 119 | @staticmethod 120 | def model_card_get_pooling_function(pooling_mode): 121 | if pooling_mode == 'max': 122 | return "max_pooling", """ 123 | # Max Pooling - Take the max value over time for every dimension. 
124 | def max_pooling(model_output, attention_mask): 125 | token_embeddings = model_output[0] #First element of model_output contains all token embeddings 126 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 127 | token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value 128 | return torch.max(token_embeddings, 1)[0] 129 | """ 130 | elif pooling_mode == 'mean': 131 | return "mean_pooling", """ 132 | #Mean Pooling - Take attention mask into account for correct averaging 133 | def mean_pooling(model_output, attention_mask): 134 | token_embeddings = model_output[0] #First element of model_output contains all token embeddings 135 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 136 | return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) 137 | """ 138 | 139 | elif pooling_mode == 'cls': 140 | return "cls_pooling", """ 141 | def cls_pooling(model_output, attention_mask): 142 | return model_output[0][:,0] 143 | """ 144 | 145 | @staticmethod 146 | def get_train_objective_info(dataloader, loss): 147 | try: 148 | if hasattr(dataloader, 'get_config_dict'): 149 | train_loader = dataloader.get_config_dict() 150 | else: 151 | loader_params = {} 152 | loader_params['batch_size'] = dataloader.batch_size if hasattr(dataloader, 'batch_size') else 'unknown' 153 | if hasattr(dataloader, 'sampler'): 154 | loader_params['sampler'] = fullname(dataloader.sampler) 155 | if hasattr(dataloader, 'batch_sampler'): 156 | loader_params['batch_sampler'] = fullname(dataloader.batch_sampler) 157 | 158 | dataloader_str = """**DataLoader**:\n\n`{}` of length {} with parameters: 159 | ``` 160 | {} 161 | ```""".format(fullname(dataloader), len(dataloader), loader_params) 162 | 163 | loss_str = "**Loss**:\n\n`{}` {}".format(fullname(loss), 164 | """with parameters: 165 | ``` 166 | {} 167 | ```""".format(loss.get_config_dict()) if hasattr(loss, 'get_config_dict') else "") 168 | 169 | return [dataloader_str, loss_str] 170 | 171 | except Exception as e: 172 | logging.WARN("Exception when creating get_train_objective_info: {}".format(str(e))) 173 | return "" -------------------------------------------------------------------------------- /sentence_transformers_congen/models/WordEmbeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | class WordEmbeddings(nn.Module): 17 | def __init__(self, tokenizer: WordTokenizer, embedding_weights, update_embeddings: bool = False, max_seq_length: int = 1000000): 18 | nn.Module.__init__(self) 19 | if isinstance(embedding_weights, list): 20 | embedding_weights = np.asarray(embedding_weights) 21 | 22 | if isinstance(embedding_weights, np.ndarray): 23 | embedding_weights = torch.from_numpy(embedding_weights) 24 | 25 | num_embeddings, embeddings_dimension = embedding_weights.size() 26 | self.embeddings_dimension = embeddings_dimension 27 | self.emb_layer = nn.Embedding(num_embeddings, embeddings_dimension) 28 | self.emb_layer.load_state_dict({'weight': embedding_weights}) 29 | 
self.emb_layer.weight.requires_grad = update_embeddings 30 | self.tokenizer = tokenizer 31 | self.update_embeddings = update_embeddings 32 | self.max_seq_length = max_seq_length 33 | 34 | def forward(self, features): 35 | token_embeddings = self.emb_layer(features['input_ids']) 36 | cls_tokens = None 37 | features.update({'token_embeddings': token_embeddings, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 38 | return features 39 | 40 | def tokenize(self, texts: List[str]): 41 | tokenized_texts = [self.tokenizer.tokenize(text) for text in texts] 42 | sentence_lengths = [len(tokens) for tokens in tokenized_texts] 43 | max_len = max(sentence_lengths) 44 | 45 | input_ids = [] 46 | attention_masks = [] 47 | for tokens in tokenized_texts: 48 | padding = [0] * (max_len - len(tokens)) 49 | input_ids.append(tokens + padding) 50 | attention_masks.append([1]*len(tokens) + padding) 51 | 52 | output = {'input_ids': torch.tensor(input_ids, dtype=torch.long), 53 | 'attention_mask': torch.tensor(attention_masks, dtype=torch.long), 54 | 'sentence_lengths': torch.tensor(sentence_lengths, dtype=torch.long)} 55 | 56 | return output 57 | 58 | 59 | 60 | def get_word_embedding_dimension(self) -> int: 61 | return self.embeddings_dimension 62 | 63 | def save(self, output_path: str): 64 | with open(os.path.join(output_path, 'wordembedding_config.json'), 'w') as fOut: 65 | json.dump(self.get_config_dict(), fOut, indent=2) 66 | 67 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 68 | self.tokenizer.save(output_path) 69 | 70 | def get_config_dict(self): 71 | return {'tokenizer_class': fullname(self.tokenizer), 'update_embeddings': self.update_embeddings, 'max_seq_length': self.max_seq_length} 72 | 73 | @staticmethod 74 | def load(input_path: str): 75 | with open(os.path.join(input_path, 'wordembedding_config.json'), 'r') as fIn: 76 | config = json.load(fIn) 77 | 78 | tokenizer_class = import_from_string(config['tokenizer_class']) 79 | tokenizer = tokenizer_class.load(input_path) 80 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu')) 81 | embedding_weights = weights['emb_layer.weight'] 82 | model = WordEmbeddings(tokenizer=tokenizer, embedding_weights=embedding_weights, update_embeddings=config['update_embeddings']) 83 | return model 84 | 85 | @staticmethod 86 | def from_text_file(embeddings_file_path: str, update_embeddings: bool = False, item_separator: str = " ", tokenizer=WhitespaceTokenizer(), max_vocab_size: int = None): 87 | logger.info("Read in embeddings file {}".format(embeddings_file_path)) 88 | 89 | if not os.path.exists(embeddings_file_path): 90 | logger.info("{} does not exist, try to download from server".format(embeddings_file_path)) 91 | 92 | if '/' in embeddings_file_path or '\\' in embeddings_file_path: 93 | raise ValueError("Embeddings file not found: ".format(embeddings_file_path)) 94 | 95 | url = "https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/"+embeddings_file_path 96 | http_get(url, embeddings_file_path) 97 | 98 | embeddings_dimension = None 99 | vocab = [] 100 | embeddings = [] 101 | 102 | with gzip.open(embeddings_file_path, "rt", encoding="utf8") if embeddings_file_path.endswith('.gz') else open(embeddings_file_path, encoding="utf8") as fIn: 103 | iterator = tqdm(fIn, desc="Load Word Embeddings", unit="Embeddings") 104 | for line in iterator: 105 | split = line.rstrip().split(item_separator) 106 | word = split[0] 107 | 108 | if embeddings_dimension == None: 
109 | embeddings_dimension = len(split) - 1 110 | vocab.append("PADDING_TOKEN") 111 | embeddings.append(np.zeros(embeddings_dimension)) 112 | 113 | if (len(split) - 1) != embeddings_dimension: # Assure that all lines in the embeddings file are of the same length 114 | logger.error("ERROR: A line in the embeddings file had more or less dimensions than expected. Skip token.") 115 | continue 116 | 117 | vector = np.array([float(num) for num in split[1:]]) 118 | embeddings.append(vector) 119 | vocab.append(word) 120 | 121 | if max_vocab_size is not None and max_vocab_size > 0 and len(vocab) > max_vocab_size: 122 | break 123 | 124 | embeddings = np.asarray(embeddings) 125 | 126 | tokenizer.set_vocab(vocab) 127 | return WordEmbeddings(tokenizer=tokenizer, embedding_weights=embeddings, update_embeddings=update_embeddings) 128 | 129 | -------------------------------------------------------------------------------- /sentence_transformers_congen/models/Transformer.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import AutoModel, AutoTokenizer, AutoConfig 3 | import json 4 | from typing import List, Dict, Optional, Union, Tuple 5 | import os 6 | 7 | 8 | class Transformer(nn.Module): 9 | """Huggingface AutoModel to generate token embeddings. 10 | Loads the correct class, e.g. BERT / RoBERTa etc. 11 | 12 | :param model_name_or_path: Huggingface models name (https://huggingface.co/models) 13 | :param max_seq_length: Truncate any inputs longer than max_seq_length 14 | :param model_args: Arguments (key, value pairs) passed to the Huggingface Transformers model 15 | :param cache_dir: Cache dir for Huggingface Transformers to store/load models 16 | :param tokenizer_args: Arguments (key, value pairs) passed to the Huggingface Tokenizer model 17 | :param do_lower_case: If true, lowercases the input (independent if the model is cased or not) 18 | :param tokenizer_name_or_path: Name or path of the tokenizer. When None, then model_name_or_path is used 19 | """ 20 | def __init__(self, model_name_or_path: str, max_seq_length: Optional[int] = None, 21 | model_args: Dict = {}, cache_dir: Optional[str] = None, 22 | tokenizer_args: Dict = {}, do_lower_case: bool = False, 23 | tokenizer_name_or_path : str = None): 24 | super(Transformer, self).__init__() 25 | self.config_keys = ['max_seq_length', 'do_lower_case'] 26 | self.do_lower_case = do_lower_case 27 | 28 | config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir) 29 | self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir) 30 | self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path, cache_dir=cache_dir, **tokenizer_args) 31 | 32 | #No max_seq_length set. 
Try to infer from model 33 | if max_seq_length is None: 34 | if hasattr(self.auto_model, "config") and hasattr(self.auto_model.config, "max_position_embeddings") and hasattr(self.tokenizer, "model_max_length"): 35 | max_seq_length = min(self.auto_model.config.max_position_embeddings, self.tokenizer.model_max_length) 36 | 37 | self.max_seq_length = max_seq_length 38 | 39 | if tokenizer_name_or_path is not None: 40 | self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__ 41 | 42 | def __repr__(self): 43 | return "Transformer({}) with Transformer model: {} ".format(self.get_config_dict(), self.auto_model.__class__.__name__) 44 | 45 | def forward(self, features): 46 | """Returns token_embeddings, cls_token""" 47 | trans_features = {'input_ids': features['input_ids'], 'attention_mask': features['attention_mask']} 48 | if 'token_type_ids' in features: 49 | trans_features['token_type_ids'] = features['token_type_ids'] 50 | 51 | output_states = self.auto_model(**trans_features, return_dict=False) 52 | output_tokens = output_states[0] 53 | 54 | features.update({'token_embeddings': output_tokens, 'attention_mask': features['attention_mask']}) 55 | 56 | if self.auto_model.config.output_hidden_states: 57 | all_layer_idx = 2 58 | if len(output_states) < 3: #Some models only output last_hidden_states and all_hidden_states 59 | all_layer_idx = 1 60 | 61 | hidden_states = output_states[all_layer_idx] 62 | features.update({'all_layer_embeddings': hidden_states}) 63 | 64 | return features 65 | 66 | def get_word_embedding_dimension(self) -> int: 67 | return self.auto_model.config.hidden_size 68 | 69 | def tokenize(self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]]): 70 | """ 71 | Tokenizes a text and maps tokens to token-ids 72 | """ 73 | output = {} 74 | if isinstance(texts[0], str): 75 | to_tokenize = [texts] 76 | elif isinstance(texts[0], dict): 77 | to_tokenize = [] 78 | output['text_keys'] = [] 79 | for lookup in texts: 80 | text_key, text = next(iter(lookup.items())) 81 | to_tokenize.append(text) 82 | output['text_keys'].append(text_key) 83 | to_tokenize = [to_tokenize] 84 | else: 85 | batch1, batch2 = [], [] 86 | for text_tuple in texts: 87 | batch1.append(text_tuple[0]) 88 | batch2.append(text_tuple[1]) 89 | to_tokenize = [batch1, batch2] 90 | 91 | #strip 92 | to_tokenize = [[str(s).strip() for s in col] for col in to_tokenize] 93 | 94 | #Lowercase 95 | if self.do_lower_case: 96 | to_tokenize = [[s.lower() for s in col] for col in to_tokenize] 97 | 98 | 99 | output.update(self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt", max_length=self.max_seq_length)) 100 | return output 101 | 102 | 103 | def get_config_dict(self): 104 | return {key: self.__dict__[key] for key in self.config_keys} 105 | 106 | def save(self, output_path: str): 107 | self.auto_model.save_pretrained(output_path) 108 | self.tokenizer.save_pretrained(output_path) 109 | 110 | with open(os.path.join(output_path, 'sentence_bert_config.json'), 'w') as fOut: 111 | json.dump(self.get_config_dict(), fOut, indent=2) 112 | 113 | @staticmethod 114 | def load(input_path: str): 115 | #Old classes used other config names than 'sentence_bert_config.json' 116 | for config_name in ['sentence_bert_config.json', 'sentence_roberta_config.json', 'sentence_distilbert_config.json', 'sentence_camembert_config.json', 'sentence_albert_config.json', 'sentence_xlm-roberta_config.json', 'sentence_xlnet_config.json']: 117 | sbert_config_path = os.path.join(input_path, config_name) 118 | if 
os.path.exists(sbert_config_path): 119 | break 120 | 121 | with open(sbert_config_path) as fIn: 122 | config = json.load(fIn) 123 | return Transformer(model_name_or_path=input_path, **config) 124 | 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | # This code is adapted from https://github.com/princeton-nlp/SimCSE/blob/main/evaluation.py 2 | 3 | import sys 4 | import io, os 5 | import numpy as np 6 | import logging 7 | import argparse 8 | from prettytable import PrettyTable 9 | import torch 10 | import transformers 11 | from sentence_transformers import SentenceTransformer 12 | 13 | # Set up logger 14 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 15 | 16 | # Set PATHs 17 | PATH_TO_SENTEVAL = './SentEval' 18 | PATH_TO_DATA = './SentEval/data' 19 | 20 | # Import SentEval 21 | sys.path.insert(0, PATH_TO_SENTEVAL) 22 | import senteval 23 | 24 | def print_table(task_names, scores): 25 | tb = PrettyTable() 26 | tb.field_names = task_names 27 | tb.add_row(scores) 28 | print(tb) 29 | 30 | def prepare(params, samples): 31 | return 32 | 33 | def batcher(params, batch): 34 | batch = [' '.join(sent) if sent != [] else '.' for sent in batch] 35 | embeddings = params['encoder'](batch, show_progress_bar=False) 36 | return embeddings 37 | 38 | def main(): 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("--model_name_or_path", type=str, 41 | help="Transformers' model name or path") 42 | parser.add_argument("--mode", type=str, 43 | choices=['dev', 'test', 'fasttest'], 44 | default='test', 45 | help="Evaluation mode (dev: fast mode, dev results; test: full mode, test results; fasttest: fast mode, test results)") 46 | parser.add_argument("--task_set", type=str, 47 | choices=['sts', 'transfer', 'full', 'na'], 48 | default='sts', 49 | help="What set of tasks to evaluate on. If not 'na', this will override '--tasks'") 50 | parser.add_argument("--tasks", type=str, nargs='+', 51 | default=['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 52 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC', 53 | 'SICKRelatedness', 'STSBenchmark'], 54 | help="Tasks to evaluate on.
If '--task_set' is specified, this will be overridden") 55 | 56 | args = parser.parse_args() 57 | 58 | # Load sentence transformers' model checkpoint 59 | model = SentenceTransformer(args.model_name_or_path) 60 | 61 | # Set up the tasks 62 | if args.task_set == 'sts': 63 | args.tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness'] 64 | elif args.task_set == 'transfer': 65 | args.tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC'] 66 | elif args.task_set == 'full': 67 | args.tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness'] 68 | args.tasks += ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC'] 69 | 70 | # Set params for SentEval 71 | if args.mode == 'dev' or args.mode == 'fasttest': 72 | # Fast mode 73 | params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 74 | params['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 75 | 'tenacity': 3, 'epoch_size': 2} 76 | elif args.mode == 'test': 77 | # Full mode 78 | params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10} 79 | params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64, 80 | 'tenacity': 5, 'epoch_size': 4} 81 | else: 82 | raise NotImplementedError 83 | 84 | results = {} 85 | 86 | for task in args.tasks: 87 | params = {'task_path': 'SentEval/data/', 'usepytorch': True, 'kfold': 10} 88 | params['encoder'] = model.encode 89 | se = senteval.engine.SE(params, batcher, prepare) 90 | result = se.eval(task) 91 | results[task] = result 92 | 93 | # Print evaluation results 94 | if args.mode == 'dev': 95 | print("------ %s ------" % (args.mode)) 96 | 97 | task_names = [] 98 | scores = [] 99 | for task in ['STSBenchmark', 'SICKRelatedness']: 100 | task_names.append(task) 101 | if task in results: 102 | scores.append("%.2f" % (results[task]['dev']['spearman'][0] * 100)) 103 | else: 104 | scores.append("0.00") 105 | print_table(task_names, scores) 106 | 107 | task_names = [] 108 | scores = [] 109 | for task in ['MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC']: 110 | task_names.append(task) 111 | if task in results: 112 | scores.append("%.2f" % (results[task]['devacc'])) 113 | else: 114 | scores.append("0.00") 115 | task_names.append("Avg.") 116 | scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores))) 117 | print_table(task_names, scores) 118 | 119 | elif args.mode == 'test' or args.mode == 'fasttest': 120 | print("------ %s ------" % (args.mode)) 121 | 122 | task_names = [] 123 | scores = [] 124 | for task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']: 125 | task_names.append(task) 126 | if task in results: 127 | if task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']: 128 | scores.append("%.2f" % (results[task]['all']['spearman']['all'] * 100)) 129 | else: 130 | scores.append("%.2f" % (results[task]['test']['spearman'].correlation * 100)) 131 | else: 132 | scores.append("0.00") 133 | task_names.append("Avg.") 134 | scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores))) 135 | print_table(task_names, scores) 136 | 137 | task_names = [] 138 | scores = [] 139 | for task in ['MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC']: 140 | task_names.append(task) 141 | if task in results: 142 | scores.append("%.2f" % (results[task]['acc'])) 143 | else: 144 | scores.append("0.00") 145 | task_names.append("Avg.") 146 | scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores))) 147 | print_table(task_names, scores) 
148 | 149 | 150 | if __name__ == "__main__": 151 | main() -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/EmbeddingSimilarityEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator, SimilarityFunction 2 | import logging 3 | import os 4 | import csv 5 | from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances 6 | from scipy.stats import pearsonr, spearmanr 7 | import numpy as np 8 | from typing import List 9 | from ..readers import InputExample 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class EmbeddingSimilarityEvaluator(SentenceEvaluator): 15 | """ 16 | Evaluate a model based on the similarity of the embeddings by calculating the Spearman and Pearson rank correlation 17 | in comparison to the gold standard labels. 18 | The metrics are the cosine similarity as well as euclidean and Manhattan distance 19 | The returned score is the Spearman correlation with a specified metric. 20 | 21 | The results are written in a CSV. If a CSV already exists, then values are appended. 22 | """ 23 | def __init__(self, sentences1: List[str], sentences2: List[str], scores: List[float], batch_size: int = 16, main_similarity: SimilarityFunction = None, name: str = '', show_progress_bar: bool = False, write_csv: bool = True): 24 | """ 25 | Constructs an evaluator based for the dataset 26 | 27 | The labels need to indicate the similarity between the sentences. 28 | 29 | :param sentences1: List with the first sentence in a pair 30 | :param sentences2: List with the second sentence in a pair 31 | :param scores: Similarity score between sentences1[i] and sentences2[i] 32 | :param write_csv: Write results to a CSV file 33 | """ 34 | self.sentences1 = sentences1 35 | self.sentences2 = sentences2 36 | self.scores = scores 37 | self.write_csv = write_csv 38 | 39 | assert len(self.sentences1) == len(self.sentences2) 40 | assert len(self.sentences1) == len(self.scores) 41 | 42 | self.main_similarity = main_similarity 43 | self.name = name 44 | 45 | self.batch_size = batch_size 46 | if show_progress_bar is None: 47 | show_progress_bar = (logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG) 48 | self.show_progress_bar = show_progress_bar 49 | 50 | self.csv_file = "similarity_evaluation"+("_"+name if name else '')+"_results.csv" 51 | self.csv_headers = ["epoch", "steps", "cosine_pearson", "cosine_spearman", "euclidean_pearson", "euclidean_spearman", "manhattan_pearson", "manhattan_spearman", "dot_pearson", "dot_spearman"] 52 | 53 | @classmethod 54 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 55 | sentences1 = [] 56 | sentences2 = [] 57 | scores = [] 58 | 59 | for example in examples: 60 | sentences1.append(example.texts[0]) 61 | sentences2.append(example.texts[1]) 62 | scores.append(example.label) 63 | return cls(sentences1, sentences2, scores, **kwargs) 64 | 65 | 66 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 67 | if epoch != -1: 68 | if steps == -1: 69 | out_txt = " after epoch {}:".format(epoch) 70 | else: 71 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 72 | else: 73 | out_txt = ":" 74 | 75 | logger.info("EmbeddingSimilarityEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 76 | 77 | embeddings1 = model.encode(self.sentences1, 
batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True) 78 | embeddings2 = model.encode(self.sentences2, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True) 79 | labels = self.scores 80 | 81 | cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2)) 82 | manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2) 83 | euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2) 84 | dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)] 85 | 86 | 87 | eval_pearson_cosine, _ = pearsonr(labels, cosine_scores) 88 | eval_spearman_cosine, _ = spearmanr(labels, cosine_scores) 89 | 90 | eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances) 91 | eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances) 92 | 93 | eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances) 94 | eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances) 95 | 96 | eval_pearson_dot, _ = pearsonr(labels, dot_products) 97 | eval_spearman_dot, _ = spearmanr(labels, dot_products) 98 | 99 | logger.info("Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format( 100 | eval_pearson_cosine, eval_spearman_cosine)) 101 | # logger.info("Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format( 102 | # eval_pearson_manhattan, eval_spearman_manhattan)) 103 | # logger.info("Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format( 104 | # eval_pearson_euclidean, eval_spearman_euclidean)) 105 | # logger.info("Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format( 106 | # eval_pearson_dot, eval_spearman_dot)) 107 | 108 | if output_path is not None and self.write_csv: 109 | csv_path = os.path.join(output_path, self.csv_file) 110 | output_file_exists = os.path.isfile(csv_path) 111 | with open(csv_path, newline='', mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 112 | writer = csv.writer(f) 113 | if not output_file_exists: 114 | writer.writerow(self.csv_headers) 115 | 116 | writer.writerow([epoch, steps, eval_pearson_cosine, eval_spearman_cosine, eval_pearson_euclidean, 117 | eval_spearman_euclidean, eval_pearson_manhattan, eval_spearman_manhattan, eval_pearson_dot, eval_spearman_dot]) 118 | 119 | 120 | if self.main_similarity == SimilarityFunction.COSINE: 121 | return eval_spearman_cosine 122 | elif self.main_similarity == SimilarityFunction.EUCLIDEAN: 123 | return eval_spearman_euclidean 124 | elif self.main_similarity == SimilarityFunction.MANHATTAN: 125 | return eval_spearman_manhattan 126 | elif self.main_similarity == SimilarityFunction.DOT_PRODUCT: 127 | return eval_spearman_dot 128 | elif self.main_similarity is None: 129 | return max(eval_spearman_cosine, eval_spearman_manhattan, eval_spearman_euclidean, eval_spearman_dot) 130 | else: 131 | raise ValueError("Unknown main_similarity value") 132 | -------------------------------------------------------------------------------- /SentEval/senteval/engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | # 7 | 8 | ''' 9 | 10 | Generic sentence evaluation scripts wrapper 11 | 12 | ''' 13 | from __future__ import absolute_import, division, unicode_literals 14 | 15 | from senteval import utils 16 | from senteval.binary import CREval, MREval, MPQAEval, SUBJEval 17 | from senteval.snli import SNLIEval 18 | from senteval.trec import TRECEval 19 | from senteval.sick import SICKEntailmentEval, SICKEval 20 | from senteval.mrpc import MRPCEval 21 | from senteval.sts import STS12Eval, STS13Eval, STS14Eval, STS15Eval, STS16Eval, STSBenchmarkEval, SICKRelatednessEval, STSBenchmarkFinetune 22 | from senteval.sst import SSTEval 23 | from senteval.rank import ImageCaptionRetrievalEval 24 | from senteval.probing import * 25 | 26 | class SE(object): 27 | def __init__(self, params, batcher, prepare=None): 28 | # parameters 29 | params = utils.dotdict(params) 30 | params.usepytorch = True if 'usepytorch' not in params else params.usepytorch 31 | params.seed = 1111 if 'seed' not in params else params.seed 32 | 33 | params.batch_size = 128 if 'batch_size' not in params else params.batch_size 34 | params.nhid = 0 if 'nhid' not in params else params.nhid 35 | params.kfold = 5 if 'kfold' not in params else params.kfold 36 | 37 | if 'classifier' not in params or not params['classifier']: 38 | params.classifier = {'nhid': 0} 39 | 40 | assert 'nhid' in params.classifier, 'Set number of hidden units in classifier config!!' 41 | 42 | self.params = params 43 | 44 | # batcher and prepare 45 | self.batcher = batcher 46 | self.prepare = prepare if prepare else lambda x, y: None 47 | 48 | self.list_tasks = ['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 49 | 'SICKRelatedness', 'SICKEntailment', 'STSBenchmark', 50 | 'SNLI', 'ImageCaptionRetrieval', 'STS12', 'STS13', 51 | 'STS14', 'STS15', 'STS16', 52 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 53 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 54 | 'OddManOut', 'CoordinationInversion', 'SICKRelatedness-finetune', 'STSBenchmark-finetune', 'STSBenchmark-fix'] 55 | 56 | def eval(self, name): 57 | # evaluate on evaluation [name], either takes string or list of strings 58 | if (isinstance(name, list)): 59 | self.results = {x: self.eval(x) for x in name} 60 | return self.results 61 | 62 | tpath = self.params.task_path 63 | assert name in self.list_tasks, str(name) + ' not in ' + str(self.list_tasks) 64 | 65 | # Original SentEval tasks 66 | if name == 'CR': 67 | self.evaluation = CREval(tpath + '/downstream/CR', seed=self.params.seed) 68 | elif name == 'MR': 69 | self.evaluation = MREval(tpath + '/downstream/MR', seed=self.params.seed) 70 | elif name == 'MPQA': 71 | self.evaluation = MPQAEval(tpath + '/downstream/MPQA', seed=self.params.seed) 72 | elif name == 'SUBJ': 73 | self.evaluation = SUBJEval(tpath + '/downstream/SUBJ', seed=self.params.seed) 74 | elif name == 'SST2': 75 | self.evaluation = SSTEval(tpath + '/downstream/SST/binary', nclasses=2, seed=self.params.seed) 76 | elif name == 'SST5': 77 | self.evaluation = SSTEval(tpath + '/downstream/SST/fine', nclasses=5, seed=self.params.seed) 78 | elif name == 'TREC': 79 | self.evaluation = TRECEval(tpath + '/downstream/TREC', seed=self.params.seed) 80 | elif name == 'MRPC': 81 | self.evaluation = MRPCEval(tpath + '/downstream/MRPC', seed=self.params.seed) 82 | elif name == 'SICKRelatedness': 83 | self.evaluation = SICKRelatednessEval(tpath + '/downstream/SICK', seed=self.params.seed) 84 | elif name == 'STSBenchmark': 85 | self.evaluation = STSBenchmarkEval(tpath + '/downstream/STS/STSBenchmark', 
seed=self.params.seed) 86 | elif name == 'STSBenchmark-fix': 87 | self.evaluation = STSBenchmarkEval(tpath + '/downstream/STS/STSBenchmark-fix', seed=self.params.seed) 88 | elif name == 'STSBenchmark-finetune': 89 | self.evaluation = STSBenchmarkFinetune(tpath + '/downstream/STS/STSBenchmark', seed=self.params.seed) 90 | elif name == 'SICKRelatedness-finetune': 91 | self.evaluation = SICKEval(tpath + '/downstream/SICK', seed=self.params.seed) 92 | elif name == 'SICKEntailment': 93 | self.evaluation = SICKEntailmentEval(tpath + '/downstream/SICK', seed=self.params.seed) 94 | elif name == 'SNLI': 95 | self.evaluation = SNLIEval(tpath + '/downstream/SNLI', seed=self.params.seed) 96 | elif name in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']: 97 | fpath = name + '-en-test' 98 | self.evaluation = eval(name + 'Eval')(tpath + '/downstream/STS/' + fpath, seed=self.params.seed) 99 | elif name == 'ImageCaptionRetrieval': 100 | self.evaluation = ImageCaptionRetrievalEval(tpath + '/downstream/COCO', seed=self.params.seed) 101 | 102 | # Probing Tasks 103 | elif name == 'Length': 104 | self.evaluation = LengthEval(tpath + '/probing', seed=self.params.seed) 105 | elif name == 'WordContent': 106 | self.evaluation = WordContentEval(tpath + '/probing', seed=self.params.seed) 107 | elif name == 'Depth': 108 | self.evaluation = DepthEval(tpath + '/probing', seed=self.params.seed) 109 | elif name == 'TopConstituents': 110 | self.evaluation = TopConstituentsEval(tpath + '/probing', seed=self.params.seed) 111 | elif name == 'BigramShift': 112 | self.evaluation = BigramShiftEval(tpath + '/probing', seed=self.params.seed) 113 | elif name == 'Tense': 114 | self.evaluation = TenseEval(tpath + '/probing', seed=self.params.seed) 115 | elif name == 'SubjNumber': 116 | self.evaluation = SubjNumberEval(tpath + '/probing', seed=self.params.seed) 117 | elif name == 'ObjNumber': 118 | self.evaluation = ObjNumberEval(tpath + '/probing', seed=self.params.seed) 119 | elif name == 'OddManOut': 120 | self.evaluation = OddManOutEval(tpath + '/probing', seed=self.params.seed) 121 | elif name == 'CoordinationInversion': 122 | self.evaluation = CoordinationInversionEval(tpath + '/probing', seed=self.params.seed) 123 | 124 | self.params.current_task = name 125 | self.evaluation.do_prepare(self.params, self.prepare) 126 | 127 | self.results = self.evaluation.run(self.params, self.batcher) 128 | 129 | return self.results 130 | -------------------------------------------------------------------------------- /sentence_transformers_congen/evaluation/TripletEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator, SimilarityFunction 2 | import logging 3 | import os 4 | import csv 5 | from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances 6 | from typing import List 7 | from ..readers import InputExample 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class TripletEvaluator(SentenceEvaluator): 14 | """ 15 | Evaluate a model based on a triplet: (sentence, positive_example, negative_example). 16 | Checks if distance(sentence, positive_example) < distance(sentence, negative_example). 
17 | """ 18 | 19 | def __init__( 20 | self, 21 | anchors: List[str], 22 | positives: List[str], 23 | negatives: List[str], 24 | main_distance_function: SimilarityFunction = None, 25 | name: str = "", 26 | batch_size: int = 16, 27 | show_progress_bar: bool = False, 28 | write_csv: bool = True, 29 | ): 30 | """ 31 | :param anchors: Sentences to check similarity to. (e.g. a query) 32 | :param positives: List of positive sentences 33 | :param negatives: List of negative sentences 34 | :param main_distance_function: One of 0 (Cosine), 1 (Euclidean) or 2 (Manhattan). Defaults to None, returning all 3. 35 | :param name: Name for the output 36 | :param batch_size: Batch size used to compute embeddings 37 | :param show_progress_bar: If true, prints a progress bar 38 | :param write_csv: Write results to a CSV file 39 | """ 40 | self.anchors = anchors 41 | self.positives = positives 42 | self.negatives = negatives 43 | self.name = name 44 | 45 | assert len(self.anchors) == len(self.positives) 46 | assert len(self.anchors) == len(self.negatives) 47 | 48 | self.main_distance_function = main_distance_function 49 | 50 | self.batch_size = batch_size 51 | if show_progress_bar is None: 52 | show_progress_bar = ( 53 | logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG 54 | ) 55 | self.show_progress_bar = show_progress_bar 56 | 57 | self.csv_file: str = "triplet_evaluation" + ("_" + name if name else "") + "_results.csv" 58 | self.csv_headers = ["epoch", "steps", "accuracy_cosinus", "accuracy_manhatten", "accuracy_euclidean"] 59 | self.write_csv = write_csv 60 | 61 | @classmethod 62 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 63 | anchors = [] 64 | positives = [] 65 | negatives = [] 66 | 67 | for example in examples: 68 | anchors.append(example.texts[0]) 69 | positives.append(example.texts[1]) 70 | negatives.append(example.texts[2]) 71 | return cls(anchors, positives, negatives, **kwargs) 72 | 73 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 74 | if epoch != -1: 75 | if steps == -1: 76 | out_txt = " after epoch {}:".format(epoch) 77 | else: 78 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 79 | else: 80 | out_txt = ":" 81 | 82 | logger.info("TripletEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 83 | 84 | num_triplets = 0 85 | num_correct_cos_triplets, num_correct_manhatten_triplets, num_correct_euclidean_triplets = 0, 0, 0 86 | 87 | embeddings_anchors = model.encode( 88 | self.anchors, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True 89 | ) 90 | embeddings_positives = model.encode( 91 | self.positives, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True 92 | ) 93 | embeddings_negatives = model.encode( 94 | self.negatives, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True 95 | ) 96 | 97 | # Cosine distance 98 | pos_cos_distance = paired_cosine_distances(embeddings_anchors, embeddings_positives) 99 | neg_cos_distances = paired_cosine_distances(embeddings_anchors, embeddings_negatives) 100 | 101 | # Manhattan 102 | pos_manhattan_distance = paired_manhattan_distances(embeddings_anchors, embeddings_positives) 103 | neg_manhattan_distances = paired_manhattan_distances(embeddings_anchors, embeddings_negatives) 104 | 105 | # Euclidean 106 | pos_euclidean_distance = paired_euclidean_distances(embeddings_anchors, embeddings_positives) 107 
| neg_euclidean_distances = paired_euclidean_distances(embeddings_anchors, embeddings_negatives) 108 | 109 | for idx in range(len(pos_cos_distance)): 110 | num_triplets += 1 111 | 112 | if pos_cos_distance[idx] < neg_cos_distances[idx]: 113 | num_correct_cos_triplets += 1 114 | 115 | if pos_manhattan_distance[idx] < neg_manhattan_distances[idx]: 116 | num_correct_manhatten_triplets += 1 117 | 118 | if pos_euclidean_distance[idx] < neg_euclidean_distances[idx]: 119 | num_correct_euclidean_triplets += 1 120 | 121 | accuracy_cos = num_correct_cos_triplets / num_triplets 122 | accuracy_manhattan = num_correct_manhatten_triplets / num_triplets 123 | accuracy_euclidean = num_correct_euclidean_triplets / num_triplets 124 | 125 | logger.info("Accuracy Cosine Distance: \t{:.2f}".format(accuracy_cos * 100)) 126 | logger.info("Accuracy Manhattan Distance:\t{:.2f}".format(accuracy_manhattan * 100)) 127 | logger.info("Accuracy Euclidean Distance:\t{:.2f}\n".format(accuracy_euclidean * 100)) 128 | 129 | if output_path is not None and self.write_csv: 130 | csv_path = os.path.join(output_path, self.csv_file) 131 | if not os.path.isfile(csv_path): 132 | with open(csv_path, newline="", mode="w", encoding="utf-8") as f: 133 | writer = csv.writer(f) 134 | writer.writerow(self.csv_headers) 135 | writer.writerow([epoch, steps, accuracy_cos, accuracy_manhattan, accuracy_euclidean]) 136 | 137 | else: 138 | with open(csv_path, newline="", mode="a", encoding="utf-8") as f: 139 | writer = csv.writer(f) 140 | writer.writerow([epoch, steps, accuracy_cos, accuracy_manhattan, accuracy_euclidean]) 141 | 142 | if self.main_distance_function == SimilarityFunction.COSINE: 143 | return accuracy_cos 144 | if self.main_distance_function == SimilarityFunction.MANHATTAN: 145 | return accuracy_manhattan 146 | if self.main_distance_function == SimilarityFunction.EUCLIDEAN: 147 | return accuracy_euclidean 148 | 149 | return max(accuracy_cos, accuracy_manhattan, accuracy_euclidean) 150 | --------------------------------------------------------------------------------
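Taken together, these modules follow the standard sentence-transformers layout: a Transformer module produces token embeddings, a Pooling module collapses them into a fixed-size sentence embedding, and the evaluator classes score the resulting model. The snippet below is an illustrative sketch, not a file from this repository: it assumes the `sentence_transformers_congen.models` and `.evaluation` sub-packages re-export these classes as in upstream sentence-transformers, that the forked `SentenceTransformer` keeps the upstream `modules=` constructor, and it uses `bert-base-uncased` plus toy sentence pairs as placeholders.

```python
# Illustrative sketch only (not part of the repository); model name and data are placeholders.
from sentence_transformers_congen import SentenceTransformer
from sentence_transformers_congen.models import Transformer, Pooling
from sentence_transformers_congen.evaluation import EmbeddingSimilarityEvaluator

# Student model: token embeddings from a Transformer, mean-pooled into one sentence vector.
word_embedding_model = Transformer('bert-base-uncased', max_seq_length=128)
pooling_model = Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='mean')
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Score the model with EmbeddingSimilarityEvaluator; gold scores are toy values in [0, 1].
sentences1 = ["A man is playing a guitar.", "A woman is cooking.", "Two dogs run in the park."]
sentences2 = ["Someone plays an instrument.", "A person prepares food.", "The stock market fell today."]
gold_scores = [0.8, 0.7, 0.1]
evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, gold_scores, name='toy-sts')
best_spearman = evaluator(model)  # with main_similarity=None, returns the best Spearman across metrics
print("Spearman: {:.4f}".format(best_spearman))
```

evaluation.py above wires the same kind of model into SentEval instead, through the `batcher` function that calls `model.encode` on raw sentences and the per-task `senteval.engine.SE` runs.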