├── .gitignore ├── HKUST.jpg ├── LICENCE ├── PPCM.png ├── README.md ├── dialogGPT_discr.py ├── dm.png ├── download_data.sh ├── evaluate.py ├── experiment_runner ├── run_classifier.sh ├── run_evaluate.sh └── run_evaluate_WD.sh ├── get_example.py ├── interact.py ├── interact_adapter.py ├── main.py ├── metric ├── bleu.py ├── dist_score.py ├── lm_score.py ├── multi-bleu.perl ├── sentiment_classifiers.py ├── text_classifier.py └── torchMoji │ ├── .gitignore │ ├── .travis.yml │ ├── LICENSE │ ├── README.md │ ├── data │ ├── .gitkeep │ ├── Olympic │ │ └── raw.pickle │ ├── PsychExp │ │ └── raw.pickle │ ├── SCv1 │ │ └── raw.pickle │ ├── SCv2-GEN │ │ └── raw.pickle │ ├── SE0714 │ │ └── raw.pickle │ ├── SS-Twitter │ │ └── raw.pickle │ ├── SS-Youtube │ │ └── raw.pickle │ ├── emoji_codes.json │ └── kaggle-insults │ │ └── raw.pickle │ ├── emoji_overview.png │ ├── examples │ ├── .gitkeep │ ├── README.md │ ├── __init__.py │ ├── create_twitter_vocab.py │ ├── dataset_split.py │ ├── encode_texts.py │ ├── example_helper.py │ ├── finetune_insults_chain-thaw.py │ ├── finetune_semeval_class-avg_f1.py │ ├── finetune_youtube_last.py │ ├── score_texts_emojis.py │ ├── text_emojize.py │ ├── tokenize_dataset.py │ └── vocab_extension.py │ ├── model │ ├── .gitkeep │ └── vocabulary.json │ ├── scripts │ ├── analyze_all_results.py │ ├── analyze_results.py │ ├── calculate_coverages.py │ ├── convert_all_datasets.py │ ├── download_weights.py │ ├── finetune_dataset.py │ └── results │ │ └── .gitkeep │ ├── setup.py │ ├── tests │ ├── test_finetuning.py │ ├── test_helper.py │ ├── test_sentence_tokenizer.py │ ├── test_tokenizer.py │ └── test_word_generator.py │ └── torchmoji │ ├── .gitkeep │ ├── __init__.py │ ├── attlayer.py │ ├── class_avg_finetuning.py │ ├── create_vocab.py │ ├── filter_input.py │ ├── filter_utils.py │ ├── finetuning.py │ ├── global_variables.py │ ├── lstm.py │ ├── model_def.py │ ├── sentence_tokenizer.py │ ├── tokenizer.py │ └── word_generator.py ├── models ├── heads.py ├── pplm.py ├── pytorch_pretrained_bert │ ├── __init__.py │ ├── __main__.py │ ├── convert_gpt2_checkpoint_to_pytorch.py │ ├── convert_openai_checkpoint_to_pytorch.py │ ├── convert_tf_checkpoint_to_pytorch.py │ ├── convert_transfo_xl_checkpoint_to_pytorch.py │ ├── file_utils.py │ ├── modeling.py │ ├── modeling_adapter.py │ ├── modeling_gpt2.py │ ├── modeling_openai.py │ ├── modeling_transfo_xl.py │ ├── modeling_transfo_xl_utilities.py │ ├── optimization.py │ ├── optimization_openai.py │ ├── tokenization.py │ ├── tokenization_gpt2.py │ ├── tokenization_openai.py │ └── tokenization_transfo_xl.py └── wd.py ├── pytorch-logo-dark.png ├── requirements.txt ├── scorer.py ├── train_score.py ├── train_supervised_adapter.py └── utils ├── helper.py ├── torchtext_text_classification.py └── utils_sample.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | venv 3 | **/__pycache__/ 4 | 5 | data 6 | human_evaluation 7 | models/dialoGPT 8 | models/discriminators 9 | results 10 | -------------------------------------------------------------------------------- /HKUST.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andreamad8/PPCM/e5bef1bbb70907a3d65de3225a00e4af9104d4a8/HKUST.jpg -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) 2020 Zhaojiang Lin 5 | 6 | Permission is hereby 
granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. -------------------------------------------------------------------------------- /PPCM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andreamad8/PPCM/e5bef1bbb70907a3d65de3225a00e4af9104d4a8/PPCM.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Plug-and-Play Conversational Models 2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 3 | 4 | 5 | 6 | 7 | This is the implementation of the paper: 8 | 9 | **Plug-and-Play Conversational Models**. [**Andrea Madotto**](https://andreamad8.github.io), Etsuko Ishii, [**Zhaojiang Lin**](https://zlinao.github.io/), [**Sumanth Dathathri**](https://dathath.github.io/), Pascale Fung [[PDF]](https://www.aclweb.org/anthology/2020.findings-emnlp.219/) **EMNLP2020** (findings) 10 | 11 | If you use any of the source code or datasets included in this toolkit in your work, please cite the following paper. The bibtex is listed below: 12 |
 13 | @inproceedings{madotto2020plug,
 14 |   title={Plug-and-Play Conversational Models},
 15 |   author={Madotto, Andrea and Ishii, Etsuko and Lin, Zhaojiang and Dathathri, Sumanth and Fung, Pascale},
 16 |   booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings},
 17 |   pages={2422--2433},
 18 |   year={2020}
 19 | }
 20 | 
21 | 22 | ## Abstract 23 | There has been considerable progress made towards conversational models that generate coherent 24 | and fluent responses; however, this often involves training large language models on large dialogue datasets, such as Reddit. 25 | These large conversational models provide little control over the generated responses, and this control is further limited in the absence of annotated conversational datasets for attribute-specific generation 26 | that can be used for fine-tuning the model. In this paper, we first propose and evaluate plug-and-play methods 27 | for controllable response generation, which do not require 28 | dialogue-specific datasets and do not rely on fine-tuning a large model. 29 | While effective, the decoding procedure induces considerable computational overhead, 30 | rendering the conversational model unsuitable for interactive usage. 31 | To overcome this, we introduce an approach that requires neither 32 | further computation at decoding time nor any fine-tuning of a large language model. We demonstrate, through extensive automatic and human evaluation, a high degree of 33 | control over the generated conversational responses with regard to multiple 34 | desired attributes, while remaining fluent. 35 | 36 | ## Plug-and-Play Conversational Models (PPCM) 37 |

38 | <img src="PPCM.png" width="100%"> 39 | <img src="dm.png" width="100%"> 40 | 41 | ## Basic Usage 42 | 43 | ### Dependencies 44 | Create a `python3.6` virtual environment and run `pip install -r requirements.txt`. 45 |
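A minimal setup sketch (assuming a `python3.6` binary is on your PATH; any virtual-environment tool works):
```console
❱❱❱ python3.6 -m venv venv
❱❱❱ source venv/bin/activate
❱❱❱ pip install -r requirements.txt
```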

46 | ### Discriminator Training 47 | ``` 48 | python dialogGPT_discr.py --save_model --dataset sentiment --cached --epochs 100 49 | python dialogGPT_discr.py --save_model --dataset daily_dialogue_act --cached --epochs 100 50 | python dialogGPT_discr.py --save_model --dataset TC_AG_NEWS --cached --epochs 50 51 | python dialogGPT_discr.py --save_model --dataset TC_SogouNews --cached --epochs 50 52 | python dialogGPT_discr.py --save_model --dataset TC_DBpedia --cached --epochs 10 53 | python dialogGPT_discr.py --save_model --dataset TC_YahooAnswers --cached --epochs 10 54 | ``` 55 | ### Run PPLM 56 | By omitting the `--evaluate` flag, you can run PPLM in an interactive mode. 57 | ``` 58 | python main.py -D AG_NEWS --label_class 0 --length 30 --num_samples 10 --evaluate --verbose --all_starter --wd 59 | python main.py -D AG_NEWS --label_class 1 --length 30 --num_samples 10 --evaluate --verbose --all_starter --wd 60 | python main.py -D AG_NEWS --label_class 2 --length 30 --num_samples 10 --evaluate --verbose --all_starter --wd 61 | python main.py -D AG_NEWS --label_class 3 --length 30 --num_samples 10 --evaluate --verbose --all_starter --wd 62 | python main.py -D sentiment --label_class 3 --length 30 --num_samples 10 --evaluate --verbose --all_starter 63 | python main.py -D sentiment --label_class 2 --length 30 --num_samples 10 --evaluate --verbose --all_starter 64 | python main.py -D daily_dialogue_act --label_class 1 --length 30 --num_samples 10 --evaluate --verbose --all_starter 65 | ``` 66 | 67 | ### Run Adapter 68 | ``` 69 | python train_supervised_adapter.py --dataset SENT --label very_negative --iter 75 --lr 6.25e-4 70 | python train_supervised_adapter.py --dataset SENT --label very_positive --iter 25 71 | python train_supervised_adapter.py --dataset QUEST --label question --iter 25 72 | python train_supervised_adapter.py --dataset TOPI --label Business --iter 25 73 | python train_supervised_adapter.py --dataset TOPI --label SciTech --iter 25 74 | python train_supervised_adapter.py --dataset TOPI --label Sports --iter 25 75 | ```
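Once an adapter is trained, you can chat with it by passing the saved checkpoint to `main.py` and omitting `--evaluate`. A usage sketch (the timestamped run directory is the example path from the comment in `interact_adapter.py`; substitute the one produced by your own training run):
```console
❱❱❱ python main.py -D sentiment --label_class 3 --length 30 --num_samples 10 --verbose --load_check_point_adapter runs/SENT_very_negative_Mar30_13-59-53/pytorch_model.bin
```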
76 | 77 | 78 | 79 | ## Reproducibility 80 | 81 | You can **simply run** `./download_data.sh` to download and extract all required files, or you can perform the required actions manually, by following the steps outlined below: 82 | 83 | ### Manual setup 84 | 85 | ***Dataset*** 86 | 87 | Download the [**datasets**](https://drive.google.com/file/d/1LvAsTzJWIEZsb5orG4vvJz0376mH27l2/) 88 | ```console 89 | ❱❱❱ unzip data.zip 90 | ``` 91 | 92 | ***DialoGPT*** 93 | 94 | Download [**dialoGPT**](https://drive.google.com/file/d/1V8juN486jpeqPhKrGeuJ8WcpaCAy4D3-/view?usp=sharing) 95 | ```console 96 | ❱❱❱ unzip dialoGPT.zip 97 | ❱❱❱ mv dialoGPT models 98 | ``` 99 | ***Discriminators*** 100 | 101 | Download the [**discriminators**](https://drive.google.com/file/d/1IzbfGOKkbbEXaoyVxBI0kD_bWODKjVpn/view?usp=sharing) 102 | ```console 103 | ❱❱❱ unzip discriminators.zip 104 | ❱❱❱ mv discriminators models 105 | ``` 106 | 107 | ***Scorers*** 108 | 109 | Download the [**scorers**](https://drive.google.com/file/d/1rxgYyYEpWVH0qd2uxJQxC5uVIaApJMMf/view?usp=sharing) 110 | ```console 111 | ❱❱❱ unzip scorers.zip 112 | ❱❱❱ mv scorers models 113 | ``` 114 | 115 | ***Reproducibility*** 116 | 117 | Download the [**generated responses**](https://drive.google.com/file/d/1gZmaQ94kQmf1-N04LmW7rN-gOrt-Sa46/view?usp=sharing) 118 | ```console 119 | ❱❱❱ unzip evaluate.zip 120 | ❱❱❱ mv evaluate results 121 | ❱❱❱ python evaluate.py 122 | ``` 123 | In each folder you can find the responses generated by PPLM using multiple styles. These are used to train each of the adapters using ```train_supervised_adapter.py```. 124 | 125 | Download the [**Human Evaluation Score**](https://drive.google.com/file/d/1BEtFR694-8x61_-iANr2TMVck8QRqkA4/view?usp=sharing) 126 | ```console 127 | ❱❱❱ unzip human_evaluation.zip 128 | ``` 129 | Here you can find the Jupyter notebook to replicate the human evaluation results, together with the human judgment scores. 130 | 131 | ***Run*** 132 | Check the experiment_runner folder to see how to run the generation. 133 | 134 | ## Toxicity 135 | For researchers working on abusive language, we also have responses generated using a toxicity classifier. We can release them upon request, exclusively for research purposes. 136 | 137 | 138 | 139 | ## Acknowledgement 140 | We would like to thank the [**MLC**](http://mlcollective.org/) for the feedback on the early stage of this work, and especially [Jason Yosinski](http://yosinski.com/). This repository is implemented based on [**Huggingface**](https://github.com/huggingface/transfer-learning-conv-ai) 141 | 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /dm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andreamad8/PPCM/e5bef1bbb70907a3d65de3225a00e4af9104d4a8/dm.png -------------------------------------------------------------------------------- /download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1LvAsTzJWIEZsb5orG4vvJz0376mH27l2' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1LvAsTzJWIEZsb5orG4vvJz0376mH27l2" -O data.zip && rm -rf /tmp/cookies.txt 4 | unzip data.zip 5 | rm data.zip 6 | 7 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1V8juN486jpeqPhKrGeuJ8WcpaCAy4D3-' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1V8juN486jpeqPhKrGeuJ8WcpaCAy4D3-" -O dialoGPT.zip && rm -rf /tmp/cookies.txt 8 | unzip dialoGPT.zip 9 | mv dialoGPT models 10 | rm dialoGPT.zip 11 | 12 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1IzbfGOKkbbEXaoyVxBI0kD_bWODKjVpn ' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1IzbfGOKkbbEXaoyVxBI0kD_bWODKjVpn" -O discriminators.zip && rm -rf /tmp/cookies.txt 13 | unzip discriminators.zip 14 | mv discriminators models 15 | rm discriminators.zip 16 | 17 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1rxgYyYEpWVH0qd2uxJQxC5uVIaApJMMf ' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1rxgYyYEpWVH0qd2uxJQxC5uVIaApJMMf" -O scorers.zip && rm -rf /tmp/cookies.txt 18 | unzip scorers.zip 19 |
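# Note on the wget calls in this script: the inner wget fetches Google Drive's
# download-confirmation token (saved in /tmp/cookies.txt), and the outer wget then
# re-requests the file with confirm=<token>&id=<file id>; the trailing rm removes the cookie jar.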
mv scorers models 20 | rm scorers.zip 21 | 22 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1gZmaQ94kQmf1-N04LmW7rN-gOrt-Sa46 ' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1gZmaQ94kQmf1-N04LmW7rN-gOrt-Sa46" -O evaluate.zip && rm -rf /tmp/cookies.txt 23 | unzip evaluate.zip 24 | mv evaluate results 25 | rm evaluate.zip 26 | python evaluate.py 27 | 28 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1BEtFR694-8x61_-iANr2TMVck8QRqkA4 ' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1BEtFR694-8x61_-iANr2TMVck8QRqkA4" -O human_evaluation.zip && rm -rf /tmp/cookies.txt 29 | unzip human_evaluation.zip 30 | rm human_evaluation.zip -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | from tabulate import tabulate 2 | tabulate.PRESERVE_WHITESPACE = True 3 | from utils.helper import cut_seq_to_eos, parse_prefixes, get_name 4 | from utils.helper import EOS_ID, print_loss_matplotlib 5 | from utils.utils_sample import scorer 6 | from sklearn.model_selection import ParameterGrid 7 | from models.pplm import latent_perturb 8 | from models.wd import weight_decoder 9 | import os 10 | import numpy as np 11 | import jsonlines 12 | 13 | def make_header(args,id_starter,knowledge): 14 | str_title = "" 15 | str_title += "===================================================\n" 16 | str_title += f"Model={args.model_size} Window={args.window_length} Iteration={args.num_iterations} Step_size={args.stepsize}\n" 17 | str_title += "===================================================\n" 18 | name_experiment = f"Iter={args.num_iterations}_Step={args.stepsize}_Start={id_starter}_W={args.window_length}" 19 | if(knowledge): 20 | str_title += f"Knowledge={knowledge}\n" 21 | str_title += "===================================================\n" 22 | knol = knowledge.replace(" ","_") 23 | name_experiment = f"Iter={args.num_iterations}_Know={knol}_Step={args.stepsize}_Start={id_starter}_W={args.window_length}" 24 | return str_title, name_experiment 25 | 26 | def logger_conv_ent(args,conv,enc,id_starter,logger,class2idx,classifier,knowledge=None,gold=None): 27 | str_title, name_experiment = make_header(args,id_starter,knowledge) 28 | acc_original = [] 29 | acc_pplm = [] 30 | for turn in conv: 31 | if(turn['speaker']=="PPLM"): 32 | str_title += "===================================================\n" 33 | str_title += "PPLM\n" 34 | str_title += "===================================================\n" 35 | hypotesis, acc_pplm, plots_array = scorer(args,turn,classifier,enc,class2idx,knowledge,plot=False,gold=gold) 36 | str_title += tabulate(hypotesis, headers=['Id', 'Loss','Dist','Label', 'BLEU/F1','Text'], tablefmt='simple',floatfmt=".2f",colalign=("center","center","center","center","left")) 37 | str_title += "\n" 38 | if(args.verbose): 39 | print(str_title) 40 | else: 41 | print_loss_matplotlib(plots_array,loss_original,str_title,logger,name=name_experiment) 42 | elif(turn['speaker']=="DGPT"): 43 | str_title += "===================================================\n" 44 | str_title += "DGPT\n" 45 | str_title += 
"===================================================\n" 46 | if(not args.bag_of_words): 47 | hypotesis_original, acc_original, _ = scorer(args,turn,classifier,enc,class2idx,knowledge,gold=gold) 48 | str_title += tabulate(hypotesis_original, headers=['Id','Loss','Dist','Label', 'BLEU/F1','Text'], tablefmt='simple',floatfmt=".2f",colalign=("center","center","center","center","left")) 49 | str_title += "\n" 50 | loss_original = hypotesis_original[0][1] 51 | else: 52 | hypotesis_original = [[i, enc.decode(cut_seq_to_eos(t))] for i, t in enumerate(turn['text'])] 53 | str_title += tabulate(hypotesis_original, headers=['Id','Text'], tablefmt='orgtbl') 54 | loss_original = 0 55 | str_title += "===================================================\n" 56 | else: ## human case 57 | str_title += f"{turn['speaker']} >>> {turn['text']}\n" 58 | loss_original = 0 59 | 60 | return acc_pplm, acc_original, hypotesis, hypotesis_original 61 | 62 | 63 | def evaluate(args,model,enc,classifier,entailment,task_ent,class2idx,param_grid,device,logger): 64 | if(entailment): 65 | list_starters = parse_prefixes(args,entailment=True,task=task_ent) 66 | else: 67 | list_starters = parse_prefixes(args,tokenizer=enc,seed=args.seed) 68 | for param in list(ParameterGrid(param_grid)): 69 | args.stepsize = param["steps"] 70 | args.num_iterations = param["iter"] 71 | args.window_length = param["window"] 72 | print("===================================================") 73 | print(f"Model={args.model_size} Discrim={args.discrim} Window={args.window_length} Iteration={args.num_iterations} Step_size={args.stepsize}") 74 | print("===================================================") 75 | global_acc_original, global_acc_PPLM = [], [] 76 | lab = class2idx[args.label_class].replace(" ","_").replace("/","") 77 | base_path = f"results/evaluate/{args.discrim}_class_{lab}/" 78 | name = get_name(args,base_path,class2idx) 79 | mode = 'w' 80 | if os.path.exists(name): 81 | num_lines = sum(1 for line in open(name,'r')) 82 | list_starters = list_starters[num_lines:] 83 | mode = 'a' 84 | with jsonlines.open(get_name(args,base_path,class2idx), mode=mode) as writer: 85 | for id_starter, starter in enumerate(list_starters): 86 | conversation = [] 87 | for t in starter["conversation"]: 88 | conversation.append({"speaker":"human", "text":t}) 89 | 90 | history = starter["conversation"] 91 | context_tokens = sum([enc.encode(h) + [EOS_ID] for h in history],[]) 92 | 93 | if(args.wd): 94 | context_tokens = [context_tokens] 95 | original_sentence, perturb_sentence, _, loss, _ = weight_decoder(model=model, enc=enc, 96 | args=args, context=context_tokens, 97 | device=device,repetition_penalty=args.repetition_penalty, 98 | classifier=classifier.classifier_head,knowledge=starter["knowledge"]) 99 | else: 100 | context_tokens = [context_tokens for _ in range(args.num_samples)] 101 | original_sentence, perturb_sentence, _, loss, _ = latent_perturb(model=model, enc=enc, 102 | args=args, context=context_tokens, 103 | device=device,repetition_penalty=args.repetition_penalty, 104 | classifier=classifier.classifier_head,knowledge=starter["knowledge"]) 105 | conversation.append({"speaker":"DGPT","text":original_sentence.tolist()}) 106 | conversation.append({"speaker":"PPLM","text":perturb_sentence.tolist(),"loss":loss}) 107 | acc_pplm, acc_original, hypotesis, hypotesis_original = logger_conv_ent(args,conversation,enc,id_starter,logger,class2idx=class2idx,classifier=classifier,knowledge=starter["knowledge"],gold=starter["gold"]) 108 | global_acc_PPLM.append(acc_pplm) 109 
| global_acc_original.append(acc_original) 110 | writer.write({"acc":{"DGPT":acc_original,"PPLM":acc_pplm}, "hyp":{"DGPT":hypotesis_original,"PPLM":hypotesis},"conversation":starter}) 111 | 112 | 113 | print(f"Global Acc original:{np.mean(global_acc_original)} Acc PPLM:{np.mean(global_acc_PPLM)}") 114 | print() 115 | print() -------------------------------------------------------------------------------- /experiment_runner/run_classifier.sh: -------------------------------------------------------------------------------- 1 | ## TRAIN the head 2 | CUDA_VISIBLE_DEVICES=4 python dialogGPT_discr.py --save_model --dataset sentiment --cached --epochs 100 > results/discriminator_training/sentiment_new.txt 3 | CUDA_VISIBLE_DEVICES=4 python dialogGPT_discr.py --save_model --dataset daily_dialogue_act --cached --epochs 100 > results/discriminator_training/daily_dialogue_act_new.txt 4 | CUDA_VISIBLE_DEVICES=4 python dialogGPT_discr.py --save_model --dataset empathetic_dialogue --cached --epochs 100 > results/discriminator_training/empathetic_dialogue_new.txt # CUDA_VISIBLE_DEVICES=4 python dialogGPT_discr.py --save_model --dataset TC_AG_NEWS --cached --epochs 50 > results/discriminator_training/TC_AG_NEWS_new.txt 5 | 6 | ## TRAIN the scorer 7 | CUDA_VISIBLE_DEVICES=0 python train_score.py --dataset AmazonReviewFull > results/discriminator_training/Amazon5.txt 8 | CUDA_VISIBLE_DEVICES=3 python train_score.py --dataset TC_AG_NEWS > results/discriminator_training/BERT_TEST_AG_NEWS.txt 9 | 10 | -------------------------------------------------------------------------------- /experiment_runner/run_evaluate.sh: -------------------------------------------------------------------------------- 1 | # ## very negative 2 | # # DialGPT + PPLM 3 | CUDA_VISIBLE_DEVICES=1 python main.py -D sentiment --label_class 3 --length 30 --num_samples 10 --evaluate --verbose --sample_starter 10 4 | 5 | # ## very positive 6 | # # DialGPT 7 | # # DialGPT + PPLM 8 | CUDA_VISIBLE_DEVICES=1 python main.py -D sentiment --label_class 2 --length 30 --num_samples 10 --evaluate --verbose --sample_starter 10 9 | 10 | # ## Daily Dialogue ACT class=1 Question 11 | # # DialGPT 12 | # # DialGPT + PPLM 13 | CUDA_VISIBLE_DEVICES=1 python main.py -D daily_dialogue_act --label_class 1 --length 30 --num_samples 10 --evaluate --verbose --sample_starter 10 14 | 15 | # ## Text Classification 16 | # # DialGPT 17 | # # DialGPT + PPLM 18 | CUDA_VISIBLE_DEVICES=1 python main.py -D AG_NEWS --label_class 0 --length 30 --num_samples 10 --evaluate --verbose --sample_starter 10 19 | CUDA_VISIBLE_DEVICES=1 python main.py -D AG_NEWS --label_class 1 --length 30 --num_samples 10 --evaluate --verbose --sample_starter 10 20 | CUDA_VISIBLE_DEVICES=1 python main.py -D AG_NEWS --label_class 2 --length 30 --num_samples 10 --evaluate --verbose --sample_starter 10 21 | CUDA_VISIBLE_DEVICES=1 python main.py -D AG_NEWS --label_class 3 --length 30 --num_samples 10 --evaluate --verbose --sample_starter 10 22 | 23 | -------------------------------------------------------------------------------- /experiment_runner/run_evaluate_WD.sh: -------------------------------------------------------------------------------- 1 | ## very negative 2 | # DialGPT + WD 3 | CUDA_VISIBLE_DEVICES=1 python main.py -D sentiment --label_class 3 --length 30 --num_samples 10 --evaluate --verbose --sample_starter 10 --wd 4 | 5 | ## very positive 6 | # DialGPT 7 | # DialGPT + WD 8 | CUDA_VISIBLE_DEVICES=1 python main.py -D sentiment --label_class 2 --length 30 --num_samples 10 --evaluate --verbose
--sample_starter 10 --wd 9 | 10 | ## Daily Dialogue ACT class=1 Question 11 | # DialGPT 12 | # DialGPT + WD 13 | CUDA_VISIBLE_DEVICES=1 python main.py -D daily_dialogue_act --label_class 1 --length 30 --num_samples 10 --evaluate --verbose --sample_starter 10 --wd 14 | 15 | ## Text Classficiation 16 | # DialGPT 17 | # DialGPT + WD 18 | CUDA_VISIBLE_DEVICES=1 python main.py -D AG_NEWS --label_class 0 --length 30 --num_samples 10 --evaluate --verbose --sample_starter 10 --wd 19 | CUDA_VISIBLE_DEVICES=1 python main.py -D AG_NEWS --label_class 1 --length 30 --num_samples 10 --evaluate --verbose --sample_starter 10 --wd 20 | CUDA_VISIBLE_DEVICES=1 python main.py -D AG_NEWS --label_class 2 --length 30 --num_samples 10 --evaluate --verbose --sample_starter 10 --wd 21 | CUDA_VISIBLE_DEVICES=1 python main.py -D AG_NEWS --label_class 3 --length 30 --num_samples 10 --evaluate --verbose --sample_starter 10 --wd -------------------------------------------------------------------------------- /interact.py: -------------------------------------------------------------------------------- 1 | from tabulate import tabulate 2 | tabulate.PRESERVE_WHITESPACE = True 3 | from utils.helper import EOS_ID 4 | from models.pplm import latent_perturb 5 | from utils.utils_sample import scorer 6 | import torch.nn.functional as F 7 | import torch 8 | 9 | 10 | def top_k_logits(logits, k, probs=False): 11 | """ 12 | Masks everything but the k top entries as -infinity (1e10). 13 | Used to mask logits such that e^-infinity -> 0 won't contribute to the 14 | sum of the denominator. 15 | """ 16 | if k == 0: 17 | return logits 18 | else: 19 | values = torch.topk(logits, k)[0] 20 | batch_mins = values[:, -1].view(-1, 1).expand_as(logits) 21 | if probs: 22 | return torch.where(logits < batch_mins, torch.ones_like(logits) * 0.0, logits) 23 | return torch.where(logits < batch_mins, torch.ones_like(logits) * -1e10, logits) 24 | 25 | def sample(model, args, classifier, context=None, past=None, device='cuda', 26 | sample=True, repetition_penalty=1.0): 27 | output = torch.tensor(context, device=device, dtype=torch.long) if context else None 28 | output_response = output.new_zeros([output.size(0),0]) 29 | stopped = [0 for _ in range(output.size(0))] 30 | for i in range(args.length): 31 | 32 | if past is None and output is not None: 33 | prev = output[:, -1:] 34 | _, past = model(output[:, :-1]) 35 | 36 | logits, past = model(prev, past=past) 37 | 38 | logits = logits[:, -1, :] / args.temperature # + SmallConst 39 | for i_o, o_ in enumerate(output): 40 | for token_idx in set(o_.tolist()): 41 | if logits[i_o, token_idx] < 0: 42 | logits[i_o, token_idx] *= repetition_penalty 43 | else: 44 | logits[i_o, token_idx] /= repetition_penalty 45 | 46 | logits = top_k_logits(logits, k=args.top_k) # + SmallConst 47 | log_probs = F.softmax(logits, dim=-1) 48 | 49 | if sample: 50 | prev = torch.multinomial(log_probs, num_samples=1) 51 | else: 52 | _, prev = torch.topk(log_probs, k=1, dim=-1) 53 | 54 | output = prev if output is None else torch.cat((output, prev), dim=1) # update output 55 | output_response = torch.cat((output_response, prev), dim=1) 56 | 57 | for i_p, p in enumerate(prev.tolist()): 58 | if(p[0]) == EOS_ID: 59 | stopped[i_p] = 1 60 | 61 | if(all(x == 1 for x in stopped)): break 62 | 63 | return output_response 64 | 65 | 66 | 67 | def interact(args,model,enc,classifier,class2idx,speaker,device,logger): 68 | history = [] 69 | while True: 70 | raw_text = input("USR >>> ") 71 | while not raw_text: 72 | print('Prompt should not be empty!') 73 
| raw_text = input("USR >>>") 74 | history.append(raw_text) 75 | 76 | context_tokens = sum([enc.encode(h) + [EOS_ID] for h in history],[]) 77 | context_tokens = [context_tokens for _ in range(args.num_samples)] 78 | 79 | 80 | if(speaker=="PPLM"): 81 | original_sentence, perturb_sentence, _, loss, _ = latent_perturb(model=model, enc=enc, 82 | args=args, context=context_tokens, 83 | device=device,repetition_penalty=args.repetition_penalty, 84 | classifier=classifier.classifier_head) 85 | spk_turn = {"text":perturb_sentence.tolist()} 86 | else: 87 | original_sentence = sample(model=model,args=args, context=context_tokens, device=device, 88 | classifier=classifier.classifier_head, repetition_penalty=args.repetition_penalty, 89 | ) 90 | spk_turn = {"text":original_sentence.tolist()} 91 | hypotesis, _, _ = scorer(args,spk_turn,classifier,enc,class2idx,knowledge=None,plot=False) 92 | text = hypotesis[0][-1] 93 | 94 | print(f"SYS >>> {text}") 95 | history.append(text) 96 | history = history[-(2*args.max_history+1):] -------------------------------------------------------------------------------- /interact_adapter.py: -------------------------------------------------------------------------------- 1 | from tabulate import tabulate 2 | tabulate.PRESERVE_WHITESPACE = True 3 | from utils.helper import load_classifier 4 | from utils.helper import EOS_ID 5 | from utils.utils_sample import scorer 6 | import torch.nn.functional as F 7 | import torch 8 | from nltk import tokenize 9 | 10 | #CUDA_VISIBLE_DEVICES=2 python main.py -D sentiment --label_class 3 --length 30 --num_samples 1 --interact --verbose --speaker DGPT --load_check_point_adapter runs/SENT_very_negative_Mar30_13-59-53/pytorch_model.bin 11 | 12 | def top_k_logits(logits, k, probs=False): 13 | """ 14 | Masks everything but the k top entries as -infinity (1e10). 15 | Used to mask logits such that e^-infinity -> 0 won't contribute to the 16 | sum of the denominator. 
17 | """ 18 | if k == 0: 19 | return logits 20 | else: 21 | values = torch.topk(logits, k)[0] 22 | batch_mins = values[:, -1].view(-1, 1).expand_as(logits) 23 | if probs: 24 | return torch.where(logits < batch_mins, torch.ones_like(logits) * 0.0, logits) 25 | return torch.where(logits < batch_mins, torch.ones_like(logits) * -1e10, logits) 26 | 27 | def sample(model, args, context=None, past=None, device='cuda', 28 | sample=True, repetition_penalty=1.0): 29 | output = torch.tensor(context, device=device, dtype=torch.long) if context else None 30 | output_response = output.new_zeros([output.size(0),0]) 31 | stopped = [0 for _ in range(output.size(0))] 32 | for i in range(args.length): 33 | 34 | if past is None and output is not None: 35 | prev = output[:, -1:] 36 | _, past = model(output[:, :-1]) 37 | 38 | logits, past = model(prev, past=past) 39 | 40 | logits = logits[:, -1, :] / args.temperature # + SmallConst 41 | for i_o, o_ in enumerate(output): 42 | for token_idx in set(o_.tolist()): 43 | if logits[i_o, token_idx] < 0: 44 | logits[i_o, token_idx] *= repetition_penalty 45 | else: 46 | logits[i_o, token_idx] /= repetition_penalty 47 | 48 | logits = top_k_logits(logits, k=args.top_k) # + SmallConst 49 | log_probs = F.softmax(logits, dim=-1) 50 | 51 | if sample: 52 | prev = torch.multinomial(log_probs, num_samples=1) 53 | else: 54 | _, prev = torch.topk(log_probs, k=1, dim=-1) 55 | 56 | output = prev if output is None else torch.cat((output, prev), dim=1) # update output 57 | output_response = torch.cat((output_response, prev), dim=1) 58 | 59 | for i_p, p in enumerate(prev.tolist()): 60 | if(p[0]) == EOS_ID: 61 | stopped[i_p] = 1 62 | 63 | if(all(x == 1 for x in stopped)): break 64 | 65 | return output_response 66 | 67 | def get_rankers(args,model): 68 | classifiers = {} 69 | 70 | args.discrim = 'sentiment' 71 | args.label_class = 2 72 | classifier, class2idx = load_classifier(args, model) 73 | classifiers['a'] = [classifier, class2idx] 74 | 75 | args.discrim = 'sentiment' 76 | args.label_class = 3 77 | classifier, class2idx = load_classifier(args, model) 78 | classifiers['b'] = [classifier, class2idx] 79 | 80 | args.discrim = 'daily_dialogue_act' 81 | args.label_class = 1 82 | classifier, class2idx = load_classifier(args, model) 83 | classifiers['c'] = [classifier, class2idx] 84 | 85 | args.discrim = 'toxicity' 86 | args.label_class = 1 87 | classifier, class2idx = load_classifier(args, model) 88 | classifiers['d'] = [classifier, class2idx] 89 | 90 | args.discrim = 'AG_NEWS' 91 | args.label_class = 0 92 | classifier, class2idx = load_classifier(args, model) 93 | classifiers['e'] = [classifier, class2idx] 94 | 95 | args.discrim = 'AG_NEWS' 96 | args.label_class = 1 97 | classifier, class2idx = load_classifier(args, model) 98 | classifiers['f'] = [classifier, class2idx] 99 | 100 | args.discrim = 'AG_NEWS' 101 | args.label_class = 2 102 | classifier, class2idx = load_classifier(args, model) 103 | classifiers['g'] = [classifier, class2idx] 104 | 105 | args.discrim = 'AG_NEWS' 106 | args.label_class = 3 107 | classifier, class2idx = load_classifier(args, model) 108 | classifiers['h'] = [classifier, class2idx] 109 | 110 | return classifiers 111 | 112 | def interact(args,model,enc,classifier,class2idx,device): 113 | classifiers = get_rankers(args,model) 114 | history = [] 115 | while True: 116 | raw_text = input("USR >>> ") 117 | while not raw_text: 118 | print('Prompt should not be empty!') 119 | raw_text = input("USR >>>") 120 | 121 | style = input("Choose a style \n (a) Positive (b) Negative 
(c) Question (d) Toxic (e) World (f) Sports (g) Business (h) Sci/Tech (i) DGPT \n >>> ") 122 | if(style == "a"): 123 | classifier,class2idx = classifiers["a"] 124 | args.num_samples = 10 125 | task_id = 1 126 | args.label_class = 2 127 | elif(style == "b"): 128 | classifier,class2idx = classifiers["b"] 129 | args.num_samples = 10 130 | task_id = 0 131 | args.label_class = 3 132 | elif(style == "c"): 133 | classifier,class2idx = classifiers["c"] 134 | args.num_samples = 10 135 | task_id = 3 136 | args.label_class = 1 137 | elif(style == "d"): 138 | classifier,class2idx = classifiers["d"] 139 | args.num_samples = 10 140 | task_id = 2 141 | args.label_class = 1 142 | elif(style == "e"): 143 | classifier,class2idx = classifiers["e"] 144 | args.num_samples = 10 145 | task_id = 7 146 | args.label_class = 0 147 | elif(style == "f"): 148 | classifier,class2idx = classifiers["f"] 149 | args.num_samples = 10 150 | task_id = 6 151 | args.label_class = 1 152 | 153 | elif(style == "g"): 154 | classifier,class2idx = classifiers["g"] 155 | args.num_samples = 10 156 | task_id = 4 157 | args.label_class = 2 158 | 159 | elif(style == "h"): 160 | classifier,class2idx = classifiers["h"] 161 | args.num_samples = 10 162 | task_id = 5 163 | args.label_class = 3 164 | else: 165 | args.num_samples = 1 166 | args.label_class = 0 167 | task_id = -1 168 | 169 | 170 | history.append(raw_text) 171 | 172 | context_tokens = sum([enc.encode(h) + [EOS_ID] for h in history],[]) 173 | context_tokens = [context_tokens for _ in range(args.num_samples)] 174 | 175 | original_sentence = sample(model=model,args=args, context=context_tokens, device=device, 176 | repetition_penalty=args.repetition_penalty) 177 | spk_turn = {"text":original_sentence.tolist()} 178 | hypotesis, _, _ = scorer(args,spk_turn,classifier,enc,class2idx,knowledge=None,plot=False) 179 | text = hypotesis[0][-1] 180 | text = " ".join(tokenize.sent_tokenize(text)[:2]) 181 | # print(text_sent) 182 | # print(text_sent[0]) 183 | print(f"SYS >>> {text}") 184 | history.append(text) 185 | history = history[-(2*args.max_history+1):] -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | import numpy as np 5 | from transformers import GPT2Tokenizer 6 | 7 | from interact_adapter import interact 8 | from utils.helper import load_classifier, load_model, load_model_recursive 9 | from evaluate import evaluate 10 | 11 | def run_model(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--model_size', type=str, default="medium", help='Size of dialoGPT') 14 | parser.add_argument('--model_path', '-M', type=str, default='gpt-2_pt_models/dialoGPT/', 15 | help='pretrained model name or path to local checkpoint') 16 | parser.add_argument('--discrim', '-D', type=str, default=None, 17 | choices=('sentiment',"daily_dialogue_act", 18 | "AG_NEWS"), 19 | help='Discriminator to use for loss-type 2') 20 | parser.add_argument('--label_class', type=int, default=-1, help='Class label used for the discriminator') 21 | parser.add_argument('--stepsize', type=float, default=0.03) 22 | parser.add_argument('--num_iterations', type=int, default=2) 23 | parser.add_argument("--length", type=int, default=100) 24 | parser.add_argument("--seed", type=int, default=5555) 25 | parser.add_argument("--temperature", type=float, default=1) 26 | parser.add_argument('--repetition_penalty', type=float, default=1.1) #1.1 27 | 
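# PPLM sampling/perturbation knobs (cf. Dathathri et al., 2020): --top_k truncates
# sampling to the k most likely tokens; --gm_scale mixes the perturbed and the
# unperturbed token distributions; --kl_scale weights the KL penalty that keeps the
# perturbed distribution close to the original DialoGPT output.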
parser.add_argument("--top_k", type=int, default=10) 28 | parser.add_argument("--gm_scale", type=float, default=0.95) 29 | parser.add_argument("--kl_scale", type=float, default=0.01) 30 | parser.add_argument('--nocuda', action='store_true', help='no cuda') 31 | parser.add_argument('--grad_length', type=int, default=10000) 32 | parser.add_argument('--num_samples', type=int, default=1, 33 | help='Number of samples to generate from the modified latents') 34 | parser.add_argument('--horizon_length', type=int, default=1, help='Length of future to optimize over') 35 | # parser.add_argument('--force-token', action='store_true', help='no cuda') 36 | parser.add_argument('--window_length', type=int, default=0, 37 | help='Length of past which is being optimizer; 0 corresponds to infinite window length') 38 | parser.add_argument('--decay', action='store_true', help='whether to decay or not') 39 | parser.add_argument('--gamma', type=float, default=1.0) 40 | parser.add_argument("--max_history", type=int, default=-1) 41 | parser.add_argument('--evaluate', action='store_true', help='evaluate') 42 | parser.add_argument('--wd', action='store_true', help='greedy based on rev comments') 43 | parser.add_argument('--verbose', action='store_true', help='verbose mode, no comet print in the terminal') 44 | parser.add_argument('--bow_scale_weight', type=float, default=20) 45 | parser.add_argument('--sample_starter', type=int, default=0) 46 | parser.add_argument('--all_starter', action='store_true', help='selfchat') 47 | parser.add_argument("--speaker", type=str, default="PPLM") 48 | parser.add_argument("--task_ent", type=str, default="data/simple_QA/QA.json") 49 | parser.add_argument("--load_check_point_adapter", type=str, default="None") 50 | parser.add_argument("--task_id", type=int, default=0) 51 | parser.add_argument("--trial_id", type=int, default=1) 52 | parser.add_argument("--entailment", type=bool, default=False) 53 | parser.add_argument("--BCE", type=bool, default=False) 54 | parser.add_argument("--bag_of_words", type=str, default=None) 55 | args = parser.parse_args() 56 | torch.manual_seed(args.seed) 57 | torch.cuda.manual_seed(args.seed) 58 | np.random.seed(args.seed) 59 | torch.backends.cudnn.deterministic = True 60 | torch.backends.cudnn.benchmark = False 61 | if(args.load_check_point_adapter != "None"): 62 | print("LOADING ADAPTER CONFIG FILE AND INTERACTIVE SCRIPT") 63 | from models.pytorch_pretrained_bert.modeling_adapter import GPT2LMHeadModel, GPT2Config 64 | else: 65 | from models.pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Config 66 | 67 | device = 'cpu' if args.nocuda else 'cuda' 68 | args.model_path = f'models/dialoGPT/{args.model_size}/' 69 | config = GPT2Config.from_json_file(os.path.join(args.model_path, 'config.json')) 70 | tokenizer = GPT2Tokenizer.from_pretrained(args.model_path) 71 | 72 | if(args.load_check_point_adapter != "None"): 73 | print("Loading ADAPTERS") 74 | model = load_model_recursive(GPT2LMHeadModel(config,default_task_id=args.task_id), args.load_check_point_adapter, args, verbose=True) 75 | else: 76 | model = load_model(GPT2LMHeadModel(config), args.model_path+f"{args.model_size}_ft.pkl", args, verbose=True) 77 | model.to(device).eval() 78 | 79 | # Freeze Models weights 80 | for param in model.parameters(): 81 | param.requires_grad = False 82 | 83 | classifier, class2idx = load_classifier(args, model) 84 | 85 | logger = None 86 | 87 | ## set iter to 0 to run the adapter 88 | ## set iter to 50 to run WD 89 | param_grid = {'iter': [75], 'window': [0], 'steps': 
[0.02]} 90 | 91 | if(args.evaluate): 92 | evaluate(args,model,tokenizer,classifier,args.entailment,args.task_ent,class2idx,param_grid,device,logger) 93 | else: 94 | interact(args, model, tokenizer, classifier, class2idx, device) 95 | 96 | if __name__ == '__main__': 97 | run_model() 98 | -------------------------------------------------------------------------------- /metric/bleu.py: -------------------------------------------------------------------------------- 1 | """ROUGE metric implementation. 2 | Copy from tf_seq2seq/seq2seq/metrics/rouge.py. 3 | This is a modified and slightly extended verison of 4 | https://github.com/miso-belica/sumy/blob/dev/sumy/evaluation/rouge.py. 5 | """ 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | from __future__ import unicode_literals 11 | 12 | import itertools 13 | import numpy as np 14 | 15 | import numpy 16 | import os 17 | import re 18 | import subprocess 19 | import tempfile 20 | import numpy as np 21 | from collections import Counter 22 | 23 | 24 | # -*- coding: utf-8 -*- 25 | # Copyright 2017 Google Inc. 26 | # 27 | # Licensed under the Apache License, Version 2.0 (the "License"); 28 | # you may not use this file except in compliance with the License. 29 | # You may obtain a copy of the License at 30 | # 31 | # http://www.apache.org/licenses/LICENSE-2.0 32 | # 33 | # Unless required by applicable law or agreed to in writing, software 34 | # distributed under the License is distributed on an "AS IS" BASIS, 35 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 36 | # See the License for the specific language governing permissions and 37 | # limitations under the License. 38 | """BLEU metric implementation. 39 | """ 40 | 41 | 42 | def moses_multi_bleu(hypotheses, references, lowercase=False): 43 | """Calculate the bleu score for hypotheses and references 44 | using the MOSES multi-bleu.perl script. 45 | Args: 46 | hypotheses: A numpy array of strings where each string is a single example. 47 | references: A numpy array of strings where each string is a single example. 48 | lowercase: If true, pass the "-lc" flag to the multi-bleu script 49 | Returns: 50 | The BLEU score as a float32 value. 
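Example (illustrative; run from the repository root so metric/multi-bleu.perl resolves):
>>> moses_multi_bleu(np.array(["the cat sat on the mat"]), np.array(["the cat sat on the mat"]))
100.0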
51 | """ 52 | 53 | if np.size(hypotheses) == 0: 54 | return np.float32(0.0) 55 | 56 | multi_bleu_path = "metric/multi-bleu.perl" 57 | os.chmod(multi_bleu_path, 0o755) 58 | 59 | 60 | # Dump hypotheses and references to tempfiles 61 | hypothesis_file = tempfile.NamedTemporaryFile() 62 | hypothesis_file.write("\n".join(hypotheses).encode("utf-8")) 63 | hypothesis_file.write(b"\n") 64 | hypothesis_file.flush() 65 | reference_file = tempfile.NamedTemporaryFile() 66 | reference_file.write("\n".join(references).encode("utf-8")) 67 | reference_file.write(b"\n") 68 | reference_file.flush() 69 | 70 | 71 | # Calculate BLEU using multi-bleu script 72 | with open(hypothesis_file.name, "r") as read_pred: 73 | bleu_cmd = [multi_bleu_path] 74 | if lowercase: 75 | bleu_cmd += ["-lc"] 76 | bleu_cmd += [reference_file.name] 77 | try: 78 | bleu_out = subprocess.check_output(bleu_cmd, stdin=read_pred, stderr=subprocess.STDOUT) 79 | bleu_out = bleu_out.decode("utf-8") 80 | bleu_score = re.search(r"BLEU = (.+?),", bleu_out).group(1) 81 | bleu_score = float(bleu_score) 82 | except subprocess.CalledProcessError as error: 83 | if error.output is not None: 84 | print("multi-bleu.perl script returned non-zero exit code") 85 | print(error.output) 86 | bleu_score = np.float32(0.0) 87 | 88 | # Close temp files 89 | hypothesis_file.close() 90 | reference_file.close() 91 | return bleu_score -------------------------------------------------------------------------------- /metric/dist_score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding:utf-8 3 | 4 | # Copyright (c) Tsinghua university conversational AI group (THU-coai). 5 | # This source code is licensed under the MIT license. 6 | """Script for the Evaluation of Chinese Human-Computer Dialogue Technology (SMP2019-ECDT) Task2. 7 | This script evaluates the distinct[1] of the submitted model. 8 | This uses the version of the dataset which does not contain the "Golden Response". 9 | Leaderboard scores will be run in the same form but on a hidden test set. 10 | 11 | reference: 12 | 13 | [1] Li, Jiwei, et al. "A diversity-promoting objective function for neural conversation models." 14 | arXiv preprint arXiv:1510.03055 (2015).
15 | 16 | This requires each team to implement the following function: 17 | def gen_response(self, contexts): 18 | return a list of responses for each context 19 | Arguments: 20 | contexts -- a list of context, each context contains dialogue histories and personal profiles of every speaker 21 | Returns a list, where each element is the response of the corresponding context 22 | """ 23 | # from main import Model 24 | import json 25 | import sys 26 | from transformers import GPT2Tokenizer,GPT2LMHeadModel 27 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 28 | from utils.helper import truncate 29 | 30 | def read_dialog(file): 31 | """ 32 | Read dialogs from file 33 | :param file: str, file path to the dataset 34 | :return: list, a list of dialogue (context) contained in file 35 | """ 36 | with open(file) as f: 37 | contents = [i.strip() for i in f.readlines() if len(i.strip()) != 0] 38 | return [json.loads(i) for i in contents] 39 | 40 | 41 | def count_ngram(hyps_resp, n): 42 | """ 43 | Count the number of unique n-grams 44 | :param hyps_resp: list, a list of responses 45 | :param n: int, n-gram 46 | :return: the number of unique n-grams in hyps_resp 47 | """ 48 | if len(hyps_resp) == 0: 49 | print("ERROR, eval_distinct get empty input") 50 | return 51 | 52 | if type(hyps_resp[0]) != list: 53 | print("ERROR, eval_distinct takes in a list of list, get a list of {} instead".format( 54 | type(hyps_resp[0]))) 55 | return 56 | 57 | ngram = set() 58 | for resp in hyps_resp: 59 | if len(resp) < n: 60 | continue 61 | for i in range(len(resp) - n + 1): 62 | ngram.add(' '.join(resp[i: i + n])) 63 | return len(ngram) 64 | 65 | 66 | def eval_distinct(hyps_resp): 67 | """ 68 | compute distinct score for the hyps_resp 69 | :param hyps_resp: list, a list of hyps responses 70 | :return: average distinct score for 1, 2-gram 71 | """ 72 | 73 | hyps_resp = [list(map(str, tokenizer.encode(h))) for h in hyps_resp] 74 | 75 | if len(hyps_resp) == 0: 76 | print("ERROR, eval_distinct get empty input") 77 | return 78 | 79 | if type(hyps_resp[0]) != list: 80 | print("ERROR, eval_distinct takes in a list of list, get a list of {} instead".format( 81 | type(hyps_resp[0]))) 82 | return 83 | 84 | hyps_resp = [(' '.join(i)).split() for i in hyps_resp] 85 | num_tokens = sum([len(i) for i in hyps_resp]) 86 | dist1 = count_ngram(hyps_resp, 1) / float(num_tokens) 87 | dist2 = count_ngram(hyps_resp, 2) / float(num_tokens) 88 | dist3 = count_ngram(hyps_resp, 3) / float(num_tokens) 89 | 90 | return truncate(dist1,2), truncate(dist2,2), truncate(dist3,2) 91 | 92 | -------------------------------------------------------------------------------- /metric/lm_score.py: -------------------------------------------------------------------------------- 1 | from transformers import GPT2Tokenizer,GPT2LMHeadModel 2 | import torch 3 | import numpy as np 4 | import math 5 | 6 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 7 | model = GPT2LMHeadModel.from_pretrained('gpt2') 8 | model.to('cuda') 9 | model.eval() 10 | 11 | def get_ppl(text, starter): 12 | lls = [] 13 | for idx, t in enumerate(text): 14 | input_ids = torch.tensor(tokenizer.encode(" ".join(starter[idx]['conversation'])+ " "+t)).unsqueeze(0) # Batch size 1 15 | input_ids = input_ids.to('cuda') 16 | if input_ids.size(1)>1: 17 | with torch.no_grad(): 18 | outputs = model(input_ids, labels=input_ids) 19 | loss, _ = outputs[:2] 20 | lls.append(loss.item()) 21 | 22 | return math.exp(np.mean(lls)) 23 | 24 | def get_ppl_simplified(text, starter): 25 | lls = [] 26 | for idx, t in
enumerate(text): 27 | input_ids = torch.tensor(tokenizer.encode(" ".join(starter[idx])+ " "+t)).unsqueeze(0) # Batch size 1 28 | input_ids = input_ids.to('cuda') 29 | if input_ids.size(1)>1: 30 | with torch.no_grad(): 31 | outputs = model(input_ids, labels=input_ids) 32 | loss, _ = outputs[:2] 33 | lls.append(loss.item()) 34 | 35 | return math.exp(np.mean(lls)) -------------------------------------------------------------------------------- /metric/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | # $Id$ 7 | use warnings; 8 | use strict; 9 | 10 | my $lowercase = 0; 11 | if ($ARGV[0] eq "-lc") { 12 | $lowercase = 1; 13 | shift; 14 | } 15 | 16 | my $stem = $ARGV[0]; 17 | if (!defined $stem) { 18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 20 | exit(1); 21 | } 22 | 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 24 | 25 | my @REF; 26 | my $ref=0; 27 | while(-e "$stem$ref") { 28 | &add_to_ref("$stem$ref",\@REF); 29 | $ref++; 30 | } 31 | &add_to_ref($stem,\@REF) if -e $stem; 32 | die("ERROR: could not find reference file $stem") unless scalar @REF; 33 | 34 | # add additional references explicitly specified on the command line 35 | shift; 36 | foreach my $stem (@ARGV) { 37 | &add_to_ref($stem,\@REF) if -e $stem; 38 | } 39 | 40 | 41 | 42 | sub add_to_ref { 43 | my ($file,$REF) = @_; 44 | my $s=0; 45 | if ($file =~ /.gz$/) { 46 | open(REF,"gzip -dc $file|") or die "Can't read $file"; 47 | } else { 48 | open(REF,$file) or die "Can't read $file"; 49 | } 50 | while() { 51 | chop; 52 | push @{$$REF[$s++]}, $_; 53 | } 54 | close(REF); 55 | } 56 | 57 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 58 | my $s=0; 59 | while() { 60 | chop; 61 | $_ = lc if $lowercase; 62 | my @WORD = split; 63 | my %REF_NGRAM = (); 64 | my $length_translation_this_sentence = scalar(@WORD); 65 | my ($closest_diff,$closest_length) = (9999,9999); 66 | foreach my $reference (@{$REF[$s]}) { 67 | # print "$s $_ <=> $reference\n"; 68 | $reference = lc($reference) if $lowercase; 69 | my @WORD = split(' ',$reference); 70 | my $length = scalar(@WORD); 71 | my $diff = abs($length_translation_this_sentence-$length); 72 | if ($diff < $closest_diff) { 73 | $closest_diff = $diff; 74 | $closest_length = $length; 75 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 76 | } elsif ($diff == $closest_diff) { 77 | $closest_length = $length if $length < $closest_length; 78 | # from two references with the same closeness to me 79 | # take the *shorter* into account, not the "first" one. 80 | } 81 | for(my $n=1;$n<=4;$n++) { 82 | my %REF_NGRAM_N = (); 83 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 84 | my $ngram = "$n"; 85 | for(my $w=0;$w<$n;$w++) { 86 | $ngram .= " ".$WORD[$start+$w]; 87 | } 88 | $REF_NGRAM_N{$ngram}++; 89 | } 90 | foreach my $ngram (keys %REF_NGRAM_N) { 91 | if (!defined($REF_NGRAM{$ngram}) || 92 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 93 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 94 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 95 | } 96 | } 97 | } 98 | } 99 | $length_translation += $length_translation_this_sentence; 100 | $length_reference += $closest_length; 101 | for(my $n=1;$n<=4;$n++) { 102 | my %T_NGRAM = (); 103 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 104 | my $ngram = "$n"; 105 | for(my $w=0;$w<$n;$w++) { 106 | $ngram .= " ".$WORD[$start+$w]; 107 | } 108 | $T_NGRAM{$ngram}++; 109 | } 110 | foreach my $ngram (keys %T_NGRAM) { 111 | $ngram =~ /^(\d+) /; 112 | my $n = $1; 113 | # my $corr = 0; 114 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 115 | $TOTAL[$n] += $T_NGRAM{$ngram}; 116 | if (defined($REF_NGRAM{$ngram})) { 117 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 118 | $CORRECT[$n] += $T_NGRAM{$ngram}; 119 | # $corr = $T_NGRAM{$ngram}; 120 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 121 | } 122 | else { 123 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 124 | # $corr = $REF_NGRAM{$ngram}; 125 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 126 | } 127 | } 128 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 129 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 130 | } 131 | } 132 | $s++; 133 | } 134 | my $brevity_penalty = 1; 135 | my $bleu = 0; 136 | 137 | my @bleu=(); 138 | 139 | for(my $n=1;$n<=4;$n++) { 140 | if (defined ($TOTAL[$n])){ 141 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 142 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 143 | }else{ 144 | $bleu[$n]=0; 145 | } 146 | } 147 | 148 | if ($length_reference==0){ 149 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 150 | exit(1); 151 | } 152 | 153 | if ($length_translation<$length_reference) { 154 | $brevity_penalty = exp(1-$length_reference/$length_translation); 155 | } 156 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 157 | my_log( $bleu[2] ) + 158 | my_log( $bleu[3] ) + 159 | my_log( $bleu[4] ) ) / 4) ; 160 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 161 | 100*$bleu, 162 | 100*$bleu[1], 163 | 100*$bleu[2], 164 | 100*$bleu[3], 165 | 100*$bleu[4], 166 | $brevity_penalty, 167 | $length_translation / $length_reference, 168 | $length_translation, 169 | $length_reference; 170 | 171 | 172 | print STDERR "It is in-advisable to publish scores from multi-bleu.perl. The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups. Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization. Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.\n"; 173 | 174 | sub my_log { 175 | return -9999999999 unless $_[0]; 176 | return log($_[0]); 177 | } -------------------------------------------------------------------------------- /metric/sentiment_classifiers.py: -------------------------------------------------------------------------------- 1 | from models.heads import Scorer 2 | import numpy as np 3 | import torch 4 | from transformers import BertTokenizer 5 | import torch.nn.functional as F 6 | from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 7 | analyser = SentimentIntensityAnalyzer() 8 | 9 | device = "cuda" if torch.cuda.is_available() else "cpu" 10 | model = Scorer(hidden_dim=256, 11 | output_dim=5, 12 | n_layers=2, 13 | bidirectional=True, 14 | dropout=0.25).to(device) 15 | model.load_state_dict(torch.load('models/scorers/AmazonReviewFull.pt')) 16 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 17 | max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased'] 18 | init_token_idx = tokenizer.cls_token_id 19 | eos_token_idx = tokenizer.sep_token_id 20 | pad_token_idx = tokenizer.pad_token_id 21 | unk_token_idx = tokenizer.unk_token_id 22 | 23 | def predict_sentiment(model, tokenizer, sentence): 24 | model.eval() 25 | tokens = tokenizer.tokenize(sentence) 26 | tokens = tokens[:max_input_length-2] 27 | indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx] 28 | tensor = torch.LongTensor(indexed).to(device) 29 | tensor = tensor.unsqueeze(0) 30 | prediction = F.softmax(model(tensor),1) 31 | pred_t = prediction.argmax(dim=1, keepdim=True) 32 | 33 | return pred_t.item() 34 | 35 | def get_loss(model, tokenizer, sentence, label): 36 | model.eval() 37 | ce_loss_logging = torch.nn.CrossEntropyLoss() 38 | tokens = tokenizer.tokenize(sentence) 39 | tokens = tokens[:max_input_length-2] 40 | indexed = [init_token_idx] + 
tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx] 41 | tensor = torch.LongTensor(indexed).to(device) 42 | tensor = tensor.unsqueeze(0) 43 | label = torch.tensor([label], device=device, dtype=torch.long) 44 | output_t = model(tensor) 45 | loss = ce_loss_logging(output_t, label).item() 46 | return loss 47 | 48 | # def predict_sentiment(model, tokenizer, sentence): 49 | # model.eval() 50 | # tokens = tokenizer.tokenize(sentence) 51 | # tokens = tokens[:max_input_length-2] 52 | # indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx] 53 | # tensor = torch.LongTensor(indexed).to(device) 54 | # tensor = tensor.unsqueeze(0) 55 | # prediction = torch.round(torch.sigmoid(model(tensor))) 56 | 57 | # return prediction.item() 58 | 59 | def sentiment_analyzer_scores(sentence): # maps the VADER compound score to {0: very negative, 1: neutral, 2: very positive} 60 | score = analyser.polarity_scores(sentence) 61 | if(score["compound"] >= 0.05): return 2 62 | elif(score["compound"] > -0.05 and (score["compound"] < 0.05)): return 1 63 | elif(score["compound"] <= -0.05): return 0 64 | 65 | def get_vater_score(sentences, l): # accuracy against the VADER analyzer 66 | label_map = {"very negative":0,"very positive":2} 67 | acc = [] 68 | for s in sentences: 69 | prediction = sentiment_analyzer_scores(s) 70 | if(int(prediction)==int(label_map[l])):acc.append(1) 71 | else: acc.append(0) 72 | 73 | return np.mean(acc) 74 | 75 | 76 | def get_sentiment_score(sentences, l): # accuracy against the AmazonReviewFull scorer 77 | label_map = {"very negative":0,"very positive":4} 78 | acc = [] 79 | for s in sentences: 80 | prediction = predict_sentiment(model, tokenizer, s) 81 | if(int(prediction)==int(label_map[l])):acc.append(1) 82 | else: acc.append(0) 83 | 84 | return np.mean(acc) 85 | -------------------------------------------------------------------------------- /metric/text_classifier.py: -------------------------------------------------------------------------------- 1 | from models.heads import Scorer 2 | import numpy as np 3 | import torch 4 | from transformers import BertTokenizer 5 | import torch.nn.functional as F 6 | from collections import Counter 7 | 8 | device = "cuda" if torch.cuda.is_available() else "cpu" 9 | 10 | model_AGNEWS = Scorer(hidden_dim=256, 11 | output_dim=4, 12 | n_layers=2, 13 | bidirectional=True, 14 | dropout=0.25).to(device) 15 | model_AGNEWS.load_state_dict(torch.load('models/scorers/TC_AG_NEWS.pt')) 16 | 17 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 18 | max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased'] 19 | init_token_idx = tokenizer.cls_token_id 20 | eos_token_idx = tokenizer.sep_token_id 21 | pad_token_idx = tokenizer.pad_token_id 22 | unk_token_idx = tokenizer.unk_token_id 23 | 24 | def predict_class(model, tokenizer, sentence): 25 | model.eval() 26 | tokens = tokenizer.tokenize(sentence) 27 | tokens = tokens[:max_input_length-2] 28 | indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx] 29 | tensor = torch.LongTensor(indexed).to(device) 30 | tensor = tensor.unsqueeze(0) 31 | prediction = F.softmax(model(tensor),1) 32 | pred_t = prediction.argmax(dim=1, keepdim=True) 33 | return pred_t.item() 34 | 35 | 36 | def get_text_score_AGNEWS(sentences,l): 37 | idx2class = ["World","Sports","Business","SciTech"] 38 | class2idx = {c:i for i, c in enumerate(idx2class)} 39 | idx2class = {i: c for i, c in enumerate(idx2class)} 40 | pred = [] 41 | acc = [] 42 | for s in sentences: 43 | prediction = predict_class(model_AGNEWS, tokenizer, s) 44 | pred.append(idx2class[prediction]) 45 | if(int(prediction)==int(class2idx[l])): 46 | acc.append(1) 47 | else: 
acc.append(0) 48 | 49 | counter = Counter(pred) 50 | return np.mean(acc) #"".join([ f"({freq/len(sentences)}){w}" for w, freq in counter.most_common(1)]) 51 | -------------------------------------------------------------------------------- /metric/torchMoji/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # Local data 92 | /data/local 93 | 94 | # Vim swapfiles 95 | *.swp 96 | *.swo 97 | 98 | # nosetests 99 | .noseids 100 | 101 | # pyTorch model 102 | pytorch_model.bin 103 | 104 | # VSCODE 105 | .vscode/* 106 | 107 | # data 108 | *.csv 109 | -------------------------------------------------------------------------------- /metric/torchMoji/.travis.yml: -------------------------------------------------------------------------------- 1 | group: travis_latest 2 | language: python 3 | cache: pip 4 | python: 5 | - 2.7 6 | - 3.6 7 | #- nightly 8 | #- pypy 9 | #- pypy3 10 | matrix: 11 | allow_failures: 12 | - python: nightly 13 | - python: pypy 14 | - python: pypy3 15 | install: 16 | #- pip install -r requirements.txt 17 | - pip install flake8 # pytest # add another testing frameworks later 18 | before_script: 19 | # stop the build if there are Python syntax errors or undefined names 20 | - flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics 21 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 22 | - flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 23 | script: 24 | - true # pytest --capture=sys # add other tests here 25 | notifications: 26 | on_success: change 27 | on_failure: change # `always` will be the setting once code changes slow down 28 | -------------------------------------------------------------------------------- /metric/torchMoji/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Bjarke Felbo, Han Thi Nguyen, Thomas Wolf 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /metric/torchMoji/README.md: -------------------------------------------------------------------------------- 1 | ### ------ Update September 2018 ------ 2 | It's been a year since TorchMoji and DeepMoji were released. We're trying to understand how they are being used so that we can make improvements and design better models in the future. 3 | 4 | You can help us achieve this by answering this [4-question Google Form](https://docs.google.com/forms/d/e/1FAIpQLSe1h4NSQD30YM8dsbJQEnki-02_9KVQD34qgP9to0bwAHBvBA/viewform "DeepMoji Google Form"). Thanks for your support! 5 | 6 | # 😇 TorchMoji 7 | 8 | > **Read our blog post about the implementation process [here](https://medium.com/huggingface/understanding-emotions-from-keras-to-pytorch-3ccb61d5a983).** 9 | 10 | TorchMoji is a [pyTorch](http://pytorch.org/) implementation of the [DeepMoji](https://github.com/bfelbo/DeepMoji) model developed by Bjarke Felbo, Alan Mislove, Anders Søgaard, Iyad Rahwan and Sune Lehmann. 11 | 12 | This model was trained on 1.2 billion tweets with emojis to understand how language is used to express emotions. Through transfer learning the model can obtain state-of-the-art performance on many emotion-related text modeling tasks. 13 | 14 | Try the online demo of DeepMoji at [http://deepmoji.mit.edu](http://deepmoji.mit.edu/)! See the [paper](https://arxiv.org/abs/1708.00524), [blog post](https://medium.com/@bjarkefelbo/what-can-we-learn-from-emojis-6beb165a5ea0) or [FAQ](https://www.media.mit.edu/projects/deepmoji/overview/) for more details. 15 | 16 | ## Overview 17 | * [torchmoji/](torchmoji) contains all the underlying code needed to convert a dataset to the vocabulary and use the model.
18 | * [examples/](examples) contains short code snippets showing how to convert a dataset to the vocabulary, load up the model and run it on that dataset. 19 | * [scripts/](scripts) contains code for processing and analysing datasets to reproduce results in the paper. 20 | * [model/](model) contains the pretrained model and vocabulary. 21 | * [data/](data) contains raw and processed datasets that we include in this repository for testing. 22 | * [tests/](tests) contains unit tests for the codebase. 23 | 24 | To start out with, have a look inside the [examples/](examples) directory. See [score_texts_emojis.py](examples/score_texts_emojis.py) for how to use DeepMoji to extract emoji predictions, [encode_texts.py](examples/encode_texts.py) for how to convert text into 2304-dimensional emotional feature vectors, or [finetune_youtube_last.py](examples/finetune_youtube_last.py) for how to use the model for transfer learning on a new dataset. 25 | 26 | Please consider citing the [paper](https://arxiv.org/abs/1708.00524) of DeepMoji if you use the model or code (see below for citation). 27 | 28 | ## Installation 29 | 30 | We assume that you're using [Python 2.7-3.5](https://www.python.org/downloads/) with [pip](https://pip.pypa.io/en/stable/installing/) installed. 31 | 32 | First you need to install [pyTorch (version 0.2+)](http://pytorch.org/), currently by: 33 | ```bash 34 | conda install pytorch -c pytorch 35 | ``` 36 | At the present stage the model can't make efficient use of CUDA. See details in the [Hugging Face blog post](https://medium.com/huggingface/understanding-emotions-from-keras-to-pytorch-3ccb61d5a983). 37 | 38 | When pyTorch is installed, run the following in the root directory to install the remaining dependencies: 39 | 40 | ```bash 41 | pip install -e . 42 | ``` 43 | This will install the following dependencies: 44 | * [scikit-learn](https://github.com/scikit-learn/scikit-learn) 45 | * [text-unidecode](https://github.com/kmike/text-unidecode) 46 | * [emoji](https://github.com/carpedm20/emoji) 47 | 48 | Then, run the download script to download the pretrained torchMoji weights (~85MB) from [here](https://www.dropbox.com/s/q8lax9ary32c7t9/pytorch_model.bin?dl=0) and put them in the model/ directory: 49 | 50 | ```bash 51 | python scripts/download_weights.py 52 | ``` 53 | 54 | ## Testing 55 | To run the tests, install [nose](http://nose.readthedocs.io/en/latest/). After installing, navigate to the [tests/](tests) directory and run: 56 | 57 | ```bash 58 | cd tests 59 | nosetests -v 60 | ``` 61 | 62 | By default, this will also run finetuning tests. These tests train the model for one epoch and then check the resulting accuracy, which may take several minutes to finish. If you'd prefer to exclude those, run the following instead: 63 | 64 | ```bash 65 | cd tests 66 | nosetests -v -a '!slow' 67 | ``` 68 | 69 | ## Disclaimer 70 | This code has been tested to work with Python 2.7 and 3.5 on Ubuntu 16.04 and macOS Sierra machines. It has not been optimized for efficiency, but should be fast enough for most purposes. We do not give any guarantees that there are no bugs - use the code at your own risk! 71 | 72 | ## Contributions 73 | We welcome pull requests if you feel like something could be improved. You can also greatly help us by telling us how you felt when writing your most recent tweets. Just click [here](http://deepmoji.mit.edu/contribute/) to contribute. 74 | 75 | ## License 76 | This code and the pretrained model are licensed under the MIT license.
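## Minimal usage sketch

To tie the pieces above together, here is a minimal sketch of emoji scoring. It is a condensed version of [score_texts_emojis.py](examples/score_texts_emojis.py) and assumes the pretrained weights have already been fetched with `scripts/download_weights.py`:

```python
import json
import numpy as np

from torchmoji.sentence_tokenizer import SentenceTokenizer
from torchmoji.model_def import torchmoji_emojis
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

# Build a tokenizer from the shipped vocabulary, capped at 30 tokens per sentence.
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, 30)

# Load the pretrained model with the 64-way emoji softmax head.
model = torchmoji_emojis(PRETRAINED_PATH)

# Tokenize a sentence and read off one probability per emoji class.
tokenized, _, _ = st.tokenize_sentences(["I love mom's cooking"])
prob = model(tokenized)[0]

# Top-5 emoji ids (0-63); they index into emoji_overview.png and
# data/emoji_codes.json.
print(np.argsort(prob)[::-1][:5])
```

If the top ids look sensible for obviously emotional sentences, the weights and vocabulary are wired up correctly; from there you can switch to `torchmoji_feature_encoding` for feature vectors or the finetuning helpers for transfer learning.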
77 | 78 | ## Benchmark datasets 79 | The benchmark datasets are uploaded to this repository for convenience purposes only. They were not released by us and we do not claim any rights on them. Use the datasets at your responsibility and make sure you fulfill the licenses that they were released with. If you use any of the benchmark datasets please consider citing the original authors. 80 | 81 | ## Citation 82 | ``` 83 | @inproceedings{felbo2017, 84 | title={Using millions of emoji occurrences to learn any-domain representations for detecting sentiment, emotion and sarcasm}, 85 | author={Felbo, Bjarke and Mislove, Alan and S{\o}gaard, Anders and Rahwan, Iyad and Lehmann, Sune}, 86 | booktitle={Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 87 | year={2017} 88 | } 89 | ``` 90 | -------------------------------------------------------------------------------- /metric/torchMoji/data/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /metric/torchMoji/data/Olympic/raw.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andreamad8/PPCM/e5bef1bbb70907a3d65de3225a00e4af9104d4a8/metric/torchMoji/data/Olympic/raw.pickle -------------------------------------------------------------------------------- /metric/torchMoji/data/PsychExp/raw.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andreamad8/PPCM/e5bef1bbb70907a3d65de3225a00e4af9104d4a8/metric/torchMoji/data/PsychExp/raw.pickle -------------------------------------------------------------------------------- /metric/torchMoji/data/emoji_codes.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": ":joy:", 3 | "1": ":unamused:", 4 | "2": ":weary:", 5 | "3": ":sob:", 6 | "4": ":heart_eyes:", 7 | "5": ":pensive:", 8 | "6": ":ok_hand:", 9 | "7": ":blush:", 10 | "8": ":heart:", 11 | "9": ":smirk:", 12 | "10":":grin:", 13 | "11":":notes:", 14 | "12":":flushed:", 15 | "13":":100:", 16 | "14":":sleeping:", 17 | "15":":relieved:", 18 | "16":":relaxed:", 19 | "17":":raised_hands:", 20 | "18":":two_hearts:", 21 | "19":":expressionless:", 22 | "20":":sweat_smile:", 23 | "21":":pray:", 24 | "22":":confused:", 25 | "23":":kissing_heart:", 26 | "24":":hearts:", 27 | "25":":neutral_face:", 28 | "26":":information_desk_person:", 29 | "27":":disappointed:", 30 | "28":":see_no_evil:", 31 | "29":":tired_face:", 32 | "30":":v:", 33 | "31":":sunglasses:", 34 | "32":":rage:", 35 | "33":":thumbsup:", 36 | "34":":cry:", 37 | "35":":sleepy:", 38 | "36":":stuck_out_tongue_winking_eye:", 39 | "37":":triumph:", 40 | "38":":raised_hand:", 41 | "39":":mask:", 42 | "40":":clap:", 43 | "41":":eyes:", 44 | "42":":gun:", 45 | "43":":persevere:", 46 | "44":":imp:", 47 | "45":":sweat:", 48 | "46":":broken_heart:", 49 | "47":":blue_heart:", 50 | "48":":headphones:", 51 | "49":":speak_no_evil:", 52 | "50":":wink:", 53 | "51":":skull:", 54 | "52":":confounded:", 55 | "53":":smile:", 56 | "54":":stuck_out_tongue_winking_eye:", 57 | "55":":angry:", 58 | "56":":no_good:", 59 | "57":":muscle:", 60 | "58":":punch:", 61 | "59":":purple_heart:", 62 | "60":":sparkling_heart:", 63 | "61":":blue_heart:", 64 | "62":":grimacing:", 65 | "63":":sparkles:" 66 | } 67 | 68 | -------------------------------------------------------------------------------- 
/metric/torchMoji/emoji_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andreamad8/PPCM/e5bef1bbb70907a3d65de3225a00e4af9104d4a8/metric/torchMoji/emoji_overview.png -------------------------------------------------------------------------------- /metric/torchMoji/examples/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /metric/torchMoji/examples/README.md: -------------------------------------------------------------------------------- 1 | # torchMoji examples 2 | 3 | ## Initialization 4 | [create_twitter_vocab.py](create_twitter_vocab.py) 5 | Create a new vocabulary from a tsv file. 6 | 7 | [tokenize_dataset.py](tokenize_dataset.py) 8 | Tokenize a given dataset using the prebuilt vocabulary. 9 | 10 | [vocab_extension.py](vocab_extension.py) 11 | Extend the given vocabulary using dataset-specific words. 12 | 13 | [dataset_split.py](dataset_split.py) 14 | Split a given dataset into training, validation and testing. 15 | 16 | ## Use pretrained model/architecture 17 | [score_texts_emojis.py](score_texts_emojis.py) 18 | Use torchMoji to score texts for emoji distribution. 19 | 20 | [text_emojize.py](text_emojize.py) 21 | Use torchMoji to output emoji visualization from a single text input (mapped from `emoji_overview.png`) 22 | 23 | ```sh 24 | python examples/text_emojize.py --text "I love mom's cooking\!" 25 | # => I love mom's cooking! 😋 😍 💓 💛 ❤ 26 | ``` 27 | 28 | [encode_texts.py](encode_texts.py) 29 | Use torchMoji to encode the text into 2304-dimensional feature vectors for further modeling/analysis. 30 | 31 | ## Transfer learning 32 | [finetune_youtube_last.py](finetune_youtube_last.py) 33 | Finetune the model on the SS-Youtube dataset using the 'last' method. 34 | 35 | [finetune_insults_chain-thaw.py](finetune_insults_chain-thaw.py) 36 | Finetune the model on the Kaggle insults dataset (from blog post) using the 'chain-thaw' method. 37 | 38 | [finetune_semeval_class-avg_f1.py](finetune_semeval_class-avg_f1.py) 39 | Finetune the model on the SemEval emotion dataset using the 'full' method and evaluate using the class average F1 metric. 40 | -------------------------------------------------------------------------------- /metric/torchMoji/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andreamad8/PPCM/e5bef1bbb70907a3d65de3225a00e4af9104d4a8/metric/torchMoji/examples/__init__.py -------------------------------------------------------------------------------- /metric/torchMoji/examples/create_twitter_vocab.py: -------------------------------------------------------------------------------- 1 | """ Creates a vocabulary from a tsv file.
2 | """ 3 | 4 | import codecs 5 | import example_helper 6 | from torchmoji.create_vocab import VocabBuilder 7 | from torchmoji.word_generator import TweetWordGenerator 8 | 9 | with codecs.open('../../twitterdata/tweets.2016-09-01', 'rU', 'utf-8') as stream: 10 | wg = TweetWordGenerator(stream) 11 | vb = VocabBuilder(wg) 12 | vb.count_all_words() 13 | vb.save_vocab() 14 | -------------------------------------------------------------------------------- /metric/torchMoji/examples/dataset_split.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Split a given dataset into three different datasets: training, validation and 3 | testing. 4 | 5 | This is achieved by splitting the given list of sentences into three separate 6 | lists according to either a given ratio (e.g. [0.7, 0.1, 0.2]) or by an 7 | explicit enumeration. The sentences are also tokenised using the given 8 | vocabulary. 9 | 10 | Also splits a given list of dictionaries containing information about 11 | each sentence. 12 | 13 | An additional parameter can be set 'extend_with', which will extend the given 14 | vocabulary with up to 'extend_with' tokens, taken from the training dataset. 15 | ''' 16 | from __future__ import print_function, unicode_literals 17 | import example_helper 18 | import json 19 | 20 | from torchmoji.sentence_tokenizer import SentenceTokenizer 21 | 22 | DATASET = [ 23 | 'I am sentence 0', 24 | 'I am sentence 1', 25 | 'I am sentence 2', 26 | 'I am sentence 3', 27 | 'I am sentence 4', 28 | 'I am sentence 5', 29 | 'I am sentence 6', 30 | 'I am sentence 7', 31 | 'I am sentence 8', 32 | 'I am sentence 9 newword', 33 | ] 34 | 35 | INFO_DICTS = [ 36 | {'label': 'sentence 0'}, 37 | {'label': 'sentence 1'}, 38 | {'label': 'sentence 2'}, 39 | {'label': 'sentence 3'}, 40 | {'label': 'sentence 4'}, 41 | {'label': 'sentence 5'}, 42 | {'label': 'sentence 6'}, 43 | {'label': 'sentence 7'}, 44 | {'label': 'sentence 8'}, 45 | {'label': 'sentence 9'}, 46 | ] 47 | 48 | with open('../model/vocabulary.json', 'r') as f: 49 | vocab = json.load(f) 50 | st = SentenceTokenizer(vocab, 30) 51 | 52 | # Split using the default split ratio 53 | print(st.split_train_val_test(DATASET, INFO_DICTS)) 54 | 55 | # Split explicitly 56 | print(st.split_train_val_test(DATASET, 57 | INFO_DICTS, 58 | [[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]], 59 | extend_with=1)) 60 | -------------------------------------------------------------------------------- /metric/torchMoji/examples/encode_texts.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ Use torchMoji to encode texts into emotional feature vectors. 
4 | """ 5 | from __future__ import print_function, division, unicode_literals 6 | import json 7 | 8 | from torchmoji.sentence_tokenizer import SentenceTokenizer 9 | from torchmoji.model_def import torchmoji_feature_encoding 10 | from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH 11 | 12 | TEST_SENTENCES = ['I love mom\'s cooking', 13 | 'I love how you never reply back..', 14 | 'I love cruising with my homies', 15 | 'I love messing with yo mind!!', 16 | 'I love you and now you\'re just gone..', 17 | 'This is shit', 18 | 'This is the shit'] 19 | 20 | maxlen = 30 21 | batch_size = 32 22 | 23 | print('Tokenizing using dictionary from {}'.format(VOCAB_PATH)) 24 | with open(VOCAB_PATH, 'r') as f: 25 | vocabulary = json.load(f) 26 | st = SentenceTokenizer(vocabulary, maxlen) 27 | tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES) 28 | 29 | print('Loading model from {}.'.format(PRETRAINED_PATH)) 30 | model = torchmoji_feature_encoding(PRETRAINED_PATH) 31 | print(model) 32 | 33 | print('Encoding texts..') 34 | encoding = model(tokenized) 35 | 36 | print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0])) 37 | print(encoding[0,:5]) 38 | 39 | # Now you could visualize the encodings to see differences, 40 | # run a logistic regression classifier on top, 41 | # or basically anything you'd like to do. -------------------------------------------------------------------------------- /metric/torchMoji/examples/example_helper.py: -------------------------------------------------------------------------------- 1 | """ Module import helper. 2 | Modifies PATH in order to allow us to import the torchmoji directory. 3 | """ 4 | import sys 5 | from os.path import abspath, dirname 6 | sys.path.insert(0, dirname(dirname(abspath(__file__)))) 7 | -------------------------------------------------------------------------------- /metric/torchMoji/examples/finetune_insults_chain-thaw.py: -------------------------------------------------------------------------------- 1 | """Finetuning example. 2 | 3 | Trains the torchMoji model on the kaggle insults dataset, using the 'chain-thaw' 4 | finetuning method and the accuracy metric. See the blog post at 5 | https://medium.com/@bjarkefelbo/what-can-we-learn-from-emojis-6beb165a5ea0 6 | for more information. Note that results may differ a bit due to slight 7 | changes in preprocessing and train/val/test split. 8 | 9 | The 'chain-thaw' method does the following: 10 | 0) Load all weights except for the softmax layer. Extend the embedding layer if 11 | necessary, initialising the new weights with random values. 12 | 1) Freeze every layer except the last (softmax) layer and train it. 13 | 2) Freeze every layer except the first layer and train it. 14 | 3) Freeze every layer except the second etc., until the second last layer. 15 | 4) Unfreeze all layers and train entire model. 16 | """ 17 | 18 | from __future__ import print_function 19 | import example_helper 20 | import json 21 | from torchmoji.model_def import torchmoji_transfer 22 | from torchmoji.global_variables import PRETRAINED_PATH 23 | from torchmoji.finetuning import ( 24 | load_benchmark, 25 | finetune) 26 | 27 | 28 | DATASET_PATH = '../data/kaggle-insults/raw.pickle' 29 | nb_classes = 2 30 | 31 | with open('../model/vocabulary.json', 'r') as f: 32 | vocab = json.load(f) 33 | 34 | # Load dataset. Extend the existing vocabulary with up to 10000 tokens from 35 | # the training dataset. 36 | data = load_benchmark(DATASET_PATH, vocab, extend_with=10000) 37 | 38 | # Set up model and finetune. 
Note that we have to extend the embedding layer 39 | # with the number of tokens added to the vocabulary. 40 | model = torchmoji_transfer(nb_classes, PRETRAINED_PATH, extend_embedding=data['added']) 41 | print(model) 42 | model, acc = finetune(model, data['texts'], data['labels'], nb_classes, 43 | data['batch_size'], method='chain-thaw') 44 | print('Acc: {}'.format(acc)) 45 | -------------------------------------------------------------------------------- /metric/torchMoji/examples/finetune_semeval_class-avg_f1.py: -------------------------------------------------------------------------------- 1 | """Finetuning example. 2 | 3 | Trains the torchMoji model on the SemEval emotion dataset, using the 'last' 4 | finetuning method and the class average F1 metric. 5 | 6 | The 'last' method does the following: 7 | 0) Load all weights except for the softmax layer. Do not add tokens to the 8 | vocabulary and do not extend the embedding layer. 9 | 1) Freeze all layers except for the softmax layer. 10 | 2) Train. 11 | 12 | The class average F1 metric does the following: 13 | 1) For each class, relabel the dataset into binary classification 14 | (belongs to/does not belong to this class). 15 | 2) Calculate F1 score for each class. 16 | 3) Compute the average of all F1 scores. 17 | """ 18 | 19 | from __future__ import print_function 20 | import example_helper 21 | import json 22 | from torchmoji.finetuning import load_benchmark 23 | from torchmoji.class_avg_finetuning import class_avg_finetune 24 | from torchmoji.model_def import torchmoji_transfer 25 | from torchmoji.global_variables import PRETRAINED_PATH 26 | 27 | DATASET_PATH = '../data/SE0714/raw.pickle' 28 | nb_classes = 3 29 | 30 | with open('../model/vocabulary.json', 'r') as f: 31 | vocab = json.load(f) 32 | 33 | 34 | # Load dataset. Extend the existing vocabulary with up to 10000 tokens from 35 | # the training dataset. 36 | data = load_benchmark(DATASET_PATH, vocab, extend_with=10000) 37 | 38 | # Set up model and finetune. Note that we have to extend the embedding layer 39 | # with the number of tokens added to the vocabulary. 40 | # 41 | # Also note that when using class average F1 to evaluate, the model has to be 42 | # defined with two classes, since the model will be trained for each class 43 | # separately. 44 | model = torchmoji_transfer(2, PRETRAINED_PATH, extend_embedding=data['added']) 45 | print(model) 46 | 47 | # For finetuning however, pass in the actual number of classes. 48 | model, f1 = class_avg_finetune(model, data['texts'], data['labels'], 49 | nb_classes, data['batch_size'], method='last') 50 | print('F1: {}'.format(f1)) 51 | -------------------------------------------------------------------------------- /metric/torchMoji/examples/finetune_youtube_last.py: -------------------------------------------------------------------------------- 1 | """Finetuning example. 2 | 3 | Trains the torchMoji model on the SS-Youtube dataset, using the 'last' 4 | finetuning method and the accuracy metric. 5 | 6 | The 'last' method does the following: 7 | 0) Load all weights except for the softmax layer. Do not add tokens to the 8 | vocabulary and do not extend the embedding layer. 9 | 1) Freeze all layers except for the softmax layer. 10 | 2) Train. 
11 | """ 12 | 13 | from __future__ import print_function 14 | import example_helper 15 | import json 16 | from torchmoji.model_def import torchmoji_transfer 17 | from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH, ROOT_PATH 18 | from torchmoji.finetuning import ( 19 | load_benchmark, 20 | finetune) 21 | 22 | DATASET_PATH = '{}/data/SS-Youtube/raw.pickle'.format(ROOT_PATH) 23 | nb_classes = 2 24 | 25 | with open(VOCAB_PATH, 'r') as f: 26 | vocab = json.load(f) 27 | 28 | # Load dataset. 29 | data = load_benchmark(DATASET_PATH, vocab) 30 | 31 | # Set up model and finetune 32 | model = torchmoji_transfer(nb_classes, PRETRAINED_PATH) 33 | print(model) 34 | model, acc = finetune(model, data['texts'], data['labels'], nb_classes, data['batch_size'], method='last') 35 | print('Acc: {}'.format(acc)) 36 | -------------------------------------------------------------------------------- /metric/torchMoji/examples/score_texts_emojis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ Use torchMoji to score texts for emoji distribution. 4 | 5 | The resulting emoji ids (0-63) correspond to the mapping 6 | in emoji_overview.png file at the root of the torchMoji repo. 7 | 8 | Writes the result to a csv file. 9 | """ 10 | from __future__ import print_function, division, unicode_literals 11 | # import example_helper 12 | import json 13 | import csv 14 | import numpy as np 15 | from collections import Counter 16 | 17 | from torchmoji.sentence_tokenizer import SentenceTokenizer 18 | from torchmoji.model_def import torchmoji_emojis 19 | from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH 20 | 21 | # OUTPUT_PATH = 'test_sentences.csv' 22 | 23 | # TEST_SENTENCES = ['I love mom\'s cooking', 24 | # 'I love how you never reply back..', 25 | # 'I love cruising with my homies', 26 | # 'I love messing with yo mind!!', 27 | # 'I love you and now you\'re just gone..', 28 | # 'This is shit', 29 | # 'This is the shit'] 30 | 31 | 32 | def top_elements(array, k): 33 | ind = np.argpartition(array, -k)[-k:] 34 | return ind[np.argsort(array[ind])][::-1] 35 | 36 | def most_frequent(List): 37 | return max(set(List), key = List.count) 38 | 39 | with open(VOCAB_PATH, 'r') as f: 40 | vocabulary = json.load(f) 41 | 42 | maxlen = 30 43 | st = SentenceTokenizer(vocabulary, maxlen) 44 | 45 | model = torchmoji_emojis(PRETRAINED_PATH) 46 | 47 | def get_emoji_score(sentence, label): 48 | mapper = {"angry":anger, "disgusted":disgust, "terrified":fear, "joyful":joy, "sad":sad, "surprised":surprise} 49 | tokenized, _, _ = st.tokenize_sentences(sentence) 50 | prob = model(tokenized) 51 | scores = [] 52 | acc = [] 53 | for i, t in enumerate(sentence): 54 | t_prob = prob[i] 55 | ind_top = top_elements(t_prob, 2) 56 | if(emoji_list[ind_top[0]][1] in mapper[label] or emoji_list[ind_top[1]][1] in mapper[label]): 57 | acc.append(1) 58 | else: 59 | acc.append(0) 60 | 61 | scores.append([emoji_list[i][1].upper() for i in ind_top]) 62 | 63 | counter = Counter(sum(scores,[])) 64 | 65 | return "".join([ f'({c})' + r'{\NotoEmoji\symbol{"' +str(w)+'}}' for w, c in counter.most_common(4)]),np.mean(acc) 66 | 67 | 68 | joy = ["1f602", "1f604","1f60a","1f60b", "1f60c", "1f60d", "1f60e", "1f60f", "263a", "1f618", 69 | "1f61c","2764", "1f496", "1f495", "1f601","2665","270c","2661","1f3a7","1f49c","1f496","1f499"] 70 | sad = ["1f614", "1f615", "1f62b", "1f629", "1f622", 71 | "1f62a", "1f62d", "1f494"] 72 | anger= ["1f62c", "1f620", "1f610","1f611", "1f621", 
"1f616", "1f624"] 73 | disgust = ["1f637"] 74 | fear = ["1f605"] 75 | surprise = ["1f633"] 76 | 77 | 78 | emoji_list = [["\U0001f602","1f602"], 79 | ["\U0001f612","1f612"], 80 | ["\U0001f629","1f629"], 81 | ["\U0001f62d","1f62d"], 82 | ["\U0001f60d","1f60d"], 83 | ["\U0001f614","1f614"], 84 | ["\U0001f44c","1f44c"], 85 | ["\U0001f60a","1f60a"], 86 | ["\u2764","2764"], 87 | ["\U0001f60f","1f60f"], 88 | ["\U0001f601","1f601"], 89 | ["\U0001f3b6","1f3b6"], 90 | ["\U0001f633","1f633"], 91 | ["\U0001f4af","1f4af"], 92 | ["\U0001f634","1f634"], 93 | ["\U0001f60c","1f60c"], 94 | ["\u263a","263a"], 95 | ["\U0001f64c","1f64c"], 96 | ["\U0001f495","1f495"], 97 | ["\U0001f611","1f611"], 98 | ["\U0001f605","1f605"], 99 | ["\U0001f64f","1f64f"], 100 | ["\U0001f615","1f615"], 101 | ["\U0001f618","1f618"], 102 | ["\u2665","2665"], 103 | ["\U0001f610","1f610"], 104 | ["\U0001f481","1f481"], 105 | ["\U0001f61e","1f61e"], 106 | ["\U0001f648","1f648"], 107 | ["\U0001f62b","1f62b"], 108 | ["\u270c","270c"], 109 | ["\U0001f60e","1f60e"], 110 | ["\U0001f621","1f621"], 111 | ["\U0001f44d","1f44d"], 112 | ["\U0001f622","1f622"], 113 | ["\U0001f62a","1f62a"], 114 | ["\U0001f60b","1f60b"], 115 | ["\U0001f624","1f624"], 116 | ["\u270b","270b"], 117 | ["\U0001f637","1f637"], 118 | ["\U0001f44f","1f44f"], 119 | ["\U0001f440","1f440"], 120 | ["\U0001f52b","1f52b"], 121 | ["\U0001f623","1f623"], 122 | ["\U0001f608","1f608"], 123 | ["\U0001f613","1f613"], 124 | ["\U0001f494","1f494"], 125 | ["\u2661","2661"], 126 | ["\U0001f3a7","1f3a7"], 127 | ["\U0001f64a","1f64a"], 128 | ["\U0001f609","1f609"], 129 | ["\U0001f480","1f480"], 130 | ["\U0001f616","1f616"], 131 | ["\U0001f604","1f604"], 132 | ["\U0001f61c","1f61c"], 133 | ["\U0001f620","1f620"], 134 | ["\U0001f645","1f645"], 135 | ["\U0001f4aa","1f4aa"], 136 | ["\U0001f44a","1f44a"], 137 | ["\U0001f49c","1f49c"], 138 | ["\U0001f496","1f496"], 139 | ["\U0001f499","1f499"], 140 | ["\U0001f62c","1f62c"], 141 | ["\u2728","2728"]] -------------------------------------------------------------------------------- /metric/torchMoji/examples/text_emojize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ Use torchMoji to predict emojis from a single text input 4 | """ 5 | 6 | from __future__ import print_function, division, unicode_literals 7 | import example_helper 8 | import json 9 | import csv 10 | import argparse 11 | 12 | import numpy as np 13 | import emoji 14 | 15 | from torchmoji.sentence_tokenizer import SentenceTokenizer 16 | from torchmoji.model_def import torchmoji_emojis 17 | from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH 18 | 19 | # Emoji map in emoji_overview.png 20 | EMOJIS = ":joy: :unamused: :weary: :sob: :heart_eyes: \ 21 | :pensive: :ok_hand: :blush: :heart: :smirk: \ 22 | :grin: :notes: :flushed: :100: :sleeping: \ 23 | :relieved: :relaxed: :raised_hands: :two_hearts: :expressionless: \ 24 | :sweat_smile: :pray: :confused: :kissing_heart: :heartbeat: \ 25 | :neutral_face: :information_desk_person: :disappointed: :see_no_evil: :tired_face: \ 26 | :v: :sunglasses: :rage: :thumbsup: :cry: \ 27 | :sleepy: :yum: :triumph: :hand: :mask: \ 28 | :clap: :eyes: :gun: :persevere: :smiling_imp: \ 29 | :sweat: :broken_heart: :yellow_heart: :musical_note: :speak_no_evil: \ 30 | :wink: :skull: :confounded: :smile: :stuck_out_tongue_winking_eye: \ 31 | :angry: :no_good: :muscle: :facepunch: :purple_heart: \ 32 | :sparkling_heart: :blue_heart: :grimacing: :sparkles:".split(' ') 33 | 34 
| def top_elements(array, k): 35 | ind = np.argpartition(array, -k)[-k:] 36 | return ind[np.argsort(array[ind])][::-1] 37 | 38 | if __name__ == "__main__": 39 | argparser = argparse.ArgumentParser() 40 | argparser.add_argument('--text', type=str, required=True, help="Input text to emojize") 41 | argparser.add_argument('--maxlen', type=int, default=30, help="Max length of input text") 42 | args = argparser.parse_args() 43 | 44 | # Tokenizing using dictionary 45 | with open(VOCAB_PATH, 'r') as f: 46 | vocabulary = json.load(f) 47 | 48 | st = SentenceTokenizer(vocabulary, args.maxlen) 49 | 50 | # Loading model 51 | model = torchmoji_emojis(PRETRAINED_PATH) 52 | # Running predictions 53 | tokenized, _, _ = st.tokenize_sentences([args.text]) 54 | # Get sentence probability 55 | prob = model(tokenized)[0] 56 | 57 | # Top emoji id 58 | emoji_ids = top_elements(prob, 5) 59 | 60 | # map to emojis 61 | emojis = map(lambda x: EMOJIS[x], emoji_ids) 62 | 63 | print(emoji.emojize("{} {}".format(args.text,' '.join(emojis)), use_aliases=True)) 64 | -------------------------------------------------------------------------------- /metric/torchMoji/examples/tokenize_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Take a given list of sentences and turn it into a numpy array, where each 3 | number corresponds to a word. Padding is used (number 0) to ensure fixed length 4 | of sentences. 5 | """ 6 | 7 | from __future__ import print_function, unicode_literals 8 | import example_helper 9 | import json 10 | from torchmoji.sentence_tokenizer import SentenceTokenizer 11 | 12 | with open('../model/vocabulary.json', 'r') as f: 13 | vocabulary = json.load(f) 14 | 15 | st = SentenceTokenizer(vocabulary, 30) 16 | test_sentences = [ 17 | '\u2014 -- \u203c !!\U0001F602', 18 | 'Hello world!', 19 | 'This is a sample tweet #example', 20 | ] 21 | 22 | tokens, infos, stats = st.tokenize_sentences(test_sentences) 23 | 24 | print(tokens) 25 | print(infos) 26 | print(stats) 27 | -------------------------------------------------------------------------------- /metric/torchMoji/examples/vocab_extension.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extend the given vocabulary using dataset-specific words. 3 | 4 | 1. First create a vocabulary for the specific dataset. 5 | 2. Find all words not in our vocabulary, but in the dataset vocabulary. 6 | 3. Take top X (default=1000) of these words and add them to the vocabulary. 7 | 4. Save this combined vocabulary and embedding matrix, which can now be used. 
8 | """ 9 | 10 | from __future__ import print_function, unicode_literals 11 | import example_helper 12 | import json 13 | from torchmoji.create_vocab import extend_vocab, VocabBuilder 14 | from torchmoji.word_generator import WordGenerator 15 | 16 | new_words = ['#zzzzaaazzz', 'newword', 'newword'] 17 | word_gen = WordGenerator(new_words) 18 | vb = VocabBuilder(word_gen) 19 | vb.count_all_words() 20 | 21 | with open('../model/vocabulary.json') as f: 22 | vocab = json.load(f) 23 | 24 | print(len(vocab)) 25 | print(vb.word_counts) 26 | extend_vocab(vocab, vb, max_tokens=1) 27 | 28 | # 'newword' should be added because it's the more frequent token in the new dataset 29 | print(vocab['newword']) 30 | print(len(vocab)) 31 | -------------------------------------------------------------------------------- /metric/torchMoji/model/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /metric/torchMoji/scripts/analyze_all_results.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | # allow us to import the codebase directory 4 | import sys 5 | import glob 6 | import numpy as np 7 | from os.path import dirname, abspath 8 | sys.path.insert(0, dirname(dirname(abspath(__file__)))) 9 | 10 | DATASETS = ['SE0714', 'Olympic', 'PsychExp', 'SS-Twitter', 'SS-Youtube', 11 | 'SCv1', 'SCv2-GEN'] # 'SE1604' excluded due to Twitter's ToS 12 | 13 | def get_results(dset): 14 | METHOD = 'last' 15 | RESULTS_DIR = 'results/' 16 | RESULT_PATHS = glob.glob('{}/{}_{}_*_results.txt'.format(RESULTS_DIR, dset, METHOD)) 17 | assert len(RESULT_PATHS) 18 | 19 | scores = [] 20 | for path in RESULT_PATHS: 21 | with open(path) as f: 22 | score = f.readline().split(':')[1] 23 | scores.append(float(score)) 24 | 25 | average = np.mean(scores) 26 | maximum = max(scores) 27 | minimum = min(scores) 28 | std = np.std(scores) 29 | 30 | print('Dataset: {}'.format(dset)) 31 | print('Method: {}'.format(METHOD)) 32 | print('Number of results: {}'.format(len(scores))) 33 | print('--------------------------') 34 | print('Average: {}'.format(average)) 35 | print('Maximum: {}'.format(maximum)) 36 | print('Minimum: {}'.format(minimum)) 37 | print('Standard deviation: {}'.format(std)) 38 | 39 | for dset in DATASETS: 40 | get_results(dset) 41 | -------------------------------------------------------------------------------- /metric/torchMoji/scripts/analyze_results.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import sys 4 | import glob 5 | import numpy as np 6 | 7 | DATASET = 'SS-Twitter' # 'SE1604' excluded due to Twitter's ToS 8 | METHOD = 'new' 9 | 10 | # Optional usage: analyze_results.py <dataset> <method> 11 | if len(sys.argv) == 3: 12 | DATASET = sys.argv[1] 13 | METHOD = sys.argv[2] 14 | 15 | RESULTS_DIR = 'results/' 16 | RESULT_PATHS = glob.glob('{}/{}_{}_*_results.txt'.format(RESULTS_DIR, DATASET, METHOD)) 17 | 18 | if not RESULT_PATHS: 19 | print('Could not find results for \'{}\' using \'{}\' in directory \'{}\'.'.format(DATASET, METHOD, RESULTS_DIR)) 20 | else: 21 | scores = [] 22 | for path in RESULT_PATHS: 23 | with open(path) as f: 24 | score = f.readline().split(':')[1] 25 | scores.append(float(score)) 26 | 27 | average = np.mean(scores) 28 | maximum = max(scores) 29 | minimum = min(scores) 30 | std = np.std(scores) 31 | 32 | print('Dataset: {}'.format(DATASET)) 33 | 
print('Method: {}'.format(METHOD)) 34 | print('Number of results: {}'.format(len(scores))) 35 | print('--------------------------') 36 | print('Average: {}'.format(average)) 37 | print('Maximum: {}'.format(maximum)) 38 | print('Minimum: {}'.format(minimum)) 39 | print('Standard deviation: {}'.format(std)) 40 | -------------------------------------------------------------------------------- /metric/torchMoji/scripts/calculate_coverages.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import pickle 3 | import json 4 | import csv 5 | import sys 6 | from io import open 7 | 8 | # Allow us to import the torchmoji directory 9 | from os.path import dirname, abspath 10 | sys.path.insert(0, dirname(dirname(abspath(__file__)))) 11 | 12 | from torchmoji.sentence_tokenizer import SentenceTokenizer, coverage 13 | 14 | try: 15 | unicode # Python 2 16 | except NameError: 17 | unicode = str # Python 3 18 | 19 | IS_PYTHON2 = int(sys.version[0]) == 2 20 | 21 | OUTPUT_PATH = 'coverage.csv' 22 | DATASET_PATHS = [ 23 | '../data/Olympic/raw.pickle', 24 | '../data/PsychExp/raw.pickle', 25 | '../data/SCv1/raw.pickle', 26 | '../data/SCv2-GEN/raw.pickle', 27 | '../data/SE0714/raw.pickle', 28 | #'../data/SE1604/raw.pickle', # Excluded due to Twitter's ToS 29 | '../data/SS-Twitter/raw.pickle', 30 | '../data/SS-Youtube/raw.pickle', 31 | ] 32 | 33 | with open('../model/vocabulary.json', 'r') as f: 34 | vocab = json.load(f) 35 | 36 | results = [] 37 | for p in DATASET_PATHS: 38 | coverage_result = [p] 39 | print('Calculating coverage for {}'.format(p)) 40 | with open(p, 'rb') as f: 41 | if IS_PYTHON2: 42 | s = pickle.load(f) 43 | else: 44 | s = pickle.load(f, fix_imports=True) 45 | 46 | # Decode data 47 | try: 48 | s['texts'] = [unicode(x) for x in s['texts']] 49 | except UnicodeDecodeError: 50 | s['texts'] = [x.decode('utf-8') for x in s['texts']] 51 | 52 | # Own 53 | st = SentenceTokenizer({}, 30) 54 | tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'], 55 | [s['train_ind'], 56 | s['val_ind'], 57 | s['test_ind']], 58 | extend_with=10000) 59 | coverage_result.append(coverage(tests[2])) 60 | 61 | # Last 62 | st = SentenceTokenizer(vocab, 30) 63 | tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'], 64 | [s['train_ind'], 65 | s['val_ind'], 66 | s['test_ind']], 67 | extend_with=0) 68 | coverage_result.append(coverage(tests[2])) 69 | 70 | # Full 71 | st = SentenceTokenizer(vocab, 30) 72 | tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'], 73 | [s['train_ind'], 74 | s['val_ind'], 75 | s['test_ind']], 76 | extend_with=10000) 77 | coverage_result.append(coverage(tests[2])) 78 | 79 | results.append(coverage_result) 80 | 81 | with open(OUTPUT_PATH, 'wb' if IS_PYTHON2 else 'w') as csvfile: # the csv module needs bytes on Python 2 but text on Python 3 82 | writer = csv.writer(csvfile, delimiter='\t', lineterminator='\n') 83 | writer.writerow(['Dataset', 'Own', 'Last', 'Full']) 84 | for i, row in enumerate(results): 85 | try: 86 | writer.writerow(row) 87 | except Exception: 88 | print("Exception at row {}!".format(i)) 89 | 90 | print('Saved to {}'.format(OUTPUT_PATH)) 91 | -------------------------------------------------------------------------------- /metric/torchMoji/scripts/convert_all_datasets.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import json 4 | import math 5 | import pickle 6 | import sys 7 | from io import open 8 | import numpy as np 9 | from os.path import abspath, dirname 10 | sys.path.insert(0, 
dirname(dirname(abspath(__file__)))) 11 | 12 | from torchmoji.word_generator import WordGenerator 13 | from torchmoji.create_vocab import VocabBuilder 14 | from torchmoji.sentence_tokenizer import SentenceTokenizer, extend_vocab, coverage 15 | from torchmoji.tokenizer import tokenize 16 | 17 | try: 18 | unicode # Python 2 19 | except NameError: 20 | unicode = str # Python 3 21 | 22 | IS_PYTHON2 = int(sys.version[0]) == 2 23 | 24 | DATASETS = [ 25 | 'Olympic', 26 | 'PsychExp', 27 | 'SCv1', 28 | 'SCv2-GEN', 29 | 'SE0714', 30 | #'SE1604', # Excluded due to Twitter's ToS 31 | 'SS-Twitter', 32 | 'SS-Youtube', 33 | ] 34 | 35 | DIR = '../data' 36 | FILENAME_RAW = 'raw.pickle' 37 | FILENAME_OWN = 'own_vocab.pickle' 38 | FILENAME_OUR = 'twitter_vocab.pickle' 39 | FILENAME_COMBINED = 'combined_vocab.pickle' 40 | 41 | 42 | def roundup(x): 43 | return int(math.ceil(x / 10.0)) * 10 44 | 45 | 46 | def format_pickle(dset, train_texts, val_texts, test_texts, train_labels, val_labels, test_labels): 47 | return {'dataset': dset, 48 | 'train_texts': train_texts, 49 | 'val_texts': val_texts, 50 | 'test_texts': test_texts, 51 | 'train_labels': train_labels, 52 | 'val_labels': val_labels, 53 | 'test_labels': test_labels} 54 | 55 | def convert_dataset(filepath, extend_with, vocab): 56 | print('-- Generating {} '.format(filepath)) 57 | sys.stdout.flush() 58 | st = SentenceTokenizer(vocab, maxlen) 59 | tokenized, dicts, _ = st.split_train_val_test(texts, 60 | labels, 61 | [data['train_ind'], 62 | data['val_ind'], 63 | data['test_ind']], 64 | extend_with=extend_with) 65 | pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2], 66 | dicts[0], dicts[1], dicts[2]) 67 | with open(filepath, 'w') as f: 68 | pickle.dump(pick, f) 69 | cover = coverage(tokenized[2]) 70 | 71 | print(' done. 
Coverage: {}'.format(cover)) 72 | 73 | with open('../model/vocabulary.json', 'r') as f: 74 | vocab = json.load(f) 75 | 76 | for dset in DATASETS: 77 | print('Converting {}'.format(dset)) 78 | 79 | PATH_RAW = '{}/{}/{}'.format(DIR, dset, FILENAME_RAW) 80 | PATH_OWN = '{}/{}/{}'.format(DIR, dset, FILENAME_OWN) 81 | PATH_OUR = '{}/{}/{}'.format(DIR, dset, FILENAME_OUR) 82 | PATH_COMBINED = '{}/{}/{}'.format(DIR, dset, FILENAME_COMBINED) 83 | 84 | with open(PATH_RAW, 'rb') as dataset: 85 | if IS_PYTHON2: 86 | data = pickle.load(dataset) 87 | else: 88 | data = pickle.load(dataset, fix_imports=True) 89 | 90 | # Decode data 91 | try: 92 | texts = [unicode(x) for x in data['texts']] 93 | except UnicodeDecodeError: 94 | texts = [x.decode('utf-8') for x in data['texts']] 95 | 96 | wg = WordGenerator(texts) 97 | vb = VocabBuilder(wg) 98 | vb.count_all_words() 99 | 100 | # Calculate max length of sequences considered 101 | # Adjust batch_size accordingly to prevent GPU overflow 102 | lengths = [len(tokenize(t)) for t in texts] 103 | maxlen = roundup(np.percentile(lengths, 80.0)) 104 | 105 | # Extract labels 106 | labels = [x['label'] for x in data['info']] 107 | 108 | convert_dataset(PATH_OWN, 50000, {}) 109 | convert_dataset(PATH_OUR, 0, vocab) 110 | convert_dataset(PATH_COMBINED, 10000, vocab) 111 | -------------------------------------------------------------------------------- /metric/torchMoji/scripts/download_weights.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | from subprocess import call 4 | from builtins import input 5 | 6 | curr_folder = os.path.basename(os.path.normpath(os.getcwd())) 7 | 8 | weights_filename = 'pytorch_model.bin' 9 | weights_folder = 'model' 10 | weights_path = '{}/{}'.format(weights_folder, weights_filename) 11 | if curr_folder == 'scripts': 12 | weights_path = '../' + weights_path 13 | weights_download_link = 'https://www.dropbox.com/s/q8lax9ary32c7t9/pytorch_model.bin?dl=0#' 14 | 15 | 16 | MB_FACTOR = float(1<<20) 17 | 18 | def prompt(): 19 | while True: 20 | valid = { 21 | 'y': True, 22 | 'ye': True, 23 | 'yes': True, 24 | 'n': False, 25 | 'no': False, 26 | } 27 | choice = input().lower() 28 | if choice in valid: 29 | return valid[choice] 30 | else: 31 | print('Please respond with \'y\' or \'n\' (or \'yes\' or \'no\')') 32 | 33 | download = True 34 | if os.path.exists(weights_path): 35 | print('Weight file already exists at {}. Would you like to redownload it anyway? [y/n]'.format(weights_path)) 36 | download = prompt() 37 | already_exists = True 38 | else: 39 | already_exists = False 40 | 41 | if download: 42 | print('About to download the pretrained weights file from {}'.format(weights_download_link)) 43 | if already_exists == False: 44 | print('The size of the file is roughly 85MB. Continue? [y/n]') 45 | else: 46 | os.unlink(weights_path) 47 | 48 | if already_exists or prompt(): 49 | print('Downloading...') 50 | 51 | #urllib.urlretrieve(weights_download_link, weights_path) 52 | #with open(weights_path,'wb') as f: 53 | # f.write(requests.get(weights_download_link).content) 54 | 55 | # downloading using wget due to issues with urlretrieve and requests 56 | sys_call = 'wget {} -O {}'.format(weights_download_link, os.path.abspath(weights_path)) 57 | print("Running system call: {}".format(sys_call)) 58 | call(sys_call, shell=True) 59 | 60 | if os.path.getsize(weights_path) / MB_FACTOR < 80: 61 | raise ValueError("Download finished, but the resulting file is too small! 
" + 62 | "It\'s only {} bytes.".format(os.path.getsize(weights_path))) 63 | print('Downloaded weights to {}'.format(weights_path)) 64 | else: 65 | print('Exiting.') 66 | -------------------------------------------------------------------------------- /metric/torchMoji/scripts/finetune_dataset.py: -------------------------------------------------------------------------------- 1 | """ Finetuning example. 2 | """ 3 | from __future__ import print_function 4 | import sys 5 | import numpy as np 6 | from os.path import abspath, dirname 7 | sys.path.insert(0, dirname(dirname(abspath(__file__)))) 8 | 9 | import json 10 | import math 11 | from torchmoji.model_def import torchmoji_transfer 12 | from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH 13 | from torchmoji.finetuning import ( 14 | load_benchmark, 15 | finetune) 16 | from torchmoji.class_avg_finetuning import class_avg_finetune 17 | 18 | def roundup(x): 19 | return int(math.ceil(x / 10.0)) * 10 20 | 21 | 22 | # Format: (dataset_name, 23 | # path_to_dataset, 24 | # nb_classes, 25 | # use_f1_score) 26 | DATASETS = [ 27 | #('SE0714', '../data/SE0714/raw.pickle', 3, True), 28 | #('Olympic', '../data/Olympic/raw.pickle', 4, True), 29 | #('PsychExp', '../data/PsychExp/raw.pickle', 7, True), 30 | #('SS-Twitter', '../data/SS-Twitter/raw.pickle', 2, False), 31 | ('SS-Youtube', '../data/SS-Youtube/raw.pickle', 2, False), 32 | #('SE1604', '../data/SE1604/raw.pickle', 3, False), # Excluded due to Twitter's ToS 33 | #('SCv1', '../data/SCv1/raw.pickle', 2, True), 34 | #('SCv2-GEN', '../data/SCv2-GEN/raw.pickle', 2, True) 35 | ] 36 | 37 | RESULTS_DIR = 'results' 38 | 39 | # 'new' | 'last' | 'full' | 'chain-thaw' 40 | FINETUNE_METHOD = 'last' 41 | VERBOSE = 1 42 | 43 | nb_tokens = 50000 44 | nb_epochs = 1000 45 | epoch_size = 1000 46 | 47 | with open(VOCAB_PATH, 'r') as f: 48 | vocab = json.load(f) 49 | 50 | for rerun_iter in range(5): 51 | for p in DATASETS: 52 | 53 | # debugging 54 | assert len(vocab) == nb_tokens 55 | 56 | dset = p[0] 57 | path = p[1] 58 | nb_classes = p[2] 59 | use_f1_score = p[3] 60 | 61 | if FINETUNE_METHOD == 'last': 62 | extend_with = 0 63 | elif FINETUNE_METHOD in ['new', 'full', 'chain-thaw']: 64 | extend_with = 10000 65 | else: 66 | raise ValueError('Finetuning method not recognised!') 67 | 68 | # Load dataset. 69 | data = load_benchmark(path, vocab, extend_with=extend_with) 70 | 71 | (X_train, y_train) = (data['texts'][0], data['labels'][0]) 72 | (X_val, y_val) = (data['texts'][1], data['labels'][1]) 73 | (X_test, y_test) = (data['texts'][2], data['labels'][2]) 74 | 75 | weight_path = PRETRAINED_PATH if FINETUNE_METHOD != 'new' else None 76 | nb_model_classes = 2 if use_f1_score else nb_classes 77 | model = torchmoji_transfer( 78 | nb_model_classes, 79 | weight_path, 80 | extend_embedding=data['added']) 81 | print(model) 82 | 83 | # Training 84 | print('Training: {}'.format(path)) 85 | if use_f1_score: 86 | model, result = class_avg_finetune(model, data['texts'], 87 | data['labels'], 88 | nb_classes, data['batch_size'], 89 | FINETUNE_METHOD, 90 | verbose=VERBOSE) 91 | else: 92 | model, result = finetune(model, data['texts'], data['labels'], 93 | nb_classes, data['batch_size'], 94 | FINETUNE_METHOD, metric='acc', 95 | verbose=VERBOSE) 96 | 97 | # Write results 98 | if use_f1_score: 99 | print('Overall F1 score (dset = {}): {}'.format(dset, result)) 100 | with open('{}/{}_{}_{}_results.txt'. 
101 | format(RESULTS_DIR, dset, FINETUNE_METHOD, rerun_iter), 102 | "w") as f: 103 | f.write("F1: {}\n".format(result)) 104 | else: 105 | print('Test accuracy (dset = {}): {}'.format(dset, result)) 106 | with open('{}/{}_{}_{}_results.txt'. 107 | format(RESULTS_DIR, dset, FINETUNE_METHOD, rerun_iter), 108 | "w") as f: 109 | f.write("Acc: {}\n".format(result)) 110 | -------------------------------------------------------------------------------- /metric/torchMoji/scripts/results/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /metric/torchMoji/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='torchmoji', 5 | version='1.0', 6 | packages=['torchmoji'], 7 | description='torchMoji', 8 | include_package_data=True, 9 | install_requires=[ 10 | 'emoji==0.4.5', 11 | 'numpy==1.13.1', 12 | 'scipy==0.19.1', 13 | 'scikit-learn==0.19.0', 14 | 'text-unidecode==1.0', 15 | ], 16 | ) 17 | -------------------------------------------------------------------------------- /metric/torchMoji/tests/test_finetuning.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, print_function, division, unicode_literals 2 | 3 | import test_helper 4 | 5 | from nose.plugins.attrib import attr 6 | import json 7 | import numpy as np 8 | 9 | from torchmoji.class_avg_finetuning import relabel 10 | from torchmoji.sentence_tokenizer import SentenceTokenizer 11 | 12 | from torchmoji.finetuning import ( 13 | calculate_batchsize_maxlen, 14 | freeze_layers, 15 | change_trainable, 16 | finetune, 17 | load_benchmark 18 | ) 19 | from torchmoji.model_def import ( 20 | torchmoji_transfer, 21 | torchmoji_feature_encoding, 22 | torchmoji_emojis 23 | ) 24 | from torchmoji.global_variables import ( 25 | PRETRAINED_PATH, 26 | NB_TOKENS, 27 | VOCAB_PATH, 28 | ROOT_PATH 29 | ) 30 | 31 | 32 | def test_calculate_batchsize_maxlen(): 33 | """ Batch size and max length are calculated properly. 34 | """ 35 | texts = ['a b c d', 36 | 'e f g h i'] 37 | batch_size, maxlen = calculate_batchsize_maxlen(texts) 38 | 39 | assert batch_size == 250 40 | assert maxlen == 10, maxlen 41 | 42 | 43 | def test_freeze_layers(): 44 | """ Correct layers are frozen. 45 | """ 46 | model = torchmoji_transfer(5) 47 | keyword = 'output_layer' 48 | 49 | model = freeze_layers(model, unfrozen_keyword=keyword) 50 | 51 | for name, module in model.named_children(): 52 | trainable = keyword.lower() in name.lower() 53 | assert all(p.requires_grad == trainable for p in module.parameters()) 54 | 55 | 56 | def test_change_trainable(): 57 | """ change_trainable() changes trainability of layers. 58 | """ 59 | model = torchmoji_transfer(5) 60 | change_trainable(model.embed, False) 61 | assert not any(p.requires_grad for p in model.embed.parameters()) 62 | change_trainable(model.embed, True) 63 | assert all(p.requires_grad for p in model.embed.parameters()) 64 | 65 | 66 | def test_torchmoji_transfer_extend_embedding(): 67 | """ Defining torchmoji with extension. 
68 | """ 69 | extend_with = 50 70 | model = torchmoji_transfer(5, weight_path=PRETRAINED_PATH, 71 | extend_embedding=extend_with) 72 | embedding_layer = model.embed 73 | assert embedding_layer.weight.size()[0] == NB_TOKENS + extend_with 74 | 75 | 76 | def test_torchmoji_return_attention(): 77 | seq_tensor = np.array([[1]]) 78 | # test the output of the normal model 79 | model = torchmoji_emojis(weight_path=PRETRAINED_PATH) 80 | # check correct number of outputs 81 | assert len(model(seq_tensor)) == 1 82 | # repeat above described tests when returning attention weights 83 | model = torchmoji_emojis(weight_path=PRETRAINED_PATH, return_attention=True) 84 | assert len(model(seq_tensor)) == 2 85 | 86 | 87 | def test_relabel(): 88 | """ relabel() works with multi-class labels. 89 | """ 90 | nb_classes = 3 91 | inputs = np.array([ 92 | [True, False, False], 93 | [False, True, False], 94 | [True, False, True], 95 | ]) 96 | expected_0 = np.array([True, False, True]) 97 | expected_1 = np.array([False, True, False]) 98 | expected_2 = np.array([False, False, True]) 99 | 100 | assert np.array_equal(relabel(inputs, 0, nb_classes), expected_0) 101 | assert np.array_equal(relabel(inputs, 1, nb_classes), expected_1) 102 | assert np.array_equal(relabel(inputs, 2, nb_classes), expected_2) 103 | 104 | 105 | def test_relabel_binary(): 106 | """ relabel() works with binary classification (no changes to labels) 107 | """ 108 | nb_classes = 2 109 | inputs = np.array([True, False, False]) 110 | 111 | assert np.array_equal(relabel(inputs, 0, nb_classes), inputs) 112 | 113 | 114 | @attr('slow') 115 | def test_finetune_full(): 116 | """ finetuning using 'full'. 117 | """ 118 | DATASET_PATH = ROOT_PATH+'/data/SS-Youtube/raw.pickle' 119 | nb_classes = 2 120 | # Keras and pyTorch implementation of the Adam optimizer are slightly different and change a bit the results 121 | # We reduce the min accuracy needed here to pass the test 122 | # See e.g. https://discuss.pytorch.org/t/suboptimal-convergence-when-compared-with-tensorflow-model/5099/11 123 | min_acc = 0.68 124 | 125 | with open(VOCAB_PATH, 'r') as f: 126 | vocab = json.load(f) 127 | 128 | data = load_benchmark(DATASET_PATH, vocab, extend_with=10000) 129 | print('Loading pyTorch model from {}.'.format(PRETRAINED_PATH)) 130 | model = torchmoji_transfer(nb_classes, PRETRAINED_PATH, extend_embedding=data['added']) 131 | print(model) 132 | model, acc = finetune(model, data['texts'], data['labels'], nb_classes, 133 | data['batch_size'], method='full', nb_epochs=1) 134 | 135 | print("Finetune full SS-Youtube 1 epoch acc: {}".format(acc)) 136 | assert acc >= min_acc 137 | 138 | 139 | @attr('slow') 140 | def test_finetune_last(): 141 | """ finetuning using 'last'. 142 | """ 143 | dataset_path = ROOT_PATH + '/data/SS-Youtube/raw.pickle' 144 | nb_classes = 2 145 | min_acc = 0.68 146 | 147 | with open(VOCAB_PATH, 'r') as f: 148 | vocab = json.load(f) 149 | 150 | data = load_benchmark(dataset_path, vocab) 151 | print('Loading model from {}.'.format(PRETRAINED_PATH)) 152 | model = torchmoji_transfer(nb_classes, PRETRAINED_PATH) 153 | print(model) 154 | model, acc = finetune(model, data['texts'], data['labels'], nb_classes, 155 | data['batch_size'], method='last', nb_epochs=1) 156 | 157 | print("Finetune last SS-Youtube 1 epoch acc: {}".format(acc)) 158 | 159 | assert acc >= min_acc 160 | 161 | 162 | def test_score_emoji(): 163 | """ Emoji predictions make sense. 
164 | """ 165 | test_sentences = [ 166 | 'I love mom\'s cooking', 167 | 'I love how you never reply back..', 168 | 'I love cruising with my homies', 169 | 'I love messing with yo mind!!', 170 | 'I love you and now you\'re just gone..', 171 | 'This is shit', 172 | 'This is the shit' 173 | ] 174 | 175 | expected = [ 176 | np.array([36, 4, 8, 16, 47]), 177 | np.array([1, 19, 55, 25, 46]), 178 | np.array([31, 6, 30, 15, 13]), 179 | np.array([54, 44, 9, 50, 49]), 180 | np.array([46, 5, 27, 35, 34]), 181 | np.array([55, 32, 27, 1, 37]), 182 | np.array([48, 11, 6, 31, 9]) 183 | ] 184 | 185 | def top_elements(array, k): 186 | ind = np.argpartition(array, -k)[-k:] 187 | return ind[np.argsort(array[ind])][::-1] 188 | 189 | # Initialize by loading dictionary and tokenize texts 190 | with open(VOCAB_PATH, 'r') as f: 191 | vocabulary = json.load(f) 192 | 193 | st = SentenceTokenizer(vocabulary, 30) 194 | tokens, _, _ = st.tokenize_sentences(test_sentences) 195 | 196 | # Load model and run 197 | model = torchmoji_emojis(weight_path=PRETRAINED_PATH) 198 | prob = model(tokens) 199 | 200 | # Find top emojis for each sentence 201 | for i, t_prob in enumerate(list(prob)): 202 | assert np.array_equal(top_elements(t_prob, 5), expected[i]) 203 | 204 | 205 | def test_encode_texts(): 206 | """ Text encoding is stable. 207 | """ 208 | 209 | TEST_SENTENCES = ['I love mom\'s cooking', 210 | 'I love how you never reply back..', 211 | 'I love cruising with my homies', 212 | 'I love messing with yo mind!!', 213 | 'I love you and now you\'re just gone..', 214 | 'This is shit', 215 | 'This is the shit'] 216 | 217 | 218 | maxlen = 30 219 | batch_size = 32 220 | 221 | with open(VOCAB_PATH, 'r') as f: 222 | vocabulary = json.load(f) 223 | 224 | st = SentenceTokenizer(vocabulary, maxlen) 225 | 226 | print('Loading model from {}.'.format(PRETRAINED_PATH)) 227 | model = torchmoji_feature_encoding(PRETRAINED_PATH) 228 | print(model) 229 | tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES) 230 | encoding = model(tokenized) 231 | 232 | avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3) 233 | assert np.allclose(avg_across_sentences, np.array([-0.023, 0.021, -0.037, -0.001, -0.005])) 234 | 235 | test_encode_texts() -------------------------------------------------------------------------------- /metric/torchMoji/tests/test_helper.py: -------------------------------------------------------------------------------- 1 | """ Module import helper. 2 | Modifies PATH in order to allow us to import the torchmoji directory. 
3 | """ 4 | import sys 5 | from os.path import abspath, dirname 6 | sys.path.insert(0, dirname(dirname(abspath(__file__)))) 7 | -------------------------------------------------------------------------------- /metric/torchMoji/tests/test_sentence_tokenizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, print_function, division, unicode_literals 2 | import test_helper 3 | import json 4 | 5 | from torchmoji.sentence_tokenizer import SentenceTokenizer 6 | 7 | sentences = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'] 8 | 9 | dicts = [ 10 | {'label': 0}, 11 | {'label': 1}, 12 | {'label': 2}, 13 | {'label': 3}, 14 | {'label': 4}, 15 | {'label': 5}, 16 | {'label': 6}, 17 | {'label': 7}, 18 | {'label': 8}, 19 | {'label': 9}, 20 | ] 21 | 22 | train_ind = [0, 5, 3, 6, 8] 23 | val_ind = [9, 2, 1] 24 | test_ind = [4, 7] 25 | 26 | with open('../model/vocabulary.json', 'r') as f: 27 | vocab = json.load(f) 28 | 29 | def test_dataset_split_parameter(): 30 | """ Dataset is split in the desired ratios 31 | """ 32 | split_parameter = [0.7, 0.1, 0.2] 33 | st = SentenceTokenizer(vocab, 30) 34 | 35 | result, result_dicts, _ = st.split_train_val_test(sentences, dicts, 36 | split_parameter, extend_with=0) 37 | train = result[0] 38 | val = result[1] 39 | test = result[2] 40 | 41 | train_dicts = result_dicts[0] 42 | val_dicts = result_dicts[1] 43 | test_dicts = result_dicts[2] 44 | 45 | assert len(train) == len(sentences) * split_parameter[0] 46 | assert len(val) == len(sentences) * split_parameter[1] 47 | assert len(test) == len(sentences) * split_parameter[2] 48 | 49 | assert len(train_dicts) == len(dicts) * split_parameter[0] 50 | assert len(val_dicts) == len(dicts) * split_parameter[1] 51 | assert len(test_dicts) == len(dicts) * split_parameter[2] 52 | 53 | def test_dataset_split_explicit(): 54 | """ Dataset is split according to given indices 55 | """ 56 | split_parameter = [train_ind, val_ind, test_ind] 57 | st = SentenceTokenizer(vocab, 30) 58 | tokenized, _, _ = st.tokenize_sentences(sentences) 59 | 60 | result, result_dicts, added = st.split_train_val_test(sentences, dicts, split_parameter, extend_with=0) 61 | train = result[0] 62 | val = result[1] 63 | test = result[2] 64 | 65 | train_dicts = result_dicts[0] 66 | val_dicts = result_dicts[1] 67 | test_dicts = result_dicts[2] 68 | 69 | tokenized = tokenized 70 | 71 | for i, sentence in enumerate(sentences): 72 | if i in train_ind: 73 | assert tokenized[i] in train 74 | assert dicts[i] in train_dicts 75 | elif i in val_ind: 76 | assert tokenized[i] in val 77 | assert dicts[i] in val_dicts 78 | elif i in test_ind: 79 | assert tokenized[i] in test 80 | assert dicts[i] in test_dicts 81 | 82 | assert len(train) == len(train_ind) 83 | assert len(val) == len(val_ind) 84 | assert len(test) == len(test_ind) 85 | assert len(train_dicts) == len(train_ind) 86 | assert len(val_dicts) == len(val_ind) 87 | assert len(test_dicts) == len(test_ind) 88 | 89 | def test_id_to_sentence(): 90 | """Tokenizing and converting back preserves the input. 91 | """ 92 | vb = {'CUSTOM_MASK': 0, 93 | 'aasdf': 1000, 94 | 'basdf': 2000} 95 | 96 | sentence = 'aasdf basdf basdf basdf' 97 | st = SentenceTokenizer(vb, 30) 98 | token, _, _ = st.tokenize_sentences([sentence]) 99 | assert st.to_sentence(token[0]) == sentence 100 | 101 | def test_id_to_sentence_with_unknown(): 102 | """Tokenizing and converting back preserves the input, except for unknowns. 
103 | """ 104 | vb = {'CUSTOM_MASK': 0, 105 | 'CUSTOM_UNKNOWN': 1, 106 | 'aasdf': 1000, 107 | 'basdf': 2000} 108 | 109 | sentence = 'aasdf basdf ccc' 110 | expected = 'aasdf basdf CUSTOM_UNKNOWN' 111 | st = SentenceTokenizer(vb, 30) 112 | token, _, _ = st.tokenize_sentences([sentence]) 113 | assert st.to_sentence(token[0]) == expected 114 | -------------------------------------------------------------------------------- /metric/torchMoji/tests/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ Tokenization tests. 3 | """ 4 | from __future__ import absolute_import, print_function, division, unicode_literals 5 | 6 | import sys 7 | from nose.tools import nottest 8 | from os.path import dirname, abspath 9 | sys.path.append(dirname(dirname(abspath(__file__)))) 10 | from torchmoji.tokenizer import tokenize 11 | 12 | TESTS_NORMAL = [ 13 | ('200K words!', ['200', 'K', 'words', '!']), 14 | ] 15 | 16 | TESTS_EMOJIS = [ 17 | ('i \U0001f496 you to the moon and back', 18 | ['i', '\U0001f496', 'you', 'to', 'the', 'moon', 'and', 'back']), 19 | ("i\U0001f496you to the \u2605's and back", 20 | ['i', '\U0001f496', 'you', 'to', 'the', 21 | '\u2605', "'", 's', 'and', 'back']), 22 | ('~<3~', ['~', '<3', '~']), 23 | ('<333', ['<333']), 24 | (':-)', [':-)']), 25 | ('>:-(', ['>:-(']), 26 | ('\u266b\u266a\u2605\u2606\u2665\u2764\u2661', 27 | ['\u266b', '\u266a', '\u2605', '\u2606', 28 | '\u2665', '\u2764', '\u2661']), 29 | ] 30 | 31 | TESTS_URLS = [ 32 | ('www.sample.com', ['www.sample.com']), 33 | ('http://endless.horse', ['http://endless.horse']), 34 | ('https://github.mit.ed', ['https://github.mit.ed']), 35 | ] 36 | 37 | TESTS_TWITTER = [ 38 | ('#blacklivesmatter', ['#blacklivesmatter']), 39 | ('#99_percent.', ['#99_percent', '.']), 40 | ('the#99%', ['the', '#99', '%']), 41 | ('@golden_zenith', ['@golden_zenith']), 42 | ('@99_percent', ['@99_percent']), 43 | ('latte-express@mit.ed', ['latte-express@mit.ed']), 44 | ] 45 | 46 | TESTS_PHONE_NUMS = [ 47 | ('518)528-0252', ['518', ')', '528', '-', '0252']), 48 | ('1200-0221-0234', ['1200', '-', '0221', '-', '0234']), 49 | ('1200.0221.0234', ['1200', '.', '0221', '.', '0234']), 50 | ] 51 | 52 | TESTS_DATETIME = [ 53 | ('15:00', ['15', ':', '00']), 54 | ('2:00pm', ['2', ':', '00', 'pm']), 55 | ('9/14/16', ['9', '/', '14', '/', '16']), 56 | ] 57 | 58 | TESTS_CURRENCIES = [ 59 | ('517.933\xa3', ['517', '.', '933', '\xa3']), 60 | ('$517.87', ['$', '517', '.', '87']), 61 | ('1201.6598', ['1201', '.', '6598']), 62 | ('120,6', ['120', ',', '6']), 63 | ('10,00\u20ac', ['10', ',', '00', '\u20ac']), 64 | ('1,000', ['1', ',', '000']), 65 | ('1200pesos', ['1200', 'pesos']), 66 | ] 67 | 68 | TESTS_NUM_SYM = [ 69 | ('5162f', ['5162', 'f']), 70 | ('f5162', ['f', '5162']), 71 | ('1203(', ['1203', '(']), 72 | ('(1203)', ['(', '1203', ')']), 73 | ('1200/', ['1200', '/']), 74 | ('1200+', ['1200', '+']), 75 | ('1202o-east', ['1202', 'o-east']), 76 | ('1200r', ['1200', 'r']), 77 | ('1200-1400', ['1200', '-', '1400']), 78 | ('120/today', ['120', '/', 'today']), 79 | ('today/120', ['today', '/', '120']), 80 | ('120/5', ['120', '/', '5']), 81 | ("120'/5", ['120', "'", '/', '5']), 82 | ('120/5pro', ['120', '/', '5', 'pro']), 83 | ("1200's,)", ['1200', "'", 's', ',', ')']), 84 | ('120.76.218.207', ['120', '.', '76', '.', '218', '.', '207']), 85 | ] 86 | 87 | TESTS_PUNCTUATION = [ 88 | ("don''t", ['don', "''", 't']), 89 | ("don'tcha", ["don'tcha"]), 90 | ('no?!?!;', ['no', '?', '!', '?', '!', ';']), 91 | 
('no??!!..', ['no', '??', '!!', '..']), 92 | ('a.m.', ['a.m.']), 93 | ('.s.u', ['.', 's', '.', 'u']), 94 | ('!!i..n__', ['!!', 'i', '..', 'n', '__']), 95 | ('lv(<3)w(3>)u Mr.!', ['lv', '(', '<3', ')', 'w', '(', '3', 96 | '>', ')', 'u', 'Mr.', '!']), 97 | ('-->', ['--', '>']), 98 | ('->', ['-', '>']), 99 | ('<-', ['<', '-']), 100 | ('<--', ['<', '--']), 101 | ('hello (@person)', ['hello', '(', '@person', ')']), 102 | ] 103 | 104 | 105 | def test_normal(): 106 | """ Normal/combined usage. 107 | """ 108 | test_base(TESTS_NORMAL) 109 | 110 | 111 | def test_emojis(): 112 | """ Tokenizing emojis/emoticons/decorations. 113 | """ 114 | test_base(TESTS_EMOJIS) 115 | 116 | 117 | def test_urls(): 118 | """ Tokenizing URLs. 119 | """ 120 | test_base(TESTS_URLS) 121 | 122 | 123 | def test_twitter(): 124 | """ Tokenizing hashtags, mentions and emails. 125 | """ 126 | test_base(TESTS_TWITTER) 127 | 128 | 129 | def test_phone_nums(): 130 | """ Tokenizing phone numbers. 131 | """ 132 | test_base(TESTS_PHONE_NUMS) 133 | 134 | 135 | def test_datetime(): 136 | """ Tokenizing dates and times. 137 | """ 138 | test_base(TESTS_DATETIME) 139 | 140 | 141 | def test_currencies(): 142 | """ Tokenizing currencies. 143 | """ 144 | test_base(TESTS_CURRENCIES) 145 | 146 | 147 | def test_num_sym(): 148 | """ Tokenizing combinations of numbers and symbols. 149 | """ 150 | test_base(TESTS_NUM_SYM) 151 | 152 | 153 | def test_punctuation(): 154 | """ Tokenizing punctuation and contractions. 155 | """ 156 | test_base(TESTS_PUNCTUATION) 157 | 158 | 159 | @nottest 160 | def test_base(tests): 161 | """ Base function for running tests. 162 | """ 163 | for (test, expected) in tests: 164 | actual = tokenize(test) 165 | assert actual == expected, \ 166 | "Tokenization of \'{}\' failed, expected: {}, actual: {}"\ 167 | .format(test, expected, actual) 168 | -------------------------------------------------------------------------------- /metric/torchMoji/tests/test_word_generator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | from os.path import dirname, abspath 4 | sys.path.append(dirname(dirname(abspath(__file__)))) 5 | from nose.tools import raises 6 | from torchmoji.word_generator import WordGenerator 7 | 8 | IS_PYTHON2 = int(sys.version[0]) == 2 9 | 10 | @raises(ValueError) 11 | def test_only_unicode_accepted(): 12 | """ Non-Unicode strings raise a ValueError. 13 | In Python 3 all string are Unicode 14 | """ 15 | if not IS_PYTHON2: 16 | raise ValueError("You are using python 3 so this test should always pass") 17 | 18 | sentences = [ 19 | u'Hello world', 20 | u'I am unicode', 21 | 'I am not unicode', 22 | ] 23 | 24 | wg = WordGenerator(sentences) 25 | for w in wg: 26 | pass 27 | 28 | 29 | def test_unicode_sentences_ignored_if_set(): 30 | """ Strings with Unicode characters tokenize to empty array if they're not allowed. 31 | """ 32 | sentence = [u'Dobrý den, jak se máš?'] 33 | wg = WordGenerator(sentence, allow_unicode_text=False) 34 | assert wg.get_words(sentence[0]) == [] 35 | 36 | 37 | def test_check_ascii(): 38 | """ check_ascii recognises ASCII words properly. 39 | In Python 3 all string are Unicode 40 | """ 41 | if not IS_PYTHON2: 42 | return 43 | 44 | wg = WordGenerator([]) 45 | assert wg.check_ascii('ASCII') 46 | assert not wg.check_ascii('ščřžýá') 47 | assert not wg.check_ascii('❤ ☀ ☆ ☂ ☻ ♞ ☯ ☭ ☢') 48 | 49 | 50 | def test_convert_unicode_word(): 51 | """ convert_unicode_word converts Unicode words correctly. 
52 | """ 53 | wg = WordGenerator([], allow_unicode_text=True) 54 | 55 | result = wg.convert_unicode_word(u'č') 56 | assert result == (True, u'\u010d'), '{}'.format(result) 57 | 58 | 59 | def test_convert_unicode_word_ignores_if_set(): 60 | """ convert_unicode_word ignores Unicode words if set. 61 | """ 62 | wg = WordGenerator([], allow_unicode_text=False) 63 | 64 | result = wg.convert_unicode_word(u'č') 65 | assert result == (False, ''), '{}'.format(result) 66 | 67 | 68 | def test_convert_unicode_chars(): 69 | """ convert_unicode_word correctly converts accented characters. 70 | """ 71 | wg = WordGenerator([], allow_unicode_text=True) 72 | result = wg.convert_unicode_word(u'ěščřžýáíé') 73 | assert result == (True, u'\u011b\u0161\u010d\u0159\u017e\xfd\xe1\xed\xe9'), '{}'.format(result) 74 | -------------------------------------------------------------------------------- /metric/torchMoji/torchmoji/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /metric/torchMoji/torchmoji/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andreamad8/PPCM/e5bef1bbb70907a3d65de3225a00e4af9104d4a8/metric/torchMoji/torchmoji/__init__.py -------------------------------------------------------------------------------- /metric/torchMoji/torchmoji/attlayer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ Define the Attention Layer of the model. 3 | """ 4 | 5 | from __future__ import print_function, division 6 | 7 | import torch 8 | 9 | from torch.autograd import Variable 10 | from torch.nn import Module 11 | from torch.nn.parameter import Parameter 12 | 13 | class Attention(Module): 14 | """ 15 | Computes a weighted average of the different channels across timesteps. 16 | Uses 1 parameter pr. channel to compute the attention value for a single timestep. 17 | """ 18 | 19 | def __init__(self, attention_size, return_attention=False): 20 | """ Initialize the attention layer 21 | 22 | # Arguments: 23 | attention_size: Size of the attention vector. 24 | return_attention: If true, output will include the weight for each input token 25 | used for the prediction 26 | 27 | """ 28 | super(Attention, self).__init__() 29 | self.return_attention = return_attention 30 | self.attention_size = attention_size 31 | self.attention_vector = Parameter(torch.FloatTensor(attention_size)) 32 | self.attention_vector.data.normal_(std=0.05) # Initialize attention vector 33 | 34 | def __repr__(self): 35 | s = '{name}({attention_size}, return attention={return_attention})' 36 | return s.format(name=self.__class__.__name__, **self.__dict__) 37 | 38 | def forward(self, inputs, input_lengths): 39 | """ Forward pass. 40 | 41 | # Arguments: 42 | inputs (Torch.Variable): Tensor of input sequences 43 | input_lengths (torch.LongTensor): Lengths of the sequences 44 | 45 | # Return: 46 | Tuple with (representations and attentions if self.return_attention else None). 47 | """ 48 | logits = inputs.matmul(self.attention_vector) 49 | unnorm_ai = (logits - logits.max()).exp() 50 | 51 | # Compute a mask for the attention on the padded sequences 52 | # See e.g. 
https://discuss.pytorch.org/t/self-attention-on-words-and-masking/5671/5 53 | max_len = unnorm_ai.size(1) 54 | idxes = torch.arange(0, max_len, out=torch.LongTensor(max_len)).unsqueeze(0) 55 | mask = Variable((idxes < input_lengths.unsqueeze(1)).float()) 56 | 57 | # apply mask and renormalize attention scores (weights) 58 | masked_weights = unnorm_ai * mask 59 | att_sums = masked_weights.sum(dim=1, keepdim=True) # sums per sequence 60 | attentions = masked_weights.div(att_sums) 61 | 62 | # apply attention weights 63 | weighted = torch.mul(inputs, attentions.unsqueeze(-1).expand_as(inputs)) 64 | 65 | # get the final fixed vector representations of the sentences 66 | representations = weighted.sum(dim=1) 67 | 68 | return (representations, attentions if self.return_attention else None) 69 | -------------------------------------------------------------------------------- /metric/torchMoji/torchmoji/create_vocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function, division 3 | 4 | import glob 5 | import json 6 | import uuid 7 | from copy import deepcopy 8 | from collections import defaultdict, OrderedDict 9 | import numpy as np 10 | 11 | from torchmoji.filter_utils import is_special_token 12 | from torchmoji.word_generator import WordGenerator 13 | from torchmoji.global_variables import SPECIAL_TOKENS, VOCAB_PATH 14 | 15 | class VocabBuilder(): 16 | """ Create vocabulary with words extracted from sentences as fed from a 17 | word generator. 18 | """ 19 | def __init__(self, word_gen): 20 | # initialize any new key with value of 0 21 | self.word_counts = defaultdict(lambda: 0, {}) 22 | self.word_length_limit=30 23 | 24 | for token in SPECIAL_TOKENS: 25 | assert len(token) < self.word_length_limit 26 | self.word_counts[token] = 0 27 | self.word_gen = word_gen 28 | 29 | def count_words_in_sentence(self, words): 30 | """ Generates word counts for all tokens in the given sentence. 31 | 32 | # Arguments: 33 | words: Tokenized sentence whose words should be counted. 34 | """ 35 | for word in words: 36 | if 0 < len(word) and len(word) <= self.word_length_limit: 37 | try: 38 | self.word_counts[word] += 1 39 | except KeyError: 40 | self.word_counts[word] = 1 41 | 42 | def save_vocab(self, path=None): 43 | """ Saves the vocabulary into a file. 44 | 45 | # Arguments: 46 | path: Where the vocabulary should be saved. If not specified, a 47 | randomly generated filename is used instead. 48 | """ 49 | dtype = ([('word','|S{}'.format(self.word_length_limit)),('count','int')]) 50 | np_dict = np.array(self.word_counts.items(), dtype=dtype) 51 | 52 | # sort from highest to lowest frequency 53 | np_dict[::-1].sort(order='count') 54 | data = np_dict 55 | 56 | if path is None: 57 | path = str(uuid.uuid4()) 58 | 59 | np.savez_compressed(path, data=data) 60 | print("Saved dict to {}".format(path)) 61 | 62 | def get_next_word(self): 63 | """ Returns next tokenized sentence from the word generator. 64 | 65 | # Returns: 66 | List of strings, representing the next tokenized sentence. 67 | """ 68 | return self.word_gen.__iter__().next() 69 | 70 | def count_all_words(self): 71 | """ Generates word counts for all words in all sentences of the word 72 | generator. 73 | """ 74 | for words, _ in self.word_gen: 75 | self.count_words_in_sentence(words) 76 | 77 | class MasterVocab(): 78 | """ Combines vocabularies. 
79 | """ 80 | def __init__(self): 81 | 82 | # initialize custom tokens 83 | self.master_vocab = {} 84 | 85 | def populate_master_vocab(self, vocab_path, min_words=1, force_appearance=None): 86 | """ Populates the master vocabulary using all vocabularies found in the 87 | given path. Vocabularies should be named *.npz. Expects the 88 | vocabularies to be numpy arrays with counts. Normalizes the counts 89 | and combines them. 90 | 91 | # Arguments: 92 | vocab_path: Path containing vocabularies to be combined. 93 | min_words: Minimum amount of occurences a word must have in order 94 | to be included in the master vocabulary. 95 | force_appearance: Optional vocabulary filename that will be added 96 | to the master vocabulary no matter what. This vocabulary must 97 | be present in vocab_path. 98 | """ 99 | 100 | paths = glob.glob(vocab_path + '*.npz') 101 | sizes = {path: 0 for path in paths} 102 | dicts = {path: {} for path in paths} 103 | 104 | # set up and get sizes of individual dictionaries 105 | for path in paths: 106 | np_data = np.load(path)['data'] 107 | 108 | for entry in np_data: 109 | word, count = entry 110 | if count < min_words: 111 | continue 112 | if is_special_token(word): 113 | continue 114 | dicts[path][word] = count 115 | 116 | sizes[path] = sum(dicts[path].values()) 117 | print('Overall word count for {} -> {}'.format(path, sizes[path])) 118 | print('Overall word number for {} -> {}'.format(path, len(dicts[path]))) 119 | 120 | vocab_of_max_size = max(sizes, key=sizes.get) 121 | max_size = sizes[vocab_of_max_size] 122 | print('Min: {}, {}, {}'.format(sizes, vocab_of_max_size, max_size)) 123 | 124 | # can force one vocabulary to always be present 125 | if force_appearance is not None: 126 | force_appearance_path = [p for p in paths if force_appearance in p][0] 127 | force_appearance_vocab = deepcopy(dicts[force_appearance_path]) 128 | print(force_appearance_path) 129 | else: 130 | force_appearance_path, force_appearance_vocab = None, None 131 | 132 | # normalize word counts before inserting into master dict 133 | for path in paths: 134 | normalization_factor = max_size / sizes[path] 135 | print('Norm factor for path {} -> {}'.format(path, normalization_factor)) 136 | 137 | for word in dicts[path]: 138 | if is_special_token(word): 139 | print("SPECIAL - ", word) 140 | continue 141 | normalized_count = dicts[path][word] * normalization_factor 142 | 143 | # can force one vocabulary to always be present 144 | if force_appearance_vocab is not None: 145 | try: 146 | force_word_count = force_appearance_vocab[word] 147 | except KeyError: 148 | continue 149 | #if force_word_count < 5: 150 | #continue 151 | 152 | if word in self.master_vocab: 153 | self.master_vocab[word] += normalized_count 154 | else: 155 | self.master_vocab[word] = normalized_count 156 | 157 | print('Size of master_dict {}'.format(len(self.master_vocab))) 158 | print("Hashes for master dict: {}".format( 159 | len([w for w in self.master_vocab if '#' in w[0]]))) 160 | 161 | def save_vocab(self, path_count, path_vocab, word_limit=100000): 162 | """ Saves the master vocabulary into a file. 
163 | """ 164 | 165 | # reserve space for 10 special tokens 166 | words = OrderedDict() 167 | for token in SPECIAL_TOKENS: 168 | # store -1 instead of np.inf, which can overflow 169 | words[token] = -1 170 | 171 | # sort words by frequency 172 | desc_order = OrderedDict(sorted(self.master_vocab.items(), 173 | key=lambda kv: kv[1], reverse=True)) 174 | words.update(desc_order) 175 | 176 | # use encoding of up to 30 characters (no token conversions) 177 | # use float to store large numbers (we don't care about precision loss) 178 | np_vocab = np.array(words.items(), 179 | dtype=([('word','|S30'),('count','float')])) 180 | 181 | # output count for debugging 182 | counts = np_vocab[:word_limit] 183 | np.savez_compressed(path_count, counts=counts) 184 | 185 | # output the index of each word for easy lookup 186 | final_words = OrderedDict() 187 | for i, w in enumerate(words.keys()[:word_limit]): 188 | final_words.update({w:i}) 189 | with open(path_vocab, 'w') as f: 190 | f.write(json.dumps(final_words, indent=4, separators=(',', ': '))) 191 | 192 | 193 | def all_words_in_sentences(sentences): 194 | """ Extracts all unique words from a given list of sentences. 195 | 196 | # Arguments: 197 | sentences: List or word generator of sentences to be processed. 198 | 199 | # Returns: 200 | List of all unique words contained in the given sentences. 201 | """ 202 | vocab = [] 203 | if isinstance(sentences, WordGenerator): 204 | sentences = [s for s, _ in sentences] 205 | 206 | for sentence in sentences: 207 | for word in sentence: 208 | if word not in vocab: 209 | vocab.append(word) 210 | 211 | return vocab 212 | 213 | 214 | def extend_vocab_in_file(vocab, max_tokens=10000, vocab_path=VOCAB_PATH): 215 | """ Extends JSON-formatted vocabulary with words from vocab that are not 216 | present in the current vocabulary. Adds up to max_tokens words. 217 | Overwrites file in vocab_path. 218 | 219 | # Arguments: 220 | new_vocab: Vocabulary to be added. MUST have word_counts populated, i.e. 221 | must have run count_all_words() previously. 222 | max_tokens: Maximum number of words to be added. 223 | vocab_path: Path to the vocabulary json which is to be extended. 224 | """ 225 | try: 226 | with open(vocab_path, 'r') as f: 227 | current_vocab = json.load(f) 228 | except IOError: 229 | print('Vocabulary file not found, expected at ' + vocab_path) 230 | return 231 | 232 | extend_vocab(current_vocab, vocab, max_tokens) 233 | 234 | # Save back to file 235 | with open(vocab_path, 'w') as f: 236 | json.dump(current_vocab, f, sort_keys=True, indent=4, separators=(',',': ')) 237 | 238 | 239 | def extend_vocab(current_vocab, new_vocab, max_tokens=10000): 240 | """ Extends current vocabulary with words from vocab that are not 241 | present in the current vocabulary. Adds up to max_tokens words. 242 | 243 | # Arguments: 244 | current_vocab: Current dictionary of tokens. 245 | new_vocab: Vocabulary to be added. MUST have word_counts populated, i.e. 246 | must have run count_all_words() previously. 247 | max_tokens: Maximum number of words to be added. 248 | 249 | # Returns: 250 | How many new tokens have been added. 
251 | """ 252 | if max_tokens < 0: 253 | max_tokens = 10000 254 | 255 | words = OrderedDict() 256 | 257 | # sort words by frequency 258 | desc_order = OrderedDict(sorted(new_vocab.word_counts.items(), 259 | key=lambda kv: kv[1], reverse=True)) 260 | words.update(desc_order) 261 | 262 | base_index = len(current_vocab.keys()) 263 | added = 0 264 | for word in words: 265 | if added >= max_tokens: 266 | break 267 | if word not in current_vocab.keys(): 268 | current_vocab[word] = base_index + added 269 | added += 1 270 | 271 | return added 272 | -------------------------------------------------------------------------------- /metric/torchMoji/torchmoji/filter_input.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function, division 3 | import codecs 4 | import csv 5 | import numpy as np 6 | from emoji import UNICODE_EMOJI 7 | 8 | def read_english(path="english_words.txt", add_emojis=True): 9 | # read english words for filtering (includes emojis as part of set) 10 | english = set() 11 | with codecs.open(path, "r", "utf-8") as f: 12 | for line in f: 13 | line = line.strip().lower().replace('\n', '') 14 | if len(line): 15 | english.add(line) 16 | if add_emojis: 17 | for e in UNICODE_EMOJI: 18 | english.add(e) 19 | return english 20 | 21 | def read_wanted_emojis(path="wanted_emojis.csv"): 22 | emojis = [] 23 | with open(path, 'rb') as f: 24 | reader = csv.reader(f) 25 | for line in reader: 26 | line = line[0].strip().replace('\n', '') 27 | line = line.decode('unicode-escape') 28 | emojis.append(line) 29 | return emojis 30 | 31 | def read_non_english_users(path="unwanted_users.npz"): 32 | try: 33 | neu_set = set(np.load(path)['userids']) 34 | except IOError: 35 | neu_set = set() 36 | return neu_set 37 | -------------------------------------------------------------------------------- /metric/torchMoji/torchmoji/filter_utils.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | from __future__ import print_function, division, unicode_literals 4 | import sys 5 | import re 6 | import string 7 | import emoji 8 | from itertools import groupby 9 | 10 | import numpy as np 11 | from torchmoji.tokenizer import RE_MENTION, RE_URL 12 | from torchmoji.global_variables import SPECIAL_TOKENS 13 | 14 | try: 15 | unichr # Python 2 16 | except NameError: 17 | unichr = chr # Python 3 18 | 19 | 20 | AtMentionRegex = re.compile(RE_MENTION) 21 | urlRegex = re.compile(RE_URL) 22 | 23 | # from http://bit.ly/2rdjgjE (UTF-8 encodings and Unicode chars) 24 | VARIATION_SELECTORS = [ '\ufe00', 25 | '\ufe01', 26 | '\ufe02', 27 | '\ufe03', 28 | '\ufe04', 29 | '\ufe05', 30 | '\ufe06', 31 | '\ufe07', 32 | '\ufe08', 33 | '\ufe09', 34 | '\ufe0a', 35 | '\ufe0b', 36 | '\ufe0c', 37 | '\ufe0d', 38 | '\ufe0e', 39 | '\ufe0f'] 40 | 41 | # from https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python 42 | ALL_CHARS = (unichr(i) for i in range(sys.maxunicode)) 43 | CONTROL_CHARS = ''.join(map(unichr, list(range(0,32)) + list(range(127,160)))) 44 | CONTROL_CHAR_REGEX = re.compile('[%s]' % re.escape(CONTROL_CHARS)) 45 | 46 | def is_special_token(word): 47 | equal = False 48 | for spec in SPECIAL_TOKENS: 49 | if word == spec: 50 | equal = True 51 | break 52 | return equal 53 | 54 | def mostly_english(words, english, pct_eng_short=0.5, pct_eng_long=0.6, ignore_special_tokens=True, min_length=2): 55 | """ Ensure text meets threshold for 
containing English words """ 56 | 57 | n_words = 0 58 | n_english = 0 59 | 60 | if english is None: 61 | return True, 0, 0 62 | 63 | for w in words: 64 | if len(w) < min_length: 65 | continue 66 | if punct_word(w): 67 | continue 68 | if ignore_special_tokens and is_special_token(w): 69 | continue 70 | n_words += 1 71 | if w in english: 72 | n_english += 1 73 | 74 | if n_words < 2: 75 | return True, n_words, n_english 76 | if n_words < 5: 77 | valid_english = n_english >= n_words * pct_eng_short 78 | else: 79 | valid_english = n_english >= n_words * pct_eng_long 80 | return valid_english, n_words, n_english 81 | 82 | def correct_length(words, min_words, max_words, ignore_special_tokens=True): 83 | """ Ensure the number of words (ignoring punctuation and, optionally, 84 | special tokens) is within the min and max word limits. """ 85 | 86 | if min_words is None: 87 | min_words = 0 88 | 89 | if max_words is None: 90 | max_words = 99999 91 | 92 | n_words = 0 93 | for w in words: 94 | if punct_word(w): 95 | continue 96 | if ignore_special_tokens and is_special_token(w): 97 | continue 98 | n_words += 1 99 | valid = min_words <= n_words and n_words <= max_words 100 | return valid 101 | 102 | def punct_word(word, punctuation=string.punctuation): 103 | return all([True if c in punctuation else False for c in word]) 104 | 105 | def load_non_english_user_set(): 106 | non_english_user_set = set(np.load('uids.npz')['data']) 107 | return non_english_user_set 108 | 109 | def non_english_user(userid, non_english_user_set): 110 | neu_found = int(userid) in non_english_user_set 111 | return neu_found 112 | 113 | def separate_emojis_and_text(text): 114 | emoji_chars = [] 115 | non_emoji_chars = [] 116 | for c in text: 117 | if c in emoji.UNICODE_EMOJI: 118 | emoji_chars.append(c) 119 | else: 120 | non_emoji_chars.append(c) 121 | return ''.join(emoji_chars), ''.join(non_emoji_chars) 122 | 123 | def extract_emojis(text, wanted_emojis): 124 | text = remove_variation_selectors(text) 125 | return [c for c in text if c in wanted_emojis] 126 | 127 | def remove_variation_selectors(text): 128 | """ Remove styling glyph variants for Unicode characters. 129 | For instance, remove skin color from emojis. 130 | """ 131 | for var in VARIATION_SELECTORS: 132 | text = text.replace(var, '') 133 | return text 134 | 135 | def shorten_word(word): 136 | """ Shorten groupings of 3+ identical consecutive chars to 2, e.g. '!!!!' --> '!!' 137 | """ 138 | 139 | # only shorten ASCII words 140 | try: 141 | word.decode('ascii') 142 | except (UnicodeDecodeError, UnicodeEncodeError, AttributeError) as e: 143 | return word 144 | 145 | # must have at least 3 char to be shortened 146 | if len(word) < 3: 147 | return word 148 | 149 | # find groups of 3+ consecutive letters 150 | letter_groups = [list(g) for k, g in groupby(word)] 151 | triple_or_more = [''.join(g) for g in letter_groups if len(g) >= 3] 152 | if len(triple_or_more) == 0: 153 | return word 154 | 155 | # replace letters to find the short word 156 | short_word = word 157 | for trip in triple_or_more: 158 | short_word = short_word.replace(trip, trip[0]*2) 159 | 160 | return short_word 161 | 162 | def detect_special_tokens(word): 163 | try: 164 | int(word) 165 | word = SPECIAL_TOKENS[4] 166 | except ValueError: 167 | if AtMentionRegex.findall(word): 168 | word = SPECIAL_TOKENS[2] 169 | elif urlRegex.findall(word): 170 | word = SPECIAL_TOKENS[3] 171 | return word 172 | 173 | def process_word(word): 174 | """ Shortening and converting the word to a special token if relevant. 
175 | """ 176 | word = shorten_word(word) 177 | word = detect_special_tokens(word) 178 | return word 179 | 180 | def remove_control_chars(text): 181 | return CONTROL_CHAR_REGEX.sub('', text) 182 | 183 | def convert_nonbreaking_space(text): 184 | # ugly hack handling non-breaking space no matter how badly it's been encoded in the input 185 | for r in ['\\\\xc2', '\\xc2', '\xc2', '\\\\xa0', '\\xa0', '\xa0']: 186 | text = text.replace(r, ' ') 187 | return text 188 | 189 | def convert_linebreaks(text): 190 | # ugly hack handling non-breaking space no matter how badly it's been encoded in the input 191 | # space around to ensure proper tokenization 192 | for r in ['\\\\n', '\\n', '\n', '\\\\r', '\\r', '\r', '
']: 193 | text = text.replace(r, ' ' + SPECIAL_TOKENS[5] + ' ') 194 | return text 195 | -------------------------------------------------------------------------------- /metric/torchMoji/torchmoji/global_variables.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ Global variables. 3 | """ 4 | import tempfile 5 | from os.path import abspath, dirname 6 | 7 | # The ordering of these special tokens matters 8 | # blank tokens can be used for new purposes 9 | # Tokenizer should be updated if special token prefix is changed 10 | SPECIAL_PREFIX = 'CUSTOM_' 11 | SPECIAL_TOKENS = ['CUSTOM_MASK', 12 | 'CUSTOM_UNKNOWN', 13 | 'CUSTOM_AT', 14 | 'CUSTOM_URL', 15 | 'CUSTOM_NUMBER', 16 | 'CUSTOM_BREAK'] 17 | SPECIAL_TOKENS.extend(['{}BLANK_{}'.format(SPECIAL_PREFIX, i) for i in range(6, 10)]) 18 | 19 | ROOT_PATH = dirname(dirname(abspath(__file__))) 20 | VOCAB_PATH = '{}/model/vocabulary.json'.format(ROOT_PATH) 21 | PRETRAINED_PATH = '{}/model/pytorch_model.bin'.format(ROOT_PATH) 22 | 23 | WEIGHTS_DIR = tempfile.mkdtemp() 24 | 25 | NB_TOKENS = 50000 26 | NB_EMOJI_CLASSES = 64 27 | FINETUNING_METHODS = ['last', 'full', 'new', 'chain-thaw'] 28 | FINETUNING_METRICS = ['acc', 'weighted'] 29 | -------------------------------------------------------------------------------- /metric/torchMoji/torchmoji/tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Splits up a Unicode string into a list of tokens. 4 | Recognises: 5 | - Abbreviations 6 | - URLs 7 | - Emails 8 | - #hashtags 9 | - @mentions 10 | - emojis 11 | - emoticons (limited support) 12 | 13 | Multiple consecutive symbols are also treated as a single token. 14 | ''' 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import re 18 | 19 | # Basic patterns. 20 | RE_NUM = r'[0-9]+' 21 | RE_WORD = r'[a-zA-Z]+' 22 | RE_WHITESPACE = r'\s+' 23 | RE_ANY = r'.' 24 | 25 | # Combined words such as 'red-haired' or 'CUSTOM_TOKEN' 26 | RE_COMB = r'[a-zA-Z]+[-_][a-zA-Z]+' 27 | 28 | # English-specific patterns 29 | RE_CONTRACTIONS = RE_WORD + r'\'' + RE_WORD 30 | 31 | TITLES = [ 32 | r'Mr\.', 33 | r'Ms\.', 34 | r'Mrs\.', 35 | r'Dr\.', 36 | r'Prof\.', 37 | ] 38 | # Ensure case insensitivity 39 | RE_TITLES = r'|'.join([r'(?i)' + t for t in TITLES]) 40 | 41 | # Symbols have to be created as separate patterns in order to match consecutive 42 | # identical symbols. 43 | SYMBOLS = r'()<!?.,:;\'"/\\|-_=[]<>{}~$^&*;:%+\xa3€`' 44 | RE_SYMBOL = r'|'.join([re.escape(s) + r'+' for s in SYMBOLS]) 45 | 46 | # Hash symbols and at symbols have to be defined separately in order to not 47 | # clash with hashtags and mentions if there are multiple - i.e. 
48 | # ##hello -> ['#', '#hello'] instead of ['##', 'hello'] 49 | SPECIAL_SYMBOLS = r'|#+(?=#[a-zA-Z0-9_]+)|@+(?=@[a-zA-Z0-9_]+)|#+|@+' 50 | RE_SYMBOL += SPECIAL_SYMBOLS 51 | 52 | RE_ABBREVIATIONS = r'\b(?:[a-zA-Z]\.){2,}' 53 | 54 | # Twitter hashtags and mentions, e.g. #hashtag and @mention 55 | RE_HASHTAG = r'#[a-zA-Z0-9_]+' 56 | RE_MENTION = r'@[a-zA-Z0-9_]+' 57 | 58 | RE_URL = r'(?:https?://|www\.)(?:[a-zA-Z0-9\-]+\.)+[a-zA-Z0-9]+(?:/[^\s]*)?' 59 | RE_EMAIL = r'\b[a-zA-Z0-9._\-]+@[a-zA-Z0-9\-]+\.[a-zA-Z0-9.\-]+\b' 60 | 61 | # Hearts and emoticon building blocks, e.g. <3 and >:-( 62 | RE_HEART = r'(?:<+/?3+)+' 63 | EMOTICONS_START = [ 64 | r'>:', 65 | r':', 66 | r'=', 67 | r';', 68 | ] 69 | EMOTICONS_MID = [ 70 | r'-', 71 | r',', 72 | r'^', 73 | '\'', 74 | '\"', 75 | ] 76 | EMOTICONS_END = [ 77 | r'D', 78 | r'd', 79 | r'p', 80 | r'P', 81 | r'v', 82 | r')', 83 | r'o', 84 | r'O', 85 | r'(', 86 | r'3', 87 | r'/', 88 | r'|', 89 | '\\', 90 | ] 91 | EMOTICONS_EXTRA = [ 92 | r'-_-', 93 | r'x_x', 94 | r'^_^', 95 | r'o.o', 96 | r'o_o', 97 | r'(:', 98 | r'):', 99 | r');', 100 | r'(;', 101 | ] 102 | 103 | RE_EMOTICON = r'|'.join([re.escape(s) for s in EMOTICONS_EXTRA]) 104 | for s in EMOTICONS_START: 105 | for m in EMOTICONS_MID: 106 | for e in EMOTICONS_END: 107 | RE_EMOTICON += '|{0}{1}?{2}+'.format(re.escape(s), re.escape(m), re.escape(e)) 108 | 109 | # requires ucs4 in python2.7 or python3+ 110 | # RE_EMOJI = r"""[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]""" 111 | # safe for all python 112 | RE_EMOJI = r"""\ud83c[\udf00-\udfff]|\ud83d[\udc00-\ude4f\ude80-\udeff]|[\u2600-\u26FF\u2700-\u27BF]""" 113 | 114 | # List of matched token patterns, ordered from most specific to least specific. 115 | TOKENS = [ 116 | RE_URL, 117 | RE_EMAIL, 118 | RE_COMB, 119 | RE_HASHTAG, 120 | RE_MENTION, 121 | RE_HEART, 122 | RE_EMOTICON, 123 | RE_CONTRACTIONS, 124 | RE_TITLES, 125 | RE_ABBREVIATIONS, 126 | RE_NUM, 127 | RE_WORD, 128 | RE_SYMBOL, 129 | RE_EMOJI, 130 | RE_ANY 131 | ] 132 | 133 | # List of ignored token patterns 134 | IGNORED = [ 135 | RE_WHITESPACE 136 | ] 137 | 138 | # Final pattern 139 | RE_PATTERN = re.compile(r'|'.join(IGNORED) + r'|(' + r'|'.join(TOKENS) + r')', 140 | re.UNICODE) 141 | 142 | 143 | def tokenize(text): 144 | '''Splits given input string into a list of tokens. 145 | 146 | # Arguments: 147 | text: Input string to be tokenized. 148 | 149 | # Returns: 150 | List of strings (tokens). 
151 | ''' 152 | result = RE_PATTERN.findall(text) 153 | 154 | # Remove empty strings 155 | result = [t for t in result if t.strip()] 156 | return result 157 | -------------------------------------------------------------------------------- /models/heads.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from models.pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config 5 | import os 6 | from transformers import BertTokenizer,BertModel,AutoConfig 7 | 8 | EPSILON = 1e-10 9 | 10 | class Discriminator(torch.nn.Module): 11 | """Transformer encoder followed by a Classification Head""" 12 | 13 | def __init__( 14 | self, 15 | class_size, 16 | pretrained_model="medium", 17 | cached_mode=False, 18 | load_weight=None, 19 | model_pretrained=None, 20 | entailment=False, 21 | device='cuda' 22 | ): 23 | super(Discriminator, self).__init__() 24 | 25 | self.entailment = entailment 26 | model_path = f'models/dialoGPT/{pretrained_model}/' 27 | config = GPT2Config.from_json_file(os.path.join(model_path, 'config.json')) 28 | self.tokenizer = GPT2Tokenizer.from_pretrained(model_path) 29 | if model_pretrained != None: 30 | self.encoder = model_pretrained 31 | else: 32 | self.encoder = load_model(GPT2LMHeadModel(config), model_path+f"{pretrained_model}_ft.pkl", None, verbose=True) 33 | self.embed_size = config.n_embd 34 | 35 | if(self.entailment): 36 | self.classifier_head = AttentionHead(class_size=class_size, embed_size=self.embed_size) 37 | else: 38 | self.classifier_head = ClassificationHead( 39 | class_size=class_size, 40 | embed_size=self.embed_size 41 | ) 42 | self.cached_mode = cached_mode 43 | if(load_weight != None): 44 | self.classifier_head.load_state_dict(torch.load(load_weight)) 45 | self.device = device 46 | self.class_size = class_size 47 | 48 | def get_classifier(self): 49 | return self.classifier_head 50 | 51 | def train_custom(self): 52 | for param in self.encoder.parameters(): 53 | param.requires_grad = False 54 | self.classifier_head.train() 55 | 56 | def avg_representation(self, x, entailment=False): 57 | mask = x.ne(0).unsqueeze(2).repeat( 58 | 1, 1, self.embed_size 59 | ).float().to(self.device).detach() 60 | hidden, _ = self.encoder.transformer(x) 61 | masked_hidden = hidden * mask 62 | if(entailment): 63 | return masked_hidden 64 | else: 65 | avg_hidden = torch.sum(masked_hidden, dim=1) / ( 66 | torch.sum(mask, dim=1).detach() + EPSILON 67 | ) 68 | return avg_hidden 69 | 70 | def forward(self, x): 71 | if(self.entailment): 72 | P = self.avg_representation(x[0].to(self.device),entailment=True) 73 | H = self.avg_representation(x[1].to(self.device),entailment=True) 74 | logits = self.classifier_head(P,H) 75 | return logits 76 | else: 77 | if self.cached_mode: 78 | avg_hidden = x.to(self.device) 79 | else: 80 | avg_hidden = self.avg_representation(x.to(self.device)) 81 | 82 | logits = self.classifier_head(avg_hidden) 83 | return logits 84 | 85 | 86 | class Scorer(torch.nn.Module): 87 | """Transformer encoder followed by a Classification Head""" 88 | 89 | def __init__( 90 | self, 91 | hidden_dim, 92 | output_dim, 93 | n_layers, 94 | bidirectional, 95 | dropout 96 | ): 97 | super(Scorer, self).__init__() 98 | 99 | # self.entailment = entailment 100 | # self.class_size = class_size 101 | self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 102 | self.bert = BertModel.from_pretrained('bert-base-uncased') 103 | embedding_dim = 
self.bert.config.to_dict()['hidden_size'] 104 | 105 | 106 | self.rnn = nn.GRU(embedding_dim, 107 | hidden_dim, 108 | num_layers = n_layers, 109 | bidirectional = bidirectional, 110 | batch_first = True, 111 | dropout = 0 if n_layers < 2 else dropout) 112 | 113 | self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim) 114 | 115 | self.dropout = nn.Dropout(dropout) 116 | 117 | def forward(self, text): 118 | 119 | #text = [batch size, sent len] 120 | 121 | with torch.no_grad(): 122 | embedded = self.bert(text)[0] 123 | 124 | #embedded = [batch size, sent len, emb dim] 125 | 126 | _, hidden = self.rnn(embedded) 127 | 128 | #hidden = [n layers * n directions, batch size, emb dim] 129 | 130 | if self.rnn.bidirectional: 131 | hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)) 132 | else: 133 | hidden = self.dropout(hidden[-1,:,:]) 134 | 135 | #hidden = [batch size, hid dim] 136 | 137 | output = self.out(hidden) 138 | 139 | #output = [batch size, out dim] 140 | 141 | return output 142 | 143 | 144 | class AttentionHead(nn.Module): 145 | ''' 146 | https://github.com/libowen2121/SNLI-decomposable-attention/blob/master/models/baseline_snli.py 147 | intra sentence attention 148 | ''' 149 | 150 | def __init__(self, embed_size, class_size): 151 | super(AttentionHead, self).__init__() 152 | 153 | self.embed_size = embed_size 154 | self.class_size = class_size 155 | 156 | self.mlp_f = self._mlp_layers(self.embed_size, self.embed_size) 157 | self.mlp_g = self._mlp_layers(2 * self.embed_size, self.embed_size) 158 | self.mlp_h = self._mlp_layers(2 * self.embed_size, self.embed_size) 159 | 160 | self.final_linear = nn.Linear(self.embed_size, self.class_size) 161 | 162 | def _mlp_layers(self, input_dim, output_dim): 163 | mlp_layers = [] 164 | mlp_layers.append(nn.Dropout(p=0.2)) 165 | mlp_layers.append(nn.Linear(input_dim, output_dim)) 166 | mlp_layers.append(nn.ReLU()) 167 | mlp_layers.append(nn.Dropout(p=0.2)) 168 | mlp_layers.append(nn.Linear(output_dim, output_dim)) 169 | mlp_layers.append(nn.ReLU()) 170 | return nn.Sequential(*mlp_layers) # * used to unpack list 171 | 172 | def forward(self, sent1_linear, sent2_linear): 173 | ''' 174 | sent_linear: batch_size x length x hidden_size 175 | ''' 176 | len1 = sent1_linear.size(1) 177 | len2 = sent2_linear.size(1) 178 | 179 | '''attend''' 180 | f1 = self.mlp_f(sent1_linear.view(-1, self.embed_size)) 181 | f2 = self.mlp_f(sent2_linear.view(-1, self.embed_size)) 182 | 183 | f1 = f1.view(-1, len1, self.embed_size) 184 | # batch_size x len1 x hidden_size 185 | f2 = f2.view(-1, len2, self.embed_size) 186 | # batch_size x len2 x hidden_size 187 | 188 | score1 = torch.bmm(f1, torch.transpose(f2, 1, 2)) 189 | # e_{ij} batch_size x len1 x len2 190 | prob1 = F.softmax(score1.view(-1, len2),dim=1).view(-1, len1, len2) 191 | # batch_size x len1 x len2 192 | 193 | score2 = torch.transpose(score1.contiguous(), 1, 2) 194 | score2 = score2.contiguous() 195 | # e_{ji} batch_size x len2 x len1 196 | prob2 = F.softmax(score2.view(-1, len1),dim=1).view(-1, len2, len1) 197 | # batch_size x len2 x len1 198 | 199 | sent1_combine = torch.cat( 200 | (sent1_linear, torch.bmm(prob1, sent2_linear)), 2) 201 | # batch_size x len1 x (hidden_size x 2) 202 | sent2_combine = torch.cat( 203 | (sent2_linear, torch.bmm(prob2, sent1_linear)), 2) 204 | # batch_size x len2 x (hidden_size x 2) 205 | 206 | '''sum''' 207 | g1 = self.mlp_g(sent1_combine.view(-1, 2 * self.embed_size)) 208 | g2 = self.mlp_g(sent2_combine.view(-1, 2 * self.embed_size)) 209 
| g1 = g1.view(-1, len1, self.embed_size) 210 | # batch_size x len1 x hidden_size 211 | g2 = g2.view(-1, len2, self.embed_size) 212 | # batch_size x len2 x hidden_size 213 | 214 | sent1_output = torch.sum(g1, 1) # batch_size x 1 x hidden_size 215 | sent1_output = torch.squeeze(sent1_output, 1) 216 | sent2_output = torch.sum(g2, 1) # batch_size x 1 x hidden_size 217 | sent2_output = torch.squeeze(sent2_output, 1) 218 | 219 | input_combine = torch.cat((sent1_output, sent2_output), 1) 220 | # batch_size x (2 * hidden_size) 221 | h = self.mlp_h(input_combine) 222 | # batch_size * hidden_size 223 | 224 | h = self.final_linear(h) 225 | 226 | 227 | return h 228 | 229 | class ClassificationHead(torch.nn.Module): 230 | """Classification Head for transformer encoders""" 231 | 232 | def __init__(self, class_size, embed_size): 233 | super(ClassificationHead, self).__init__() 234 | self.class_size = class_size 235 | self.embed_size = embed_size 236 | self.mlp = torch.nn.Linear(embed_size, class_size) 237 | 238 | def forward(self, hidden_state): 239 | logits = self.mlp(hidden_state) 240 | return logits 241 | 242 | 243 | def load_model(model, checkpoint, args, verbose=False): 244 | if checkpoint is None or checkpoint == "None": 245 | if verbose: 246 | print('No checkpoint provided for %s!' % model._get_name()) 247 | else: 248 | if not os.path.exists(checkpoint): 249 | raise ValueError('checkpoint %s not exist' % checkpoint) 250 | if verbose: 251 | print('Loading finetuned model from %s' % checkpoint) 252 | model_state_dict = torch.load(checkpoint) 253 | 254 | model_state_dict = fix_state_dict_namespace(model_state_dict) 255 | 256 | start_model = model 257 | if (hasattr(model, "transformer") 258 | and all(not s.startswith('transformer.') 259 | for s in model_state_dict.keys())): 260 | print('Loading transfomer only') 261 | start_model = model.transformer 262 | start_model.load_state_dict(model_state_dict) 263 | 264 | return model 265 | 266 | 267 | def fix_state_dict_namespace(model_state_dict): 268 | old_keys = [] 269 | new_keys = [] 270 | for t in model_state_dict: 271 | new_key = t 272 | if t.startswith('module.'): 273 | new_key = t.replace('module.', '') 274 | old_keys.append(t) 275 | new_keys.append(new_key) 276 | 277 | for old_key, new_key in zip(old_keys, new_keys): 278 | model_state_dict[new_key] = model_state_dict.pop(old_key) 279 | 280 | return model_state_dict 281 | -------------------------------------------------------------------------------- /models/pytorch_pretrained_bert/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.6.1" 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 3 | from .tokenization_openai import OpenAIGPTTokenizer 4 | from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) 5 | from .tokenization_gpt2 import GPT2Tokenizer 6 | 7 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 8 | BertForMaskedLM, BertForNextSentencePrediction, 9 | BertForSequenceClassification, BertForMultipleChoice, 10 | BertForTokenClassification, BertForQuestionAnswering, 11 | load_tf_weights_in_bert) 12 | from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel, 13 | OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, 14 | load_tf_weights_in_openai_gpt) 15 | from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel, 16 | load_tf_weights_in_transfo_xl) 17 | from .modeling_gpt2 import (GPT2Config, GPT2Model, 18 | GPT2LMHeadModel, 
GPT2DoubleHeadsModel, 19 | load_tf_weights_in_gpt2) 20 | 21 | from .optimization import BertAdam 22 | from .optimization_openai import OpenAIAdam 23 | 24 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path, WEIGHTS_NAME, CONFIG_NAME 25 | -------------------------------------------------------------------------------- /models/pytorch_pretrained_bert/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | if (len(sys.argv) != 4 and len(sys.argv) != 5) or sys.argv[1] not in [ 5 | "convert_tf_checkpoint_to_pytorch", 6 | "convert_openai_checkpoint", 7 | "convert_transfo_xl_checkpoint", 8 | "convert_gpt2_checkpoint", 9 | ]: 10 | print( 11 | "Should be used as one of: \n" 12 | ">> `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n" 13 | ">> `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n" 14 | ">> `pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n" 15 | ">> `pytorch_pretrained_bert convert_gpt2_checkpoint TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]`") 16 | else: 17 | if sys.argv[1] == "convert_tf_checkpoint_to_pytorch": 18 | try: 19 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 20 | except ImportError: 21 | print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " 22 | "In that case, it requires TensorFlow to be installed. Please see " 23 | "https://www.tensorflow.org/install/ for installation instructions.") 24 | raise 25 | 26 | if len(sys.argv) != 5: 27 | # pylint: disable=line-too-long 28 | print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") 29 | else: 30 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 31 | TF_CONFIG = sys.argv.pop() 32 | TF_CHECKPOINT = sys.argv.pop() 33 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 34 | elif sys.argv[1] == "convert_openai_checkpoint": 35 | from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch 36 | OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2] 37 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 38 | if len(sys.argv) == 5: 39 | OPENAI_GPT_CONFIG = sys.argv[4] 40 | else: 41 | OPENAI_GPT_CONFIG = "" 42 | convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH, 43 | OPENAI_GPT_CONFIG, 44 | PYTORCH_DUMP_OUTPUT) 45 | elif sys.argv[1] == "convert_transfo_xl_checkpoint": 46 | try: 47 | from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch 48 | except ImportError: 49 | print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " 50 | "In that case, it requires TensorFlow to be installed. 
Please see " 51 | "https://www.tensorflow.org/install/ for installation instructions.") 52 | raise 53 | 54 | if 'ckpt' in sys.argv[2].lower(): 55 | TF_CHECKPOINT = sys.argv[2] 56 | TF_DATASET_FILE = "" 57 | else: 58 | TF_DATASET_FILE = sys.argv[2] 59 | TF_CHECKPOINT = "" 60 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 61 | if len(sys.argv) == 5: 62 | TF_CONFIG = sys.argv[4] 63 | else: 64 | TF_CONFIG = "" 65 | convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE) 66 | else: 67 | try: 68 | from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch 69 | except ImportError: 70 | print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " 71 | "In that case, it requires TensorFlow to be installed. Please see " 72 | "https://www.tensorflow.org/install/ for installation instructions.") 73 | raise 74 | 75 | TF_CHECKPOINT = sys.argv[2] 76 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 77 | if len(sys.argv) == 5: 78 | TF_CONFIG = sys.argv[4] 79 | else: 80 | TF_CONFIG = "" 81 | convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 82 | if __name__ == '__main__': 83 | main() 84 | -------------------------------------------------------------------------------- /models/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_pretrained_bert.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME, 25 | GPT2Config, 26 | GPT2Model, 27 | load_tf_weights_in_gpt2) 28 | 29 | 30 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if gpt2_config_file == "": 33 | config = GPT2Config() 34 | else: 35 | config = GPT2Config(gpt2_config_file) 36 | model = GPT2Model(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_gpt2(model, gpt2_checkpoint_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | ## Required parameters 54 | parser.add_argument("--gpt2_checkpoint_path", 55 | default = None, 56 | type = str, 57 | required = True, 58 | help = "Path the TensorFlow checkpoint path.") 59 | parser.add_argument("--pytorch_dump_folder_path", 60 | default = None, 61 | type = str, 62 | required = True, 63 | help = "Path to the output PyTorch model.") 64 | parser.add_argument("--gpt2_config_file", 65 | default = "", 66 | type = str, 67 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.") 69 | args = parser.parse_args() 70 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, 71 | args.gpt2_config_file, 72 | args.pytorch_dump_folder_path) 73 | -------------------------------------------------------------------------------- /models/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME, 25 | OpenAIGPTConfig, 26 | OpenAIGPTModel, 27 | load_tf_weights_in_openai_gpt) 28 | 29 | 30 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if openai_config_file == "": 33 | config = OpenAIGPTConfig() 34 | else: 35 | config = OpenAIGPTConfig(openai_config_file) 36 | model = OpenAIGPTModel(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | ## Required parameters 54 | parser.add_argument("--openai_checkpoint_folder_path", 55 | default = None, 56 | type = str, 57 | required = True, 58 | help = "Path the TensorFlow checkpoint path.") 59 | parser.add_argument("--pytorch_dump_folder_path", 60 | default = None, 61 | type = str, 62 | required = True, 63 | help = "Path to the output PyTorch model.") 64 | parser.add_argument("--openai_config_file", 65 | default = "", 66 | type = str, 67 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.") 69 | args = parser.parse_args() 70 | convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, 71 | args.openai_config_file, 72 | args.pytorch_dump_folder_path) 73 | -------------------------------------------------------------------------------- /models/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import re 23 | import argparse 24 | import tensorflow as tf 25 | import torch 26 | import numpy as np 27 | 28 | from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert 29 | 30 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 31 | # Initialise PyTorch model 32 | config = BertConfig.from_json_file(bert_config_file) 33 | print("Building PyTorch model from configuration: {}".format(str(config))) 34 | model = BertForPreTraining(config) 35 | 36 | # Load weights from tf checkpoint 37 | load_tf_weights_in_bert(model, tf_checkpoint_path) 38 | 39 | # Save pytorch-model 40 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 41 | torch.save(model.state_dict(), pytorch_dump_path) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | ## Required parameters 47 | parser.add_argument("--tf_checkpoint_path", 48 | default = None, 49 | type = str, 50 | required = True, 51 | help = "Path the TensorFlow checkpoint path.") 52 | parser.add_argument("--bert_config_file", 53 | default = None, 54 | type = str, 55 | required = True, 56 | help = "The config json file corresponding to the pre-trained BERT model. \n" 57 | "This specifies the model architecture.") 58 | parser.add_argument("--pytorch_dump_path", 59 | default = None, 60 | type = str, 61 | required = True, 62 | help = "Path to the output PyTorch model.") 63 | args = parser.parse_args() 64 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 65 | args.bert_config_file, 66 | args.pytorch_dump_path) 67 | -------------------------------------------------------------------------------- /models/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert Transformer XL checkpoint and datasets.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import os 21 | import sys 22 | from io import open 23 | 24 | import torch 25 | 26 | import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils 27 | from pytorch_pretrained_bert.modeling_transfo_xl import (CONFIG_NAME, 28 | WEIGHTS_NAME, 29 | TransfoXLConfig, 30 | TransfoXLLMHeadModel, 31 | load_tf_weights_in_transfo_xl) 32 | from pytorch_pretrained_bert.tokenization_transfo_xl import (CORPUS_NAME, 33 | VOCAB_NAME) 34 | 35 | if sys.version_info[0] == 2: 36 | import cPickle as pickle 37 | else: 38 | import pickle 39 | 40 | # We do this to be able to load python 2 datasets pickles 41 | # See e.g. 
https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 42 | data_utils.Vocab = data_utils.TransfoXLTokenizer 43 | data_utils.Corpus = data_utils.TransfoXLCorpus 44 | sys.modules['data_utils'] = data_utils 45 | sys.modules['vocabulary'] = data_utils 46 | 47 | def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path, 48 | transfo_xl_config_file, 49 | pytorch_dump_folder_path, 50 | transfo_xl_dataset_file): 51 | if transfo_xl_dataset_file: 52 | # Convert a pre-processed corpus (see original TensorFlow repo) 53 | with open(transfo_xl_dataset_file, "rb") as fp: 54 | corpus = pickle.load(fp, encoding="latin1") 55 | # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) 56 | pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME 57 | print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) 58 | corpus_vocab_dict = corpus.vocab.__dict__ 59 | torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) 60 | 61 | corpus_dict_no_vocab = corpus.__dict__ 62 | corpus_dict_no_vocab.pop('vocab', None) 63 | pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME 64 | print("Save dataset to {}".format(pytorch_dataset_dump_path)) 65 | torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) 66 | 67 | if tf_checkpoint_path: 68 | # Convert a pre-trained TensorFlow model 69 | config_path = os.path.abspath(transfo_xl_config_file) 70 | tf_path = os.path.abspath(tf_checkpoint_path) 71 | 72 | print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) 73 | # Initialise PyTorch model 74 | if transfo_xl_config_file == "": 75 | config = TransfoXLConfig() 76 | else: 77 | config = TransfoXLConfig(transfo_xl_config_file) 78 | print("Building PyTorch model from configuration: {}".format(str(config))) 79 | model = TransfoXLLMHeadModel(config) 80 | 81 | model = load_tf_weights_in_transfo_xl(model, config, tf_path) 82 | # Save pytorch-model 83 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 84 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 85 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 86 | torch.save(model.state_dict(), pytorch_weights_dump_path) 87 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 88 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 89 | f.write(config.to_json_string()) 90 | 91 | 92 | if __name__ == "__main__": 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument("--pytorch_dump_folder_path", 95 | default = None, 96 | type = str, 97 | required = True, 98 | help = "Path to the folder to store the PyTorch model or dataset/vocab.") 99 | parser.add_argument("--tf_checkpoint_path", 100 | default = "", 101 | type = str, 102 | help = "An optional path to a TensorFlow checkpoint path to be converted.") 103 | parser.add_argument("--transfo_xl_config_file", 104 | default = "", 105 | type = str, 106 | help = "An optional config json file corresponding to the pre-trained BERT model. 
\n" 107 | "This specifies the model architecture.") 108 | parser.add_argument("--transfo_xl_dataset_file", 109 | default = "", 110 | type = str, 111 | help = "An optional dataset file to be converted in a vocabulary.") 112 | args = parser.parse_args() 113 | convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path, 114 | args.transfo_xl_config_file, 115 | args.pytorch_dump_folder_path, 116 | args.transfo_xl_dataset_file) 117 | -------------------------------------------------------------------------------- /models/pytorch_pretrained_bert/file_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with the local dataset cache. 3 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp 4 | Copyright by the AllenNLP authors. 5 | """ 6 | from __future__ import (absolute_import, division, print_function, unicode_literals) 7 | 8 | import sys 9 | import json 10 | import logging 11 | import os 12 | import shutil 13 | import tempfile 14 | import fnmatch 15 | from functools import wraps 16 | from hashlib import sha256 17 | import sys 18 | from io import open 19 | 20 | import boto3 21 | import requests 22 | from botocore.exceptions import ClientError 23 | from tqdm import tqdm 24 | 25 | try: 26 | from urllib.parse import urlparse 27 | except ImportError: 28 | from urlparse import urlparse 29 | 30 | try: 31 | from pathlib import Path 32 | PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 33 | Path.home() / '.pytorch_pretrained_bert')) 34 | except (AttributeError, ImportError): 35 | PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 36 | os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert')) 37 | 38 | CONFIG_NAME = "config.json" 39 | WEIGHTS_NAME = "pytorch_model.bin" 40 | 41 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 42 | 43 | 44 | def url_to_filename(url, etag=None): 45 | """ 46 | Convert `url` into a hashed filename in a repeatable way. 47 | If `etag` is specified, append its hash to the url's, delimited 48 | by a period. 49 | """ 50 | url_bytes = url.encode('utf-8') 51 | url_hash = sha256(url_bytes) 52 | filename = url_hash.hexdigest() 53 | 54 | if etag: 55 | etag_bytes = etag.encode('utf-8') 56 | etag_hash = sha256(etag_bytes) 57 | filename += '.' + etag_hash.hexdigest() 58 | 59 | return filename 60 | 61 | 62 | def filename_to_url(filename, cache_dir=None): 63 | """ 64 | Return the url and etag (which may be ``None``) stored for `filename`. 65 | Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 66 | """ 67 | if cache_dir is None: 68 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 69 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 70 | cache_dir = str(cache_dir) 71 | 72 | cache_path = os.path.join(cache_dir, filename) 73 | if not os.path.exists(cache_path): 74 | raise EnvironmentError("file {} not found".format(cache_path)) 75 | 76 | meta_path = cache_path + '.json' 77 | if not os.path.exists(meta_path): 78 | raise EnvironmentError("file {} not found".format(meta_path)) 79 | 80 | with open(meta_path, encoding="utf-8") as meta_file: 81 | metadata = json.load(meta_file) 82 | url = metadata['url'] 83 | etag = metadata['etag'] 84 | 85 | return url, etag 86 | 87 | 88 | def cached_path(url_or_filename, cache_dir=None): 89 | """ 90 | Given something that might be a URL (or might be a local path), 91 | determine which. 
If it's a URL, download the file and cache it, and 92 | return the path to the cached file. If it's already a local path, 93 | make sure the file exists and then return the path. 94 | """ 95 | if cache_dir is None: 96 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 97 | if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): 98 | url_or_filename = str(url_or_filename) 99 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 100 | cache_dir = str(cache_dir) 101 | 102 | parsed = urlparse(url_or_filename) 103 | 104 | if parsed.scheme in ('http', 'https', 's3'): 105 | # URL, so get it from the cache (downloading if necessary) 106 | return get_from_cache(url_or_filename, cache_dir) 107 | elif os.path.exists(url_or_filename): 108 | # File, and it exists. 109 | return url_or_filename 110 | elif parsed.scheme == '': 111 | # File, but it doesn't exist. 112 | raise EnvironmentError("file {} not found".format(url_or_filename)) 113 | else: 114 | # Something unknown 115 | raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) 116 | 117 | 118 | def split_s3_path(url): 119 | """Split a full s3 path into the bucket name and path.""" 120 | parsed = urlparse(url) 121 | if not parsed.netloc or not parsed.path: 122 | raise ValueError("bad s3 path {}".format(url)) 123 | bucket_name = parsed.netloc 124 | s3_path = parsed.path 125 | # Remove '/' at beginning of path. 126 | if s3_path.startswith("/"): 127 | s3_path = s3_path[1:] 128 | return bucket_name, s3_path 129 | 130 | 131 | def s3_request(func): 132 | """ 133 | Wrapper function for s3 requests in order to create more helpful error 134 | messages. 135 | """ 136 | 137 | @wraps(func) 138 | def wrapper(url, *args, **kwargs): 139 | try: 140 | return func(url, *args, **kwargs) 141 | except ClientError as exc: 142 | if int(exc.response["Error"]["Code"]) == 404: 143 | raise EnvironmentError("file {} not found".format(url)) 144 | else: 145 | raise 146 | 147 | return wrapper 148 | 149 | 150 | @s3_request 151 | def s3_etag(url): 152 | """Check ETag on S3 object.""" 153 | s3_resource = boto3.resource("s3") 154 | bucket_name, s3_path = split_s3_path(url) 155 | s3_object = s3_resource.Object(bucket_name, s3_path) 156 | return s3_object.e_tag 157 | 158 | 159 | @s3_request 160 | def s3_get(url, temp_file): 161 | """Pull a file directly from S3.""" 162 | s3_resource = boto3.resource("s3") 163 | bucket_name, s3_path = split_s3_path(url) 164 | s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) 165 | 166 | 167 | def http_get(url, temp_file): 168 | req = requests.get(url, stream=True) 169 | content_length = req.headers.get('Content-Length') 170 | total = int(content_length) if content_length is not None else None 171 | progress = tqdm(unit="B", total=total) 172 | for chunk in req.iter_content(chunk_size=1024): 173 | if chunk: # filter out keep-alive new chunks 174 | progress.update(len(chunk)) 175 | temp_file.write(chunk) 176 | progress.close() 177 | 178 | 179 | def get_from_cache(url, cache_dir=None): 180 | """ 181 | Given a URL, look for the corresponding dataset in the local cache. 182 | If it's not there, download it. Then return the path to the cached file. 183 | """ 184 | if cache_dir is None: 185 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 186 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 187 | cache_dir = str(cache_dir) 188 | 189 | if not os.path.exists(cache_dir): 190 | os.makedirs(cache_dir) 191 | 192 | # Get eTag to add to filename, if it exists. 
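# Illustration of the cache-key scheme used below (hash values abbreviated,
# for illustration only): url_to_filename above hashes the URL with sha256
# and, when an ETag is available, appends '.' + sha256(etag), e.g.
#   url_to_filename("https://example.com/model.bin", etag='"abc123"')
#   -> "<sha256(url) hex>.<sha256(etag) hex>"
# so a file re-published under the same URL with a new ETag gets a fresh
# cache entry instead of silently reusing the stale download.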
193 | if url.startswith("s3://"): 194 | etag = s3_etag(url) 195 | else: 196 | try: 197 | response = requests.head(url, allow_redirects=True) 198 | if response.status_code != 200: 199 | etag = None 200 | else: 201 | etag = response.headers.get("ETag") 202 | except EnvironmentError: 203 | etag = None 204 | 205 | if sys.version_info[0] == 2 and etag is not None: 206 | etag = etag.decode('utf-8') 207 | filename = url_to_filename(url, etag) 208 | 209 | # get cache path to put the file 210 | cache_path = os.path.join(cache_dir, filename) 211 | 212 | # If we don't have a connection (etag is None) and can't identify the file 213 | # try to get the last downloaded one 214 | if not os.path.exists(cache_path) and etag is None: 215 | matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*') 216 | matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files)) 217 | if matching_files: 218 | cache_path = os.path.join(cache_dir, matching_files[-1]) 219 | 220 | if not os.path.exists(cache_path): 221 | # Download to temporary file, then copy to cache dir once finished. 222 | # Otherwise you get corrupt cache entries if the download gets interrupted. 223 | with tempfile.NamedTemporaryFile() as temp_file: 224 | logger.info("%s not found in cache, downloading to %s", url, temp_file.name) 225 | 226 | # GET file object 227 | if url.startswith("s3://"): 228 | s3_get(url, temp_file) 229 | else: 230 | http_get(url, temp_file) 231 | 232 | # we are copying the file before closing it, so flush to avoid truncation 233 | temp_file.flush() 234 | # shutil.copyfileobj() starts at the current position, so go to the start 235 | temp_file.seek(0) 236 | 237 | logger.info("copying %s to cache at %s", temp_file.name, cache_path) 238 | with open(cache_path, 'wb') as cache_file: 239 | shutil.copyfileobj(temp_file, cache_file) 240 | 241 | logger.info("creating metadata file for %s", cache_path) 242 | meta = {'url': url, 'etag': etag} 243 | meta_path = cache_path + '.json' 244 | with open(meta_path, 'w') as meta_file: 245 | output_string = json.dumps(meta) 246 | if sys.version_info[0] == 2 and isinstance(output_string, str): 247 | output_string = unicode(output_string, 'utf-8') # The beauty of python 2 248 | meta_file.write(output_string) 249 | 250 | logger.info("removing temp file %s", temp_file.name) 251 | 252 | return cache_path 253 | 254 | 255 | def read_set_from_file(filename): 256 | ''' 257 | Extract a de-duped collection (set) of text from a file. 258 | Expected file format is one item per line. 259 | ''' 260 | collection = set() 261 | with open(filename, 'r', encoding='utf-8') as file_: 262 | for line in file_: 263 | collection.add(line.rstrip()) 264 | return collection 265 | 266 | 267 | def get_file_extension(path, dot=True, lower=True): 268 | ext = os.path.splitext(path)[1] 269 | ext = ext if dot else ext[1:] 270 | return ext.lower() if lower else ext 271 | -------------------------------------------------------------------------------- /models/pytorch_pretrained_bert/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for BERT model.""" 16 | 17 | import math 18 | import torch 19 | from torch.optim import Optimizer 20 | from torch.optim.optimizer import required 21 | from torch.nn.utils import clip_grad_norm_ 22 | import logging 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | def warmup_cosine(x, warmup=0.002): 27 | if x < warmup: 28 | return x/warmup 29 | x_ = (x - warmup) / (1 - warmup) # progress after warmup - 30 | return 0.5 * (1. + math.cos(math.pi * x_)) 31 | 32 | def warmup_constant(x, warmup=0.002): 33 | """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps. 34 | Learning rate is 1. afterwards. """ 35 | if x < warmup: 36 | return x/warmup 37 | return 1.0 38 | 39 | def warmup_linear(x, warmup=0.002): 40 | """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step. 41 | After `t_total`-th training step, learning rate is zero. """ 42 | if x < warmup: 43 | return x/warmup 44 | return max((x-1.)/(warmup-1.), 0) 45 | 46 | SCHEDULES = { 47 | 'warmup_cosine': warmup_cosine, 48 | 'warmup_constant': warmup_constant, 49 | 'warmup_linear': warmup_linear, 50 | } 51 | 52 | 53 | class BertAdam(Optimizer): 54 | """Implements BERT version of Adam algorithm with weight decay fix. 55 | Params: 56 | lr: learning rate 57 | warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 58 | t_total: total number of training steps for the learning 59 | rate schedule, -1 means constant learning rate. Default: -1 60 | schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' 61 | b1: Adams b1. Default: 0.9 62 | b2: Adams b2. Default: 0.999 63 | e: Adams epsilon. Default: 1e-6 64 | weight_decay: Weight decay. Default: 0.01 65 | max_grad_norm: Maximum norm for the gradients (-1 means no clipping). 
Default: 1.0 66 | """ 67 | def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', 68 | b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, 69 | max_grad_norm=1.0): 70 | if lr is not required and lr < 0.0: 71 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 72 | if schedule not in SCHEDULES: 73 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 74 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 75 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 76 | if not 0.0 <= b1 < 1.0: 77 | raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) 78 | if not 0.0 <= b2 < 1.0: 79 | raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) 80 | if not e >= 0.0: 81 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) 82 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 83 | b1=b1, b2=b2, e=e, weight_decay=weight_decay, 84 | max_grad_norm=max_grad_norm) 85 | super(BertAdam, self).__init__(params, defaults) 86 | 87 | def get_lr(self): 88 | lr = [] 89 | for group in self.param_groups: 90 | for p in group['params']: 91 | state = self.state[p] 92 | if len(state) == 0: 93 | return [0] 94 | if group['t_total'] != -1: 95 | schedule_fct = SCHEDULES[group['schedule']] 96 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 97 | else: 98 | lr_scheduled = group['lr'] 99 | lr.append(lr_scheduled) 100 | return lr 101 | 102 | def step(self, closure=None): 103 | """Performs a single optimization step. 104 | 105 | Arguments: 106 | closure (callable, optional): A closure that reevaluates the model 107 | and returns the loss. 108 | """ 109 | loss = None 110 | if closure is not None: 111 | loss = closure() 112 | 113 | warned_for_t_total = False 114 | 115 | for group in self.param_groups: 116 | for p in group['params']: 117 | if p.grad is None: 118 | continue 119 | grad = p.grad.data 120 | if grad.is_sparse: 121 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 122 | 123 | state = self.state[p] 124 | 125 | # State initialization 126 | if len(state) == 0: 127 | state['step'] = 0 128 | # Exponential moving average of gradient values 129 | state['next_m'] = torch.zeros_like(p.data) 130 | # Exponential moving average of squared gradient values 131 | state['next_v'] = torch.zeros_like(p.data) 132 | 133 | next_m, next_v = state['next_m'], state['next_v'] 134 | beta1, beta2 = group['b1'], group['b2'] 135 | 136 | # Add grad clipping 137 | if group['max_grad_norm'] > 0: 138 | clip_grad_norm_(p, group['max_grad_norm']) 139 | 140 | # Decay the first and second moment running average coefficient 141 | # In-place operations to update the averages at the same time 142 | next_m.mul_(beta1).add_(1 - beta1, grad) 143 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 144 | update = next_m / (next_v.sqrt() + group['e']) 145 | 146 | # Just adding the square of the weights to the loss function is *not* 147 | # the correct way of using L2 regularization/weight decay with Adam, 148 | # since that will interact with the m and v parameters in strange ways. 149 | # 150 | # Instead we want to decay the weights in a manner that doesn't interact 151 | # with the m/v parameters. This is equivalent to adding the square 152 | # of the weights to the loss with plain (non-momentum) SGD. 
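# In update form, for a parameter p with moving averages next_m and next_v
# (this restates the code below, it does not change it):
#     update = next_m / (sqrt(next_v) + e) + weight_decay * p
#     p     -= lr_scheduled * update
# i.e. decoupled, AdamW-style weight decay applied directly to the weights
# rather than L2 regularization added to the loss.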
153 | if group['weight_decay'] > 0.0: 154 | update += group['weight_decay'] * p.data 155 | 156 | if group['t_total'] != -1: 157 | schedule_fct = SCHEDULES[group['schedule']] 158 | progress = state['step']/group['t_total'] 159 | lr_scheduled = group['lr'] * schedule_fct(progress, group['warmup']) 160 | # warning for exceeding t_total (only active with warmup_linear 161 | if group['schedule'] == "warmup_linear" and progress > 1. and not warned_for_t_total: 162 | logger.warning( 163 | "Training beyond specified 't_total' steps with schedule '{}'. Learning rate set to {}. " 164 | "Please set 't_total' of {} correctly.".format(group['schedule'], lr_scheduled, self.__class__.__name__)) 165 | warned_for_t_total = True 166 | # end warning 167 | else: 168 | lr_scheduled = group['lr'] 169 | 170 | update_with_lr = lr_scheduled * update 171 | p.data.add_(-update_with_lr) 172 | 173 | state['step'] += 1 174 | 175 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 176 | # No bias correction 177 | # bias_correction1 = 1 - beta1 ** state['step'] 178 | # bias_correction2 = 1 - beta2 ** state['step'] 179 | 180 | return loss 181 | -------------------------------------------------------------------------------- /models/pytorch_pretrained_bert/optimization_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for OpenAI GPT model.""" 16 | 17 | import math 18 | import torch 19 | from torch.optim import Optimizer 20 | from torch.optim.optimizer import required 21 | from torch.nn.utils import clip_grad_norm_ 22 | import logging 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | def warmup_cosine(x, warmup=0.002): 27 | if x < warmup: 28 | return x/warmup 29 | x_ = (x - warmup) / (1 - warmup) # progress after warmup 30 | return 0.5 * (1. + math.cos(math.pi * x_)) 31 | 32 | def warmup_constant(x, warmup=0.002): 33 | """ Linearly increases learning rate over `warmup`*`t_total` (as provided to OpenAIAdam) training steps. 34 | Learning rate is 1. afterwards. """ 35 | if x < warmup: 36 | return x/warmup 37 | return 1.0 38 | 39 | def warmup_linear(x, warmup=0.002): 40 | """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to OpenAIAdam) training step. 41 | After `t_total`-th training step, learning rate is zero. """ 42 | if x < warmup: 43 | return x/warmup 44 | return max((x-1.)/(warmup-1.), 0) 45 | 46 | SCHEDULES = { 47 | 'warmup_cosine':warmup_cosine, 48 | 'warmup_constant':warmup_constant, 49 | 'warmup_linear':warmup_linear, 50 | } 51 | 52 | 53 | class OpenAIAdam(Optimizer): 54 | """Implements Open AI version of Adam algorithm with weight decay fix. 
55 | """ 56 | def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1, 57 | b1=0.9, b2=0.999, e=1e-8, weight_decay=0, 58 | vector_l2=False, max_grad_norm=-1, **kwargs): 59 | if lr is not required and lr < 0.0: 60 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 61 | if schedule not in SCHEDULES: 62 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 63 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 64 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 65 | if not 0.0 <= b1 < 1.0: 66 | raise ValueError("Invalid b1 parameter: {}".format(b1)) 67 | if not 0.0 <= b2 < 1.0: 68 | raise ValueError("Invalid b2 parameter: {}".format(b2)) 69 | if not e >= 0.0: 70 | raise ValueError("Invalid epsilon value: {}".format(e)) 71 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 72 | b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2, 73 | max_grad_norm=max_grad_norm) 74 | super(OpenAIAdam, self).__init__(params, defaults) 75 | 76 | def get_lr(self): 77 | lr = [] 78 | for group in self.param_groups: 79 | for p in group['params']: 80 | state = self.state[p] 81 | if len(state) == 0: 82 | return [0] 83 | if group['t_total'] != -1: 84 | schedule_fct = SCHEDULES[group['schedule']] 85 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 86 | else: 87 | lr_scheduled = group['lr'] 88 | lr.append(lr_scheduled) 89 | return lr 90 | 91 | def step(self, closure=None): 92 | """Performs a single optimization step. 93 | 94 | Arguments: 95 | closure (callable, optional): A closure that reevaluates the model 96 | and returns the loss. 97 | """ 98 | loss = None 99 | if closure is not None: 100 | loss = closure() 101 | 102 | warned_for_t_total = False 103 | 104 | for group in self.param_groups: 105 | for p in group['params']: 106 | if p.grad is None: 107 | continue 108 | grad = p.grad.data 109 | if grad.is_sparse: 110 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 111 | 112 | state = self.state[p] 113 | 114 | # State initialization 115 | if len(state) == 0: 116 | state['step'] = 0 117 | # Exponential moving average of gradient values 118 | state['exp_avg'] = torch.zeros_like(p.data) 119 | # Exponential moving average of squared gradient values 120 | state['exp_avg_sq'] = torch.zeros_like(p.data) 121 | 122 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 123 | beta1, beta2 = group['b1'], group['b2'] 124 | 125 | state['step'] += 1 126 | 127 | # Add grad clipping 128 | if group['max_grad_norm'] > 0: 129 | clip_grad_norm_(p, group['max_grad_norm']) 130 | 131 | # Decay the first and second moment running average coefficient 132 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 133 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 134 | denom = exp_avg_sq.sqrt().add_(group['e']) 135 | 136 | bias_correction1 = 1 - beta1 ** state['step'] 137 | bias_correction2 = 1 - beta2 ** state['step'] 138 | 139 | if group['t_total'] != -1: 140 | schedule_fct = SCHEDULES[group['schedule']] 141 | progress = state['step']/group['t_total'] 142 | lr_scheduled = group['lr'] * schedule_fct(progress, group['warmup']) 143 | # warning for exceeding t_total (only active with warmup_linear 144 | if group['schedule'] == "warmup_linear" and progress > 1. and not warned_for_t_total: 145 | logger.warning( 146 | "Training beyond specified 't_total' steps with schedule '{}'. Learning rate set to {}. 
" 147 | "Please set 't_total' of {} correctly.".format(group['schedule'], lr_scheduled, self.__class__.__name__)) 148 | warned_for_t_total = True 149 | # end warning 150 | else: 151 | lr_scheduled = group['lr'] 152 | 153 | step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 154 | 155 | p.data.addcdiv_(-step_size, exp_avg, denom) 156 | 157 | # Add weight decay at the end (fixed version) 158 | if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0: 159 | p.data.add_(-lr_scheduled * group['weight_decay'], p.data) 160 | 161 | return loss 162 | -------------------------------------------------------------------------------- /pytorch-logo-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andreamad8/PPCM/e5bef1bbb70907a3d65de3225a00e4af9104d4a8/pytorch-logo-dark.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers~=4.18.0 2 | matplotlib~=3.3.4 3 | boto3~=1.21.40 4 | IPython~=7.16.3 5 | Tabulate~=0.8.9 6 | sklearn~=0.0 7 | colorama~=0.4.4 8 | jsonlines~=3.0.0 9 | nltk~=3.6.7 10 | numpy~=1.19.5 11 | tqdm~=4.64.0 12 | setuptools~=59.6.0 13 | scikit-learn~=0.24.2 14 | requests~=2.27.1 15 | botocore~=1.24.40 16 | regex~=2022.3.15 17 | --extra-index-url https://download.pytorch.org/whl/cu113 18 | torch~=1.10.1 19 | torchvision 20 | torchaudio -------------------------------------------------------------------------------- /utils/utils_sample.py: -------------------------------------------------------------------------------- 1 | from utils.helper import cut_seq_to_eos 2 | from utils.helper import dist_score, truncate, pad_sequences 3 | from metric.bleu import moses_multi_bleu 4 | from collections import Counter 5 | import torch 6 | import numpy as np 7 | 8 | 9 | def _prec_recall_f1_score(pred_items, gold_items): 10 | """ 11 | Compute precision, recall and f1 given a set of gold and prediction items. 
12 | :param pred_items: iterable of predicted values
13 | :param gold_items: iterable of gold values
14 | :return: the F1 score (a single float); 0 when there is no overlap
15 | """
16 | common = Counter(gold_items) & Counter(pred_items)
17 | num_same = sum(common.values())
18 | if num_same == 0:
19 | return 0
20 | precision = 1.0 * num_same / len(pred_items)
21 | recall = 1.0 * num_same / len(gold_items)
22 | f1 = (2 * precision * recall) / (precision + recall)
23 | return f1
24 | 
25 | def scorer(args,turn,classifier,enc,class2idx,knowledge,plot=False,gold=None):
26 | hypothesis = []
27 | plots_array = []
28 | if(plot):
29 | loss = np.transpose(np.array(turn['loss']), (2, 0, 1)) # batch * sequence_len * iteration
30 | for i,t in enumerate(turn['text']):
31 | ind_eos = len(cut_seq_to_eos(t))-1
32 | 
33 | text = enc.decode(cut_seq_to_eos(t))
34 | dist = dist_score(text,enc)
35 | bleu = None
36 | f1 = None
37 | if(gold):
38 | bleu = truncate(moses_multi_bleu(np.array([text]),np.array([gold]))*100,2)
39 | f1 = truncate(_prec_recall_f1_score(enc.encode(text), enc.encode(gold))*100,2)
40 | hypothesis.append([i,1-dist,text, f"{bleu}/{f1}"])
41 | ## plotting
42 | if(plot): plots_array.append(loss[i][:ind_eos,-1])
43 | 
44 | x = [h[2] for h in hypothesis]
45 | if(knowledge):
46 | sent_p = [knowledge for i in range(args.num_samples)]
47 | x = (sent_p,x)
48 | 
49 | for j, (loss,correct,prediction) in enumerate(zip(*predict(args,classifier,x,class2idx))):
50 | hypothesis[j] = [hypothesis[j][0],loss,hypothesis[j][1],correct,prediction,hypothesis[j][3],hypothesis[j][2]]
51 | 
52 | hypothesis = sorted(hypothesis, key = lambda x: x[1]) ## sort by classifier loss
53 | acc = hypothesis[0][3] ## whether the lowest-loss sample is correctly classified
54 | hypothesis = [[h[0],truncate(h[1],4),truncate(h[2],4),h[4],h[5],h[6]] for h in hypothesis]
55 | return hypothesis, acc, plots_array
56 | 
57 | def predict(args, classifier, X, class2idx):
58 | if(type(X) is tuple):
59 | input_p = pad_sequences([torch.tensor(classifier.tokenizer.encode(s)) for s in X[0]])
60 | input_h = pad_sequences([torch.tensor(classifier.tokenizer.encode(s)) for s in X[1]])
61 | X = [input_p,input_h]
62 | else:
63 | X = pad_sequences([torch.tensor(classifier.tokenizer.encode(s)) for s in X])
64 | 
65 | output_t = classifier(X)
66 | 
67 | target_t = torch.tensor([args.label_class], device='cuda', dtype=torch.long).repeat(args.num_samples)
68 | ce_loss_logging = torch.nn.CrossEntropyLoss(reduction='none')
69 | loss = ce_loss_logging(output_t, target_t).detach().tolist()
70 | pred_t = output_t.argmax(dim=1, keepdim=True)
71 | correct = pred_t.eq(target_t.view_as(pred_t)).detach().tolist()
72 | return loss, sum(correct, []), [class2idx[int(pred[0])] for pred in pred_t.detach().tolist()]
73 | 
--------------------------------------------------------------------------------
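A quick worked example of the token-overlap F1 implemented in
utils/utils_sample.py above (toy token-id lists rather than real BPE output;
values rounded):

    from collections import Counter

    pred_items = [1, 2, 3, 4]   # hypothetical predicted token ids
    gold_items = [1, 2, 5]      # hypothetical gold token ids

    common = Counter(gold_items) & Counter(pred_items)    # Counter({1: 1, 2: 1})
    num_same = sum(common.values())                       # 2
    precision = 1.0 * num_same / len(pred_items)          # 0.5
    recall = 1.0 * num_same / len(gold_items)             # ~0.667
    f1 = (2 * precision * recall) / (precision + recall)  # ~0.571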