├── Imgs ├── Prompt4NR.png └── templates_table.png ├── Discrete-Action ├── run.sh ├── model.py ├── utils.py ├── prepro_data.py └── predict.py ├── Discrete-Emotion ├── run.sh ├── model.py ├── utils.py ├── prepro_data.py └── predict.py ├── Discrete-Relevance ├── run.sh ├── model.py ├── utils.py ├── prepro_data.py └── predict.py ├── Hybrid-Relevance ├── run.sh ├── model.py ├── utils.py ├── prepro_data.py └── predict.py ├── Hybrid-Utility ├── run.sh ├── model.py ├── utils.py ├── prepro_data.py └── predict.py ├── Hybrid-Emotion ├── run.sh ├── model.py ├── utils.py ├── prepro_data.py └── predict.py ├── Continuous-Action ├── run.sh ├── model.py ├── utils.py ├── prepro_data.py └── predict.py ├── Continuous-Emotion ├── run.sh ├── model.py ├── utils.py └── prepro_data.py ├── Continuous-Relevance ├── run.sh ├── model.py ├── utils.py └── prepro_data.py ├── Hybrid-Action ├── model.py ├── utils.py ├── run.sh └── prepro_data.py ├── Continuous-Utility ├── model.py ├── utils.py ├── run.sh └── prepro_data.py ├── Discrete-Utility ├── model.py ├── utils.py ├── run.sh ├── prepro_data.py └── predict.py └── README.md /Imgs/Prompt4NR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/resistzzz/Prompt4NR/HEAD/Imgs/Prompt4NR.png -------------------------------------------------------------------------------- /Imgs/templates_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/resistzzz/Prompt4NR/HEAD/Imgs/templates_table.png -------------------------------------------------------------------------------- /Discrete-Action/run.sh: -------------------------------------------------------------------------------- 1 | python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 3 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 2 | python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 
--max_tokens 500 --model_file ./temp/BestModel.pt --log True -------------------------------------------------------------------------------- /Discrete-Emotion/run.sh: -------------------------------------------------------------------------------- 1 | python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 2 | python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True -------------------------------------------------------------------------------- /Discrete-Relevance/run.sh: -------------------------------------------------------------------------------- 1 | python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 2 | python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True -------------------------------------------------------------------------------- /Hybrid-Relevance/run.sh: -------------------------------------------------------------------------------- 1 | python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --num_conti1 3 --num_conti2 3 --wd 1e-3 --max_tokens 500 --log True --model_save True 2 | python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --num_conti1 3 --num_conti2 3 --max_tokens 500 --model_file ./temp/BestModel.pt --log True -------------------------------------------------------------------------------- /Hybrid-Utility/run.sh: -------------------------------------------------------------------------------- 1 | python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --num_conti1 3 --num_conti2 3 --wd 1e-3 --max_tokens 500 --log True --model_save True 2 | python predict.py 
--data_path ../DATA/MIND-Small --test_batch_size 100 --num_conti1 3 --num_conti2 3 --max_tokens 500 --model_file ./temp/BestModel.pt --log True -------------------------------------------------------------------------------- /Hybrid-Emotion/run.sh: -------------------------------------------------------------------------------- 1 | python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --num_conti1 3 --num_conti2 3 --wd 1e-3 --max_tokens 500 --log True --model_save True 2 | python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --num_conti1 3 --num_conti2 3 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 3 | -------------------------------------------------------------------------------- /Continuous-Action/run.sh: -------------------------------------------------------------------------------- 1 | python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --num_conti1 3 --num_conti2 3 --num_conti3 3 --wd 1e-3 --max_tokens 500 --log True --model_save True 2 | python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --num_conti1 3 --num_conti2 3 --num_conti3 3 --max_tokens 500 --model_file ./temp/BestModel.pt --log True -------------------------------------------------------------------------------- /Continuous-Emotion/run.sh: -------------------------------------------------------------------------------- 1 | python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --num_conti1 3 --num_conti2 3 --num_conti3 3 --wd 1e-3 --max_tokens 500 --log True --model_save True 2 | python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --num_conti1 3 --num_conti2 3 --num_conti3 3 --max_tokens 500 --model_file ./temp/BestModel.pt --log True -------------------------------------------------------------------------------- /Continuous-Relevance/run.sh: 
class BERTPrompt4NR(nn.Module):
    """Prompt-style classifier on top of BERT's masked-language-model head.

    The vocabulary logits at the last mask token of every sample are
    restricted to the answer-word ids and treated as class scores.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and remember the answer vocabulary.

        Args:
            model_name: Name or path given to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Vocabulary ids of the answer words; their order defines
                the class index that ``batch_labs`` refers to.
            args: Run configuration. ``args.vocab_size`` is required; an
                optional ``args.mask_token_id`` overrides the default mask id.
        """
        super().__init__()
        self.BERT = BertForMaskedLM.from_pretrained(model_name)
        # Presumably the tokenizer was extended with extra prompt tokens, hence
        # the resize -- confirm against the training script.
        self.BERT.resize_token_embeddings(args.vocab_size)

        # Every backbone weight is fine-tuned.
        for param in self.BERT.parameters():
            param.requires_grad = True

        self.answer_ids = answer_ids
        # 103 stays the default so existing configurations behave as before;
        # other vocabularies can supply their own id through ``args``.
        self.mask_token_id = getattr(args, "mask_token_id", 103)
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Score the answer words at the last mask token of each sample.

        Args:
            batch_enc: Token ids, shape ``(batch, seq_len)``.
            batch_attn: Attention mask passed through to BERT.
            batch_labs: Target class index per sample.

        Returns:
            ``(loss, probs)`` where ``probs`` is the softmax over the answer
            words, shape ``(batch, len(answer_ids))``.

        Raises:
            ValueError: If a sample contains no mask token.
        """
        outputs = self.BERT(input_ids=batch_enc,
                            attention_mask=batch_attn)
        out_logits = outputs.logits

        mask_position = batch_enc.eq(self.mask_token_id)
        # 1-based position wherever a mask token sits, 0 elsewhere; the row
        # maximum is therefore the last mask of that row. Selecting per row
        # (instead of reshaping the flattened mask logits) stays correct when
        # rows carry different numbers of mask tokens.
        positions = batch_enc.new_ones(batch_enc.shape).cumsum(dim=1)
        last_mask = (positions * mask_position).max(dim=1).values
        if bool((last_mask == 0).any()):
            raise ValueError(
                "every sample in batch_enc must contain at least one mask "
                "token (id %d)" % self.mask_token_id
            )
        gather_index = (last_mask - 1).view(-1, 1, 1).expand(-1, 1, out_logits.size(-1))
        mask_logits = out_logits.gather(1, gather_index).squeeze(1)

        answer_logits = mask_logits[:, self.answer_ids]

        loss = self.loss_func(answer_logits, batch_labs)

        return loss, answer_logits.softmax(dim=1)
BERTPrompt4NR(nn.Module): 7 | def __init__(self, model_name, answer_ids, args): 8 | super(BERTPrompt4NR, self).__init__() 9 | self.BERT = BertForMaskedLM.from_pretrained(model_name) 10 | self.BERT.resize_token_embeddings(args.vocab_size) 11 | 12 | for param in self.BERT.parameters(): 13 | param.requires_grad = True 14 | 15 | self.answer_ids = answer_ids 16 | self.mask_token_id = 103 17 | self.loss_func = nn.CrossEntropyLoss() 18 | 19 | def forward(self, batch_enc, batch_attn, batch_labs): 20 | outputs = self.BERT(input_ids=batch_enc, 21 | attention_mask=batch_attn) 22 | out_logits = outputs.logits 23 | 24 | mask_position = batch_enc.eq(self.mask_token_id) 25 | mask_logits = out_logits[mask_position, :].view(out_logits.size(0), -1, out_logits.size(-1))[:, -1, :] 26 | 27 | answer_logits = mask_logits[:, self.answer_ids] 28 | 29 | loss = self.loss_func(answer_logits, batch_labs) 30 | 31 | return loss, answer_logits.softmax(dim=1) 32 | -------------------------------------------------------------------------------- /Hybrid-Utility/model.py: -------------------------------------------------------------------------------- 1 | 2 | from transformers import BertForMaskedLM 3 | import torch.nn as nn 4 | 5 | 6 | class BERTPrompt4NR(nn.Module): 7 | def __init__(self, model_name, answer_ids, args): 8 | super(BERTPrompt4NR, self).__init__() 9 | self.BERT = BertForMaskedLM.from_pretrained(model_name) 10 | self.BERT.resize_token_embeddings(args.vocab_size) 11 | 12 | for param in self.BERT.parameters(): 13 | param.requires_grad = True 14 | 15 | self.answer_ids = answer_ids 16 | self.mask_token_id = 103 17 | self.loss_func = nn.CrossEntropyLoss() 18 | 19 | def forward(self, batch_enc, batch_attn, batch_labs): 20 | outputs = self.BERT(input_ids=batch_enc, 21 | attention_mask=batch_attn) 22 | out_logits = outputs.logits 23 | 24 | mask_position = batch_enc.eq(self.mask_token_id) 25 | mask_logits = out_logits[mask_position, :].view(out_logits.size(0), -1, out_logits.size(-1))[:, -1, :] 
class BERTPrompt4NR(nn.Module):
    """Prompt-style classifier on top of BERT's masked-language-model head.

    The vocabulary logits at the last mask token of every sample are
    restricted to the answer-word ids and treated as class scores.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and remember the answer vocabulary.

        Args:
            model_name: Name or path given to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Vocabulary ids of the answer words; their order defines
                the class index that ``batch_labs`` refers to.
            args: Run configuration. ``args.vocab_size`` is required; an
                optional ``args.mask_token_id`` overrides the default mask id.
        """
        super().__init__()
        self.BERT = BertForMaskedLM.from_pretrained(model_name)
        # Presumably the tokenizer was extended with extra prompt tokens, hence
        # the resize -- confirm against the training script.
        self.BERT.resize_token_embeddings(args.vocab_size)

        # Every backbone weight is fine-tuned.
        for param in self.BERT.parameters():
            param.requires_grad = True

        self.answer_ids = answer_ids
        # 103 stays the default so existing configurations behave as before;
        # other vocabularies can supply their own id through ``args``.
        self.mask_token_id = getattr(args, "mask_token_id", 103)
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Score the answer words at the last mask token of each sample.

        Args:
            batch_enc: Token ids, shape ``(batch, seq_len)``.
            batch_attn: Attention mask passed through to BERT.
            batch_labs: Target class index per sample.

        Returns:
            ``(loss, probs)`` where ``probs`` is the softmax over the answer
            words, shape ``(batch, len(answer_ids))``.

        Raises:
            ValueError: If a sample contains no mask token.
        """
        outputs = self.BERT(input_ids=batch_enc,
                            attention_mask=batch_attn)
        out_logits = outputs.logits

        mask_position = batch_enc.eq(self.mask_token_id)
        # 1-based position wherever a mask token sits, 0 elsewhere; the row
        # maximum is therefore the last mask of that row. Selecting per row
        # (instead of reshaping the flattened mask logits) stays correct when
        # rows carry different numbers of mask tokens.
        positions = batch_enc.new_ones(batch_enc.shape).cumsum(dim=1)
        last_mask = (positions * mask_position).max(dim=1).values
        if bool((last_mask == 0).any()):
            raise ValueError(
                "every sample in batch_enc must contain at least one mask "
                "token (id %d)" % self.mask_token_id
            )
        gather_index = (last_mask - 1).view(-1, 1, 1).expand(-1, 1, out_logits.size(-1))
        mask_logits = out_logits.gather(1, gather_index).squeeze(1)

        answer_logits = mask_logits[:, self.answer_ids]

        loss = self.loss_func(answer_logits, batch_labs)

        return loss, answer_logits.softmax(dim=1)
class BERTPrompt4NR(nn.Module):
    """Prompt-style classifier on top of BERT's masked-language-model head.

    The vocabulary logits at the last mask token of every sample are
    restricted to the answer-word ids and treated as class scores.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and remember the answer vocabulary.

        Args:
            model_name: Name or path given to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Vocabulary ids of the answer words; their order defines
                the class index that ``batch_labs`` refers to.
            args: Run configuration. ``args.vocab_size`` is required; an
                optional ``args.mask_token_id`` overrides the default mask id.
        """
        super().__init__()
        self.BERT = BertForMaskedLM.from_pretrained(model_name)
        # Presumably the tokenizer was extended with extra prompt tokens, hence
        # the resize -- confirm against the training script.
        self.BERT.resize_token_embeddings(args.vocab_size)

        # Every backbone weight is fine-tuned.
        for param in self.BERT.parameters():
            param.requires_grad = True

        self.answer_ids = answer_ids
        # 103 stays the default so existing configurations behave as before;
        # other vocabularies can supply their own id through ``args``.
        self.mask_token_id = getattr(args, "mask_token_id", 103)
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Score the answer words at the last mask token of each sample.

        Args:
            batch_enc: Token ids, shape ``(batch, seq_len)``.
            batch_attn: Attention mask passed through to BERT.
            batch_labs: Target class index per sample.

        Returns:
            ``(loss, probs)`` where ``probs`` is the softmax over the answer
            words, shape ``(batch, len(answer_ids))``.

        Raises:
            ValueError: If a sample contains no mask token.
        """
        outputs = self.BERT(input_ids=batch_enc,
                            attention_mask=batch_attn)
        out_logits = outputs.logits

        mask_position = batch_enc.eq(self.mask_token_id)
        # 1-based position wherever a mask token sits, 0 elsewhere; the row
        # maximum is therefore the last mask of that row. Selecting per row
        # (instead of reshaping the flattened mask logits) stays correct when
        # rows carry different numbers of mask tokens.
        positions = batch_enc.new_ones(batch_enc.shape).cumsum(dim=1)
        last_mask = (positions * mask_position).max(dim=1).values
        if bool((last_mask == 0).any()):
            raise ValueError(
                "every sample in batch_enc must contain at least one mask "
                "token (id %d)" % self.mask_token_id
            )
        gather_index = (last_mask - 1).view(-1, 1, 1).expand(-1, 1, out_logits.size(-1))
        mask_logits = out_logits.gather(1, gather_index).squeeze(1)

        answer_logits = mask_logits[:, self.answer_ids]

        loss = self.loss_func(answer_logits, batch_labs)

        return loss, answer_logits.softmax(dim=1)
class BERTPrompt4NR(nn.Module):
    """Prompt-style classifier on top of BERT's masked-language-model head.

    The vocabulary logits at the last mask token of every sample are
    restricted to the answer-word ids and treated as class scores.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and remember the answer vocabulary.

        Args:
            model_name: Name or path given to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Vocabulary ids of the answer words; their order defines
                the class index that ``batch_labs`` refers to.
            args: Run configuration. ``args.vocab_size`` is required; an
                optional ``args.mask_token_id`` overrides the default mask id.
        """
        super().__init__()
        self.BERT = BertForMaskedLM.from_pretrained(model_name)
        # Presumably the tokenizer was extended with extra prompt tokens, hence
        # the resize -- confirm against the training script.
        self.BERT.resize_token_embeddings(args.vocab_size)

        # Every backbone weight is fine-tuned.
        for param in self.BERT.parameters():
            param.requires_grad = True

        self.answer_ids = answer_ids
        # 103 stays the default so existing configurations behave as before;
        # other vocabularies can supply their own id through ``args``.
        self.mask_token_id = getattr(args, "mask_token_id", 103)
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Score the answer words at the last mask token of each sample.

        Args:
            batch_enc: Token ids, shape ``(batch, seq_len)``.
            batch_attn: Attention mask passed through to BERT.
            batch_labs: Target class index per sample.

        Returns:
            ``(loss, probs)`` where ``probs`` is the softmax over the answer
            words, shape ``(batch, len(answer_ids))``.

        Raises:
            ValueError: If a sample contains no mask token.
        """
        outputs = self.BERT(input_ids=batch_enc,
                            attention_mask=batch_attn)
        out_logits = outputs.logits

        mask_position = batch_enc.eq(self.mask_token_id)
        # 1-based position wherever a mask token sits, 0 elsewhere; the row
        # maximum is therefore the last mask of that row. Selecting per row
        # (instead of reshaping the flattened mask logits) stays correct when
        # rows carry different numbers of mask tokens.
        positions = batch_enc.new_ones(batch_enc.shape).cumsum(dim=1)
        last_mask = (positions * mask_position).max(dim=1).values
        if bool((last_mask == 0).any()):
            raise ValueError(
                "every sample in batch_enc must contain at least one mask "
                "token (id %d)" % self.mask_token_id
            )
        gather_index = (last_mask - 1).view(-1, 1, 1).expand(-1, 1, out_logits.size(-1))
        mask_logits = out_logits.gather(1, gather_index).squeeze(1)

        answer_logits = mask_logits[:, self.answer_ids]

        loss = self.loss_func(answer_logits, batch_labs)

        return loss, answer_logits.softmax(dim=1)
class BERTPrompt4NR(nn.Module):
    """Prompt-style classifier on top of BERT's masked-language-model head.

    The vocabulary logits at the last mask token of every sample are
    restricted to the answer-word ids and treated as class scores.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and remember the answer vocabulary.

        Args:
            model_name: Name or path given to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Vocabulary ids of the answer words; their order defines
                the class index that ``batch_labs`` refers to.
            args: Run configuration. ``args.vocab_size`` is required; an
                optional ``args.mask_token_id`` overrides the default mask id.
        """
        super().__init__()
        self.BERT = BertForMaskedLM.from_pretrained(model_name)
        # Presumably the tokenizer was extended with extra prompt tokens, hence
        # the resize -- confirm against the training script.
        self.BERT.resize_token_embeddings(args.vocab_size)

        # Every backbone weight is fine-tuned.
        for param in self.BERT.parameters():
            param.requires_grad = True

        self.answer_ids = answer_ids
        # 103 stays the default so existing configurations behave as before;
        # other vocabularies can supply their own id through ``args``.
        self.mask_token_id = getattr(args, "mask_token_id", 103)
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Score the answer words at the last mask token of each sample.

        Args:
            batch_enc: Token ids, shape ``(batch, seq_len)``.
            batch_attn: Attention mask passed through to BERT.
            batch_labs: Target class index per sample.

        Returns:
            ``(loss, probs)`` where ``probs`` is the softmax over the answer
            words, shape ``(batch, len(answer_ids))``.

        Raises:
            ValueError: If a sample contains no mask token.
        """
        outputs = self.BERT(input_ids=batch_enc,
                            attention_mask=batch_attn)
        out_logits = outputs.logits

        mask_position = batch_enc.eq(self.mask_token_id)
        # 1-based position wherever a mask token sits, 0 elsewhere; the row
        # maximum is therefore the last mask of that row. Selecting per row
        # (instead of reshaping the flattened mask logits) stays correct when
        # rows carry different numbers of mask tokens.
        positions = batch_enc.new_ones(batch_enc.shape).cumsum(dim=1)
        last_mask = (positions * mask_position).max(dim=1).values
        if bool((last_mask == 0).any()):
            raise ValueError(
                "every sample in batch_enc must contain at least one mask "
                "token (id %d)" % self.mask_token_id
            )
        gather_index = (last_mask - 1).view(-1, 1, 1).expand(-1, 1, out_logits.size(-1))
        mask_logits = out_logits.gather(1, gather_index).squeeze(1)

        answer_logits = mask_logits[:, self.answer_ids]

        loss = self.loss_func(answer_logits, batch_labs)

        return loss, answer_logits.softmax(dim=1)
self.BERT = BertForMaskedLM.from_pretrained(model_name) 10 | self.BERT.resize_token_embeddings(args.vocab_size) 11 | 12 | for param in self.BERT.parameters(): 13 | param.requires_grad = True 14 | 15 | self.answer_ids = answer_ids 16 | self.mask_token_id = 103 17 | self.loss_func = nn.CrossEntropyLoss() 18 | 19 | def forward(self, batch_enc, batch_attn, batch_labs): 20 | outputs = self.BERT(input_ids=batch_enc, 21 | attention_mask=batch_attn) 22 | out_logits = outputs.logits 23 | 24 | mask_position = batch_enc.eq(self.mask_token_id) 25 | mask_logits = out_logits[mask_position, :].view(out_logits.size(0), -1, out_logits.size(-1))[:, -1, :] 26 | 27 | answer_logits = mask_logits[:, self.answer_ids] 28 | 29 | loss = self.loss_func(answer_logits, batch_labs) 30 | 31 | return loss, answer_logits.softmax(dim=1) 32 | -------------------------------------------------------------------------------- /Hybrid-Relevance/model.py: -------------------------------------------------------------------------------- 1 | 2 | from transformers import BertForMaskedLM 3 | import torch.nn as nn 4 | 5 | 6 | class BERTPrompt4NR(nn.Module): 7 | def __init__(self, model_name, answer_ids, args): 8 | super(BERTPrompt4NR, self).__init__() 9 | self.BERT = BertForMaskedLM.from_pretrained(model_name) 10 | self.BERT.resize_token_embeddings(args.vocab_size) 11 | 12 | for param in self.BERT.parameters(): 13 | param.requires_grad = True 14 | 15 | self.answer_ids = answer_ids 16 | self.mask_token_id = 103 17 | self.loss_func = nn.CrossEntropyLoss() 18 | 19 | def forward(self, batch_enc, batch_attn, batch_labs): 20 | outputs = self.BERT(input_ids=batch_enc, 21 | attention_mask=batch_attn) 22 | out_logits = outputs.logits 23 | 24 | mask_position = batch_enc.eq(self.mask_token_id) 25 | mask_logits = out_logits[mask_position, :].view(out_logits.size(0), -1, out_logits.size(-1))[:, -1, :] 26 | 27 | answer_logits = mask_logits[:, self.answer_ids] 28 | 29 | loss = self.loss_func(answer_logits, batch_labs) 30 | 31 
class BERTPrompt4NR(nn.Module):
    """Prompt-style classifier on top of BERT's masked-language-model head.

    The vocabulary logits at the last mask token of every sample are
    restricted to the answer-word ids and treated as class scores.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and remember the answer vocabulary.

        Args:
            model_name: Name or path given to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Vocabulary ids of the answer words; their order defines
                the class index that ``batch_labs`` refers to.
            args: Run configuration. ``args.vocab_size`` is required; an
                optional ``args.mask_token_id`` overrides the default mask id.
        """
        super().__init__()
        self.BERT = BertForMaskedLM.from_pretrained(model_name)
        # Presumably the tokenizer was extended with extra prompt tokens, hence
        # the resize -- confirm against the training script.
        self.BERT.resize_token_embeddings(args.vocab_size)

        # Every backbone weight is fine-tuned.
        for param in self.BERT.parameters():
            param.requires_grad = True

        self.answer_ids = answer_ids
        # 103 stays the default so existing configurations behave as before;
        # other vocabularies can supply their own id through ``args``.
        self.mask_token_id = getattr(args, "mask_token_id", 103)
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Score the answer words at the last mask token of each sample.

        Args:
            batch_enc: Token ids, shape ``(batch, seq_len)``.
            batch_attn: Attention mask passed through to BERT.
            batch_labs: Target class index per sample.

        Returns:
            ``(loss, probs)`` where ``probs`` is the softmax over the answer
            words, shape ``(batch, len(answer_ids))``.

        Raises:
            ValueError: If a sample contains no mask token.
        """
        outputs = self.BERT(input_ids=batch_enc,
                            attention_mask=batch_attn)
        out_logits = outputs.logits

        mask_position = batch_enc.eq(self.mask_token_id)
        # 1-based position wherever a mask token sits, 0 elsewhere; the row
        # maximum is therefore the last mask of that row. Selecting per row
        # (instead of reshaping the flattened mask logits) stays correct when
        # rows carry different numbers of mask tokens.
        positions = batch_enc.new_ones(batch_enc.shape).cumsum(dim=1)
        last_mask = (positions * mask_position).max(dim=1).values
        if bool((last_mask == 0).any()):
            raise ValueError(
                "every sample in batch_enc must contain at least one mask "
                "token (id %d)" % self.mask_token_id
            )
        gather_index = (last_mask - 1).view(-1, 1, 1).expand(-1, 1, out_logits.size(-1))
        mask_logits = out_logits.gather(1, gather_index).squeeze(1)

        answer_logits = mask_logits[:, self.answer_ids]

        loss = self.loss_func(answer_logits, batch_labs)

        return loss, answer_logits.softmax(dim=1)
def evaluate(predicts, truths):
    """Average AUC, MRR, nDCG@5 and nDCG@10 over all impressions.

    Args:
        predicts: One sequence per impression. Its element-wise reciprocal is
            used as the ranking score, so smaller values rank higher
            (presumably 1-based predicted ranks -- confirm against the caller).
        truths: One label sequence per impression, aligned with ``predicts``.

    Returns:
        Tuple ``(auc, mrr, ndcg@5, ndcg@10)``, each the mean over impressions.
    """
    per_impression = []
    for ranks, labels in zip(predicts, truths):
        y_true = np.array(labels, dtype='float32')
        y_score = 1.0 / np.array(ranks, dtype='float32')
        # Tuple items are evaluated left to right: AUC, MRR, nDCG@5, nDCG@10.
        per_impression.append((
            roc_auc_score(y_true, y_score),
            mrr_score(y_true, y_score),
            ndcg_score(y_true, y_score, 5),
            ndcg_score(y_true, y_score, 10),
        ))
    return tuple(
        np.mean([row[column] for row in per_impression]) for column in range(4)
    )
def dcg_score(y_true, y_score, k=10):
    """Return the discounted cumulative gain of the ``k`` best-scored items.

    Items are ranked by descending ``y_score``. The item at 0-based rank ``i``
    contributes ``(2 ** rel - 1) / log2(i + 2)``, with ``rel`` taken from
    ``y_true``.
    """
    top_k = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, top_k)
    # Rank i (0-based) is discounted by log2(i + 2), so rank 0 divides by 1.
    log_arguments = np.arange(2, len(relevance) + 2)
    return ((2 ** relevance - 1) / np.log2(log_arguments)).sum()
def ndcg_score(y_true, y_score, k=10):
    """Return DCG@k of ``y_score`` normalised by the ideal DCG@k.

    The ideal ordering is obtained by ranking the items by ``y_true`` itself.
    When the ideal DCG is zero (no relevant item at all) the result is
    defined as ``0.0`` instead of the ``nan`` a 0/0 division would produce.
    """
    best = dcg_score(y_true, y_true, k)
    if best == 0:
        return 0.0
    actual = dcg_score(y_true, y_score, k)
    return actual / best
def mrr_score(y_true, y_score):
    """Return the mean reciprocal rank of the positive items.

    Items are ranked by descending ``y_score``; every item contributes
    ``label / rank`` (rank is 1-based) and the total is divided by the sum of
    the labels. When the labels sum to zero the result is defined as ``0.0``
    instead of the ``nan`` a 0/0 division would produce.
    """
    positives = np.sum(y_true)
    if positives == 0:
        return 0.0
    order = np.argsort(y_score)[::-1]
    ranked_labels = np.take(y_true, order)
    rr_score = ranked_labels / (np.arange(len(ranked_labels)) + 1)
    return np.sum(rr_score) / positives
def evaluate(predicts, truths):
    """Average AUC, MRR, nDCG@5 and nDCG@10 over all impressions.

    Args:
        predicts: One sequence per impression. Its element-wise reciprocal is
            used as the ranking score, so smaller values rank higher
            (presumably 1-based predicted ranks -- confirm against the caller).
        truths: One label sequence per impression, aligned with ``predicts``.

    Returns:
        Tuple ``(auc, mrr, ndcg@5, ndcg@10)``, each the mean over impressions.
    """
    per_impression = []
    for ranks, labels in zip(predicts, truths):
        y_true = np.array(labels, dtype='float32')
        y_score = 1.0 / np.array(ranks, dtype='float32')
        # Tuple items are evaluated left to right: AUC, MRR, nDCG@5, nDCG@10.
        per_impression.append((
            roc_auc_score(y_true, y_score),
            mrr_score(y_true, y_score),
            ndcg_score(y_true, y_score, 5),
            ndcg_score(y_true, y_score, 10),
        ))
    return tuple(
        np.mean([row[column] for row in per_impression]) for column in range(4)
    )
def dcg_score(y_true, y_score, k=10):
    """Return the discounted cumulative gain of the ``k`` best-scored items.

    Items are ranked by descending ``y_score``. The item at 0-based rank ``i``
    contributes ``(2 ** rel - 1) / log2(i + 2)``, with ``rel`` taken from
    ``y_true``.
    """
    top_k = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, top_k)
    # Rank i (0-based) is discounted by log2(i + 2), so rank 0 divides by 1.
    log_arguments = np.arange(2, len(relevance) + 2)
    return ((2 ** relevance - 1) / np.log2(log_arguments)).sum()
def mrr_score(y_true, y_score):
    """Compute the mean reciprocal rank of the relevant items.

    Items are ranked by descending ``y_score``; the reciprocal ranks of all
    items are weighted by their label, summed, and divided by the label sum.

    Args:
        y_true: Relevance labels, one per item.
        y_score: Predicted scores aligned with ``y_true``.

    Returns:
        The label-weighted mean reciprocal rank, or ``0.0`` when the labels
        sum to zero (no relevant item, or empty input).
    """
    order = np.argsort(y_score)[::-1]
    ranked_labels = np.take(y_true, order)
    label_total = np.sum(ranked_labels)
    # Guard the 0/0 case, which would otherwise produce NaN and a RuntimeWarning.
    if label_total == 0:
        return 0.0
    reciprocal_ranks = ranked_labels / (np.arange(len(ranked_labels)) + 1)
    return np.sum(reciprocal_ranks) / label_total
mrrs = [] 29 | ndcg5s = [] 30 | ndcg10s = [] 31 | for pre, tru in zip(predicts, truths): 32 | y_true = np.array(tru, dtype='float32') 33 | y_score = 1.0 / np.array(pre, dtype='float32') 34 | auc = roc_auc_score(y_true, y_score) 35 | mrr = mrr_score(y_true, y_score) 36 | ndcg5 = ndcg_score(y_true, y_score, 5) 37 | ndcg10 = ndcg_score(y_true, y_score, 10) 38 | 39 | aucs.append(auc) 40 | mrrs.append(mrr) 41 | ndcg5s.append(ndcg5) 42 | ndcg10s.append(ndcg10) 43 | return np.mean(aucs), np.mean(mrrs), np.mean(ndcg5s), np.mean(ndcg10s) 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /Hybrid-Relevance/utils.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import roc_auc_score 2 | import numpy as np 3 | 4 | 5 | def dcg_score(y_true, y_score, k=10): 6 | order = np.argsort(y_score)[::-1] 7 | y_true = np.take(y_true, order[:k]) 8 | gains = 2 ** y_true - 1 9 | discounts = np.log2(np.arange(len(y_true)) + 2) 10 | return np.sum(gains / discounts) 11 | 12 | 13 | def ndcg_score(y_true, y_score, k=10): 14 | best = dcg_score(y_true, y_true, k) 15 | actual = dcg_score(y_true, y_score, k) 16 | return actual / best 17 | 18 | 19 | def mrr_score(y_true, y_score): 20 | order = np.argsort(y_score)[::-1] 21 | y_true = np.take(y_true, order) 22 | rr_score = y_true / (np.arange(len(y_true)) + 1) 23 | return np.sum(rr_score) / np.sum(y_true) 24 | 25 | 26 | def evaluate(predicts, truths): 27 | aucs = [] 28 | mrrs = [] 29 | ndcg5s = [] 30 | ndcg10s = [] 31 | for pre, tru in zip(predicts, truths): 32 | y_true = np.array(tru, dtype='float32') 33 | y_score = 1.0 / np.array(pre, dtype='float32') 34 | auc = roc_auc_score(y_true, y_score) 35 | mrr = mrr_score(y_true, y_score) 36 | ndcg5 = ndcg_score(y_true, y_score, 5) 37 | ndcg10 = ndcg_score(y_true, y_score, 10) 38 | 39 | aucs.append(auc) 40 | mrrs.append(mrr) 41 | ndcg5s.append(ndcg5) 42 | ndcg10s.append(ndcg10) 43 | return 
def evaluate(predicts, truths):
    """Average AUC, MRR, nDCG@5 and nDCG@10 over paired prediction/label lists.

    Args:
        predicts: Iterable of per-group prediction sequences. Scores are taken
            as the element-wise reciprocal of these values
            (presumably ranks, so smaller means better -- confirm with caller).
        truths: Iterable of per-group label sequences aligned with ``predicts``.

    Returns:
        A 4-tuple ``(auc, mrr, ndcg5, ndcg10)`` holding the mean of each
        metric across all groups.
    """
    aucs, mrrs, ndcg5s, ndcg10s = [], [], [], []
    for pre, tru in zip(predicts, truths):
        labels = np.array(tru, dtype='float32')
        scores = 1.0 / np.array(pre, dtype='float32')

        aucs.append(roc_auc_score(labels, scores))
        mrrs.append(mrr_score(labels, scores))
        ndcg5s.append(ndcg_score(labels, scores, 5))
        ndcg10s.append(ndcg_score(labels, scores, 10))
    return tuple(np.mean(values) for values in (aucs, mrrs, ndcg5s, ndcg10s))
predict.py --data_path ../DATA/MIND-Small --ratio 0.5 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 3 | 4 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.5 --epochs 5 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 5 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.5 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 6 | 7 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.3 --epochs 5 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 8 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.3 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 9 | 10 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.2 --epochs 5 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 11 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.2 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 12 | 13 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.1 --epochs 5 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 14 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.1 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 15 | 16 | -------------------------------------------------------------------------------- /Continuous-Utility/run.sh: -------------------------------------------------------------------------------- 1 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small --epochs 3 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 2 | python predict.py 
--data_path ../DATA/MIND-Small --ratio 0.5 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 3 | 4 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.5 --epochs 3 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 5 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.5 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 6 | 7 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.3 --epochs 3 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 8 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.3 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 9 | 10 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.2 --epochs 3 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 11 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.2 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 12 | 13 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.1 --epochs 3 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 14 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.1 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 15 | 16 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.05 --epochs 5 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 17 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.05 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 18 | 19 | python main-multigpu.py --data_path ../DATA/MIND-Small 
--train_data_path ../DATA/MIND-Small-0.01 --epochs 5 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 20 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.01 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True -------------------------------------------------------------------------------- /Discrete-Utility/run.sh: -------------------------------------------------------------------------------- 1 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 2 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.5 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 3 | 4 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.5 --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 5 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.5 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 6 | 7 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.3 --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 8 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.3 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 9 | 10 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.2 --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 11 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.2 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 12 | 13 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path 
../DATA/MIND-Small-0.1 --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 14 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.1 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 15 | 16 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.05 --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 17 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.05 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 18 | 19 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.01 --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 20 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.01 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Prompt4NR: Prompt Learning for News Recommendation 2 | Source code for SIGIR 2023 paper: Prompt Learning for News Recommendation 3 | 4 | ### The Prompt4NR Framework 5 | 6 |

7 | 8 |

9 | 10 | ### Directory Structure: 11 | The 12 directories correspond to 12 prompt templates: three types (Discrete, Continuous, Hybrid) of templates from four perspectives (Relevance, Emotion, Action, Utility). 12 | - Discrete-Relevance, Discrete-Emotion, Discrete-Action, Discrete-Utility 13 | - Continuous-Relevance, Continuous-Emotion, Continuous-Action, Continuous-Utility 14 | - Hybrid-Relevance, Hybrid-Emotion, Hybrid-Action, Hybrid-Utility 15 | 16 | ### Details of the 12 templates are provided as follows: 17 | 18 | 19 | 20 | ### Dataset 21 | 22 | The experiments are based on the public dataset MIND; we use the small version, MIND-Small. 23 | 24 | For our paper, we have preprocessed the original dataset and stored it as binary files via "pickle". Even though I use ".txt" as the file extension, they are still binary files written by pickle, so you can load them directly with the pickle package. They include: 25 | 26 | - train.txt: training set 27 | - val.txt: validation set 28 | - test.txt: testing set 29 | - news.txt: containing information of all news 30 | 31 | I have shared our preprocessed dataset on Google Drive as follows: 32 | 33 | 34 | 35 | ### Model Checkpoints 36 | 37 | I have shared our trained model checkpoints on Google Drive as follows: 38 | 39 | 40 | 41 | ### How to Run the Code 42 | In each directory, there is a script called ``run.sh`` that runs the code for the corresponding template. 43 | Taking the “Discrete-Relevance” template as an example, the ``run.sh`` file is as follows: 44 | ``` 45 | python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True 46 | python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True 47 | ``` 48 | - The first line is used to train the model on the training set and evaluate it on the validation set at each epoch. 
class MyDataset(Dataset):
    """Prompt-formatted click samples for the Discrete-Action template.

    Every stored sample is a dict with the keys ``'sentence'`` (the filled
    prompt), ``'target'`` (1 for a clicked candidate, 0 otherwise) and
    ``'imp'`` (the impression id).  All samples are built eagerly in
    ``__init__`` from the pickled split selected by ``status``.
    """

    # Slot markers inside TEMPLATE.  They must be non-empty: ``str.replace``
    # with an empty pattern inserts the replacement between every character.
    USER_SLOT = "<user_sentence>"
    NEWS_SLOT = "<candidate_news>"
    # NOTE(review): the spelling "Dose" is kept as found -- the prompt text is
    # model input, so "fixing" it would presumably change what trained
    # checkpoints were fed.  Confirm before altering.
    TEMPLATE = ("User: <user_sentence> [SEP] News: <candidate_news> [SEP] "
                "Dose the user click the news? [MASK]")

    def __init__(self, args, tokenizer, news_dict, status='train'):
        """Select the split file for ``status`` and load it immediately.

        Args:
            args: Namespace providing ``data_path``, ``num_negs``, ``max_his``,
                ``max_his_len`` and ``max_tokens``.
            tokenizer: Tokenizer exposing ``encode``, ``decode`` and
                ``batch_encode_plus``.
            news_dict: Mapping ``news_id -> {'title': str, ...}``.
            status: ``'train'``, ``'val'``; anything else selects the test split.
        """
        self.tokenizer = tokenizer
        self.news_dict = news_dict
        self.args = args
        self.status = status

        self.data = []
        self.imp_lens = []
        if self.status == 'train':
            self.data_path = os.path.join(args.data_path, 'train.txt')
        elif self.status == 'val':
            self.data_path = os.path.join(args.data_path, 'val.txt')
        else:
            self.data_path = os.path.join(args.data_path, 'test.txt')
        self.load_data()

    def __len__(self):
        """Return the number of prompt samples."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the sample dict stored at index ``item``."""
        return self.data[item]

    def obtain_data(self, data):
        """Unpack the pickled split into (imp_ids, users, times, behaviors)."""
        return data[0], data[1], data[2], data[3]

    @staticmethod
    def _clean_title(title, max_words):
        """Strip non-alphanumeric characters and keep the first ``max_words`` words."""
        title = re.sub(r'[^A-Za-z0-9 ]+', '', title)
        return ' '.join(title.split(' ')[:max_words])

    def _history_sentence(self, history, news_dict, max_his, max_title_len, max_his_len):
        """Build the '[NSEP]'-joined, token-truncated click-history text."""
        # Keep the last ``max_his`` clicks and put the most recent one first.
        his_clicks = history[-max_his:][::-1]
        his_titles = [self._clean_title(news_dict[news]['title'], max_title_len)
                      for news in his_clicks]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles)
        # Truncate on token level, then decode back to text for the template.
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        return self.tokenizer.decode(his_sen_ids)

    def _append_samples(self, news_ids, news_dict, his_sen, target, impid, max_candi_len):
        """Append one prompt sample per candidate in ``news_ids``."""
        for news in news_ids:
            title = self._clean_title(news_dict[news]['title'], max_candi_len)
            # Fill the candidate slot first: cleaned titles cannot contain a
            # slot marker, so the history text is never re-scanned.
            sentence = (self.TEMPLATE
                        .replace(self.NEWS_SLOT, title)
                        .replace(self.USER_SLOT, his_sen))
            self.data.append({'sentence': sentence, 'target': target, 'imp': impid})

    def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                     max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build training samples: all positives plus ``K_samples`` sampled negatives.

        Args:
            imp_ids: Impression ids, aligned with ``behaviors``.
            behaviors: Per impression ``(history, positives, negatives)`` lists
                of news ids.
            news_dict: Mapping ``news_id -> {'title': str, ...}``.
            K_samples: Number of negatives drawn per impression; drawn with
                replacement when fewer negatives are available.
            max_his: Maximum number of history clicks used.
            max_title_len: Maximum words kept per history title.
            max_candi_len: Maximum words kept per candidate title.
            max_his_len: Maximum number of tokens kept for the history text.
        """
        for impid, behav in zip(imp_ids, behaviors):
            his_sen = self._history_sentence(behav[0], news_dict, max_his,
                                             max_title_len, max_his_len)
            positives = behav[1]
            negatives = behav[2]

            self._append_samples(positives, news_dict, his_sen, 1, impid, max_candi_len)

            if len(negatives) >= K_samples:
                sample_negs = random.sample(negatives, k=K_samples)
            elif len(negatives) > 0:
                sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
            else:
                # np.random.choice raises on an empty population; an impression
                # without negatives simply contributes its positives only.
                sample_negs = []

            self._append_samples(sample_negs, news_dict, his_sen, 0, impid, max_candi_len)

    def prepro_dev(self, imp_ids, behaviors, news_dict,
                   max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build evaluation samples from every positive and negative candidate.

        Impressions with an empty click history are skipped.  Parameters have
        the same meaning as in ``prepro_train``.
        """
        for impid, behav in zip(imp_ids, behaviors):
            if len(behav[0]) == 0:
                continue
            his_sen = self._history_sentence(behav[0], news_dict, max_his,
                                             max_title_len, max_his_len)
            self._append_samples(behav[1], news_dict, his_sen, 1, impid, max_candi_len)
            self._append_samples(behav[2], news_dict, his_sen, 0, impid, max_candi_len)

    def load_data(self):
        """Read the pickled split at ``self.data_path`` and build the samples."""
        # SECURITY: pickle.load can execute arbitrary code; only point
        # data_path at files you trust.
        with open(self.data_path, 'rb') as f:
            data = pickle.load(f)
        imps, users, times, behaviors = self.obtain_data(data)
        if self.status == 'train':
            self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                              max_his_len=self.args.max_his_len)
        else:
            self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                            max_his_len=self.args.max_his_len)

    def collate_fn(self, batch):
        """Tokenize a list of samples into padded tensors.

        Returns:
            ``(input_ids, attention_mask, target, imp_id)`` where ``target`` is
            a ``torch.LongTensor`` and ``imp_id`` a plain list.
        """
        sentences = [x['sentence'] for x in batch]
        target = [x['target'] for x in batch]
        imp_id = [x['imp'] for x in batch]

        # ``padding='max_length'`` already fixes the padding strategy; the
        # deprecated ``pad_to_max_length`` flag was redundant and is dropped.
        encode_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.args.max_tokens,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        batch_enc = encode_dict['input_ids']
        batch_attn = encode_dict['attention_mask']
        target = torch.LongTensor(target)

        return batch_enc, batch_attn, target, imp_id
max_candi_len=20, max_his_len=450): 43 | template = " is [MASK] to " 44 | for impid, behav in zip(imp_ids, behaviors): 45 | his_clicks = behav[0][-max_his:] 46 | his_clicks.reverse() 47 | his_titles = [] 48 | for news in his_clicks: 49 | title = news_dict[news]['title'] 50 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title) 51 | 52 | title = ' '.join(title.split(' ')[:max_title_len]) 53 | 54 | his_titles.append(title) 55 | his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles) 56 | # his_sen = ' '.join(his_sen.split(' ')[:max_his_len]) 57 | his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len] 58 | his_sen = self.tokenizer.decode(his_sen_ids) 59 | base_sentence = template.replace("", his_sen) 60 | 61 | positives = behav[1] 62 | negatives = behav[2] 63 | 64 | for news in positives: 65 | title = news_dict[news]['title'] 66 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title) 67 | 68 | title = ' '.join(title.split(' ')[:max_candi_len]) 69 | 70 | sentence = base_sentence.replace("", title) 71 | self.data.append({'sentence': sentence, 'target': 1, 'imp': impid}) 72 | 73 | if len(negatives) >= K_samples: 74 | sample_negs = random.sample(negatives, k=K_samples) 75 | else: 76 | sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist() 77 | 78 | for neg in sample_negs: 79 | neg_title = news_dict[neg]['title'] 80 | neg_title = re.sub(r'[^A-Za-z0-9 ]+', '', neg_title) 81 | 82 | neg_title = ' '.join(neg_title.split(' ')[:max_candi_len]) 83 | 84 | sentence = base_sentence.replace("", neg_title) 85 | self.data.append({'sentence': sentence, 'target': 0, 'imp': impid}) 86 | 87 | def prepro_dev(self, imp_ids, behaviors, news_dict, 88 | max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450): 89 | template = " is [MASK] to " 90 | for impid, behav in zip(imp_ids, behaviors): 91 | if len(behav[0]) == 0: 92 | continue 93 | his_clicks = behav[0][-max_his:] 94 | his_clicks.reverse() 95 | his_titles = [] 96 | for news in his_clicks: 97 | title = 
news_dict[news]['title'] 98 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title) 99 | 100 | title = ' '.join(title.split(' ')[:max_title_len]) 101 | 102 | his_titles.append(title) 103 | his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles) 104 | # his_sen = ' '.join(his_sen.split(' ')[:max_his_len]) 105 | his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len] 106 | his_sen = self.tokenizer.decode(his_sen_ids) 107 | base_sentence = template.replace("", his_sen) 108 | 109 | positives = behav[1] 110 | negatives = behav[2] 111 | for news in positives: 112 | title = news_dict[news]['title'] 113 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title) 114 | 115 | title = ' '.join(title.split(' ')[:max_candi_len]) 116 | 117 | sentence = base_sentence.replace("", title) 118 | self.data.append({'sentence': sentence, 'target': 1, 'imp': impid}) 119 | 120 | for neg in negatives: 121 | neg_title = news_dict[neg]['title'] 122 | neg_title = re.sub(r'[^A-Za-z0-9 ]+', '', neg_title) 123 | 124 | neg_title = ' '.join(neg_title.split(' ')[:max_candi_len]) 125 | 126 | sentence = base_sentence.replace("", neg_title) 127 | self.data.append({'sentence': sentence, 'target': 0, 'imp': impid}) 128 | 129 | def load_data(self): 130 | data = pickle.load(open(self.data_path, 'rb')) 131 | imps, users, times, behaviors = self.obtain_data(data) 132 | if self.status == 'train': 133 | self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his, 134 | max_his_len=self.args.max_his_len) 135 | else: 136 | self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his, 137 | max_his_len=self.args.max_his_len) 138 | 139 | def collate_fn(self, batch): 140 | sentences = [x['sentence'] for x in batch] 141 | target = [x['target'] for x in batch] 142 | imp_id = [x['imp'] for x in batch] 143 | 144 | encode_dict = self.tokenizer.batch_encode_plus( 145 | sentences, 146 | add_special_tokens=True, 147 | padding='max_length', 148 | max_length=self.args.max_tokens, 149 | 
truncation=True, 150 | pad_to_max_length=True, 151 | return_attention_mask=True, 152 | return_tensors='pt' 153 | ) 154 | 155 | batch_enc = encode_dict['input_ids'] 156 | batch_attn = encode_dict['attention_mask'] 157 | target = torch.LongTensor(target) 158 | 159 | return batch_enc, batch_attn, target, imp_id 160 | 161 | 162 | 163 | 164 | 165 | -------------------------------------------------------------------------------- /Discrete-Utility/prepro_data.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | import random 4 | import numpy as np 5 | from torch.utils.data import Dataset 6 | import pickle 7 | import os 8 | import torch 9 | 10 | 11 | class MyDataset(Dataset): 12 | def __init__(self, args, tokenizer, news_dict, status='train'): 13 | self.tokenizer = tokenizer 14 | self.news_dict = news_dict 15 | self.args = args 16 | self.status = status 17 | 18 | self.data = [] 19 | self.imp_lens = [] 20 | if self.status == 'train': 21 | self.data_path = os.path.join(args.train_data_path, 'train.txt') 22 | elif self.status == 'val': 23 | self.data_path = os.path.join(args.data_path, 'val.txt') 24 | else: 25 | self.data_path = os.path.join(args.data_path, 'test.txt') 26 | self.load_data() 27 | 28 | def __len__(self): 29 | return len(self.data) 30 | 31 | def __getitem__(self, item): 32 | return self.data[item] 33 | 34 | def obtain_data(self, data): 35 | # if self.status == 'train': 36 | # return data[0][:20], data[1][:20], data[2][:20], data[3][:20] 37 | # else: 38 | # return data[0], data[1], data[2], data[3] 39 | return data[0], data[1], data[2], data[3] 40 | 41 | def prepro_train(self, imp_ids, behaviors, news_dict, K_samples, 42 | max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450): 43 | template = "Recommending to the user is a [MASK] choice according to " 44 | for impid, behav in zip(imp_ids, behaviors): 45 | his_clicks = behav[0][-max_his:] 46 | his_clicks.reverse() 47 | his_titles = [] 48 | for news 
in his_clicks: 49 | title = news_dict[news]['title'] 50 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title) 51 | 52 | title = ' '.join(title.split(' ')[:max_title_len]) 53 | 54 | his_titles.append(title) 55 | his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles) 56 | # his_sen = ' '.join(his_sen.split(' ')[:max_his_len]) 57 | his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len] 58 | his_sen = self.tokenizer.decode(his_sen_ids) 59 | base_sentence = template.replace("", his_sen) 60 | 61 | positives = behav[1] 62 | negatives = behav[2] 63 | 64 | for news in positives: 65 | title = news_dict[news]['title'] 66 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title) 67 | 68 | title = ' '.join(title.split(' ')[:max_candi_len]) 69 | 70 | sentence = base_sentence.replace("", title) 71 | self.data.append({'sentence': sentence, 'target': 1, 'imp': impid}) 72 | 73 | if len(negatives) >= K_samples: 74 | sample_negs = random.sample(negatives, k=K_samples) 75 | else: 76 | sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist() 77 | 78 | for neg in sample_negs: 79 | neg_title = news_dict[neg]['title'] 80 | neg_title = re.sub(r'[^A-Za-z0-9 ]+', '', neg_title) 81 | 82 | neg_title = ' '.join(neg_title.split(' ')[:max_candi_len]) 83 | 84 | sentence = base_sentence.replace("", neg_title) 85 | self.data.append({'sentence': sentence, 'target': 0, 'imp': impid}) 86 | 87 | def prepro_dev(self, imp_ids, behaviors, news_dict, 88 | max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450): 89 | template = "Recommending to the user is a [MASK] choice according to " 90 | for impid, behav in zip(imp_ids, behaviors): 91 | if len(behav[0]) == 0: 92 | continue 93 | his_clicks = behav[0][-max_his:] 94 | his_clicks.reverse() 95 | his_titles = [] 96 | for news in his_clicks: 97 | title = news_dict[news]['title'] 98 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title) 99 | 100 | title = ' '.join(title.split(' ')[:max_title_len]) 101 | 102 | his_titles.append(title) 
103 | his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles) 104 | # his_sen = ' '.join(his_sen.split(' ')[:max_his_len]) 105 | his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len] 106 | his_sen = self.tokenizer.decode(his_sen_ids) 107 | base_sentence = template.replace("", his_sen) 108 | 109 | positives = behav[1] 110 | negatives = behav[2] 111 | for news in positives: 112 | title = news_dict[news]['title'] 113 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title) 114 | 115 | title = ' '.join(title.split(' ')[:max_candi_len]) 116 | 117 | sentence = base_sentence.replace("", title) 118 | self.data.append({'sentence': sentence, 'target': 1, 'imp': impid}) 119 | 120 | for neg in negatives: 121 | neg_title = news_dict[neg]['title'] 122 | neg_title = re.sub(r'[^A-Za-z0-9 ]+', '', neg_title) 123 | 124 | neg_title = ' '.join(neg_title.split(' ')[:max_candi_len]) 125 | 126 | sentence = base_sentence.replace("", neg_title) 127 | self.data.append({'sentence': sentence, 'target': 0, 'imp': impid}) 128 | 129 | def load_data(self): 130 | data = pickle.load(open(self.data_path, 'rb')) 131 | imps, users, times, behaviors = self.obtain_data(data) 132 | if self.status == 'train': 133 | self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his, 134 | max_his_len=self.args.max_his_len) 135 | else: 136 | self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his, 137 | max_his_len=self.args.max_his_len) 138 | 139 | def collate_fn(self, batch): 140 | sentences = [x['sentence'] for x in batch] 141 | target = [x['target'] for x in batch] 142 | imp_id = [x['imp'] for x in batch] 143 | 144 | encode_dict = self.tokenizer.batch_encode_plus( 145 | sentences, 146 | add_special_tokens=True, 147 | padding='max_length', 148 | max_length=self.args.max_tokens, 149 | truncation=True, 150 | pad_to_max_length=True, 151 | return_attention_mask=True, 152 | return_tensors='pt' 153 | ) 154 | 155 | batch_enc = encode_dict['input_ids'] 156 | 
class MyDataset(Dataset):
    """Prompt-based click-prediction samples for the Discrete-Emotion template.

    Every sample is a dict with keys ``'sentence'`` (the filled-in prompt),
    ``'target'`` (1 for a clicked candidate, 0 otherwise) and ``'imp'`` (the
    impression id the candidate came from).
    """

    # Slots filled in by the preprocessing methods. They must be non-empty:
    # ``str.replace('', x)`` would insert ``x`` between every character.
    USER_SLOT = "<user_sentence>"
    CANDIDATE_SLOT = "<candidate_news>"
    TEMPLATE = ("The user feels [MASK] about " + CANDIDATE_SLOT
                + " according to his area of interest " + USER_SLOT)
    # Anything that is not an ASCII letter, digit or space is removed from titles.
    _NON_ALNUM = re.compile(r'[^A-Za-z0-9 ]+')

    def __init__(self, args, tokenizer, news_dict, status='train'):
        """Store the configuration and eagerly load the requested split.

        Args:
            args: Namespace read for ``data_path``, ``max_his``, ``max_his_len``,
                ``max_tokens`` and (for training) ``num_negs``.
            tokenizer: Object providing ``encode``, ``decode`` and
                ``batch_encode_plus``.
            news_dict: Maps a news id to a dict holding at least ``'title'``.
            status: ``'train'``, ``'val'``; any other value selects the test file.
        """
        self.tokenizer = tokenizer
        self.news_dict = news_dict
        self.args = args
        self.status = status

        self.data = []
        self.imp_lens = []
        split = self.status if self.status in ('train', 'val') else 'test'
        self.data_path = os.path.join(args.data_path, split + '.txt')
        self.load_data()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return self.data[item]

    def obtain_data(self, data):
        """Return ``(imp_ids, users, times, behaviors)`` from the loaded object."""
        return data[0], data[1], data[2], data[3]

    @classmethod
    def _clean_title(cls, title, max_words):
        """Strip non-alphanumeric characters and keep the first ``max_words`` words."""
        return ' '.join(cls._NON_ALNUM.sub('', title).split(' ')[:max_words])

    def _base_sentence(self, template, history, news_dict, max_his, max_title_len, max_his_len):
        """Fill the user slot of ``template`` with the truncated click history."""
        # Keep the latest ``max_his`` clicks, most recent first.
        recent = history[-max_his:][::-1]
        his_titles = [self._clean_title(news_dict[news]['title'], max_title_len)
                      for news in recent]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles)
        # Truncate at token level, then go back to text for templating.
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        return template.replace(self.USER_SLOT, self.tokenizer.decode(his_sen_ids))

    def _add_samples(self, base_sentence, candidates, target, impid, news_dict, max_candi_len):
        """Append one sample per candidate news id with the given ``target``."""
        for news in candidates:
            title = self._clean_title(news_dict[news]['title'], max_candi_len)
            self.data.append({'sentence': base_sentence.replace(self.CANDIDATE_SLOT, title),
                              'target': target, 'imp': impid})

    def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                     max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build training samples: all positives plus ``K_samples`` negatives.

        Negatives are drawn without replacement when the impression has at
        least ``K_samples`` of them, otherwise with replacement.
        NOTE(review): ``np.random.choice`` raises ``ValueError`` when an
        impression has no negatives at all and ``K_samples > 0``.
        """
        for impid, behav in zip(imp_ids, behaviors):
            base_sentence = self._base_sentence(self.TEMPLATE, behav[0], news_dict,
                                                max_his, max_title_len, max_his_len)
            positives, negatives = behav[1], behav[2]
            self._add_samples(base_sentence, positives, 1, impid, news_dict, max_candi_len)

            if len(negatives) >= K_samples:
                sample_negs = random.sample(negatives, k=K_samples)
            else:
                sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
            self._add_samples(base_sentence, sample_negs, 0, impid, news_dict, max_candi_len)

    def prepro_dev(self, imp_ids, behaviors, news_dict,
                   max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build evaluation samples for every candidate; skip empty histories."""
        for impid, behav in zip(imp_ids, behaviors):
            if len(behav[0]) == 0:
                continue
            base_sentence = self._base_sentence(self.TEMPLATE, behav[0], news_dict,
                                                max_his, max_title_len, max_his_len)
            self._add_samples(base_sentence, behav[1], 1, impid, news_dict, max_candi_len)
            self._add_samples(base_sentence, behav[2], 0, impid, news_dict, max_candi_len)

    def load_data(self):
        """Load the pickled split at ``self.data_path`` and preprocess it."""
        # NOTE(review): pickle can execute arbitrary code; only load trusted files.
        with open(self.data_path, 'rb') as handle:
            data = pickle.load(handle)
        imps, _users, _times, behaviors = self.obtain_data(data)
        if self.status == 'train':
            self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs,
                              self.args.max_his, max_his_len=self.args.max_his_len)
        else:
            self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                            max_his_len=self.args.max_his_len)

    def collate_fn(self, batch):
        """Tokenize a list of samples into fixed-length tensors.

        Returns:
            ``(input_ids, attention_mask, target, imp_id)`` where ``target`` is a
            long tensor and ``imp_id`` the plain list of impression ids.
        """
        sentences = [sample['sentence'] for sample in batch]
        target = [sample['target'] for sample in batch]
        imp_id = [sample['imp'] for sample in batch]

        # ``padding='max_length'`` already pads to ``max_length``; the deprecated
        # ``pad_to_max_length`` flag is therefore not passed.
        encode_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.args.max_tokens,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        batch_enc = encode_dict['input_ids']
        batch_attn = encode_dict['attention_mask']
        target = torch.LongTensor(target)

        return batch_enc, batch_attn, target, imp_id
def prepro_dev(self, imp_ids, behaviors, news_dict,
               max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
    """Build one evaluation sample per candidate of every impression.

    The prompt is ``conti_tokens[2] + [MASK]``, then ``conti_tokens[1]`` plus
    the candidate title, then ``conti_tokens[0]`` plus the click history.
    Appends dicts ``{'sentence', 'target', 'imp'}`` to ``self.data``:
    positives (``behav[1]``) first with target 1, then every negative
    (``behav[2]``) with target 0. Impressions with an empty click history
    (``behav[0]``) are skipped.

    Args:
        imp_ids: Impression ids, parallel to ``behaviors``.
        behaviors: Per impression ``(history, positives, negatives)`` news ids.
        news_dict: Maps a news id to a dict holding at least ``'title'``.
        max_his: Number of most recent history clicks to keep.
        max_title_len: Max space-separated words kept per history title.
        max_candi_len: Max space-separated words kept per candidate title.
        max_his_len: Max tokenizer tokens kept for the whole history text.
    """
    # The slot markers must be non-empty: ``str.replace('', x)`` would insert
    # ``x`` between every character of the template.
    user_slot = "<user_sentence>"
    candidate_slot = "<candidate_news>"
    template1 = ''.join(self.conti_tokens[0]) + user_slot
    template2 = ''.join(self.conti_tokens[1]) + candidate_slot
    template3 = ''.join(self.conti_tokens[2]) + "[MASK]"
    template = template3 + template2 + template1
    non_alnum = re.compile(r'[^A-Za-z0-9 ]+')

    def clean(title, max_words):
        # Drop everything but ASCII letters/digits/spaces, then cap the words.
        return ' '.join(non_alnum.sub('', title).split(' ')[:max_words])

    for impid, behav in zip(imp_ids, behaviors):
        if len(behav[0]) == 0:
            continue
        # Most recent click first.
        recent = behav[0][-max_his:][::-1]
        his_titles = [clean(news_dict[news]['title'], max_title_len) for news in recent]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles)
        # Truncate at token level, then go back to text for templating.
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        his_sen = self.tokenizer.decode(his_sen_ids)
        base_sentence = template.replace(user_slot, his_sen)

        for target, candidates in ((1, behav[1]), (0, behav[2])):
            for news in candidates:
                title = clean(news_dict[news]['title'], max_candi_len)
                self.data.append({'sentence': base_sentence.replace(candidate_slot, title),
                                  'target': target, 'imp': impid})


def load_data(self):
    """Load the pickled split at ``self.data_path`` and preprocess it."""
    # NOTE(review): pickle can execute arbitrary code; only load trusted files.
    with open(self.data_path, 'rb') as handle:
        data = pickle.load(handle)
    imps, _users, _times, behaviors = self.obtain_data(data)
    if self.status == 'train':
        self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                          max_his_len=self.args.max_his_len)
    else:
        self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                        max_his_len=self.args.max_his_len)
class MyDataset(Dataset):
    """Prompt-based click-prediction samples for the Continuous-Relevance template.

    The prompt is built from three groups of continuous (virtual) tokens:
    ``conti_tokens[1]`` plus the candidate title, ``conti_tokens[2]`` plus
    ``[MASK]``, then ``conti_tokens[0]`` plus the click history.
    Every sample is a dict with keys ``'sentence'``, ``'target'`` (1 for a
    clicked candidate, 0 otherwise) and ``'imp'`` (impression id).
    """

    # Slots filled in by the preprocessing methods. They must be non-empty:
    # ``str.replace('', x)`` would insert ``x`` between every character.
    USER_SLOT = "<user_sentence>"
    CANDIDATE_SLOT = "<candidate_news>"
    # Anything that is not an ASCII letter, digit or space is removed from titles.
    _NON_ALNUM = re.compile(r'[^A-Za-z0-9 ]+')

    def __init__(self, args, tokenizer, news_dict, conti_tokens, status='train'):
        """Store the configuration and eagerly load the requested split.

        Args:
            args: Namespace read for ``data_path``, ``max_his``, ``max_his_len``,
                ``max_tokens`` and (for training) ``num_negs``.
            tokenizer: Object providing ``encode``, ``decode`` and
                ``batch_encode_plus``.
            news_dict: Maps a news id to a dict holding at least ``'title'``.
            conti_tokens: Three sequences of token strings (user, candidate, mask).
            status: ``'train'``, ``'val'``; any other value selects the test file.
        """
        self.tokenizer = tokenizer
        self.news_dict = news_dict
        self.args = args
        self.status = status
        self.conti_tokens = conti_tokens

        self.data = []
        self.imp_lens = []
        split = self.status if self.status in ('train', 'val') else 'test'
        self.data_path = os.path.join(args.data_path, split + '.txt')
        self.load_data()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return self.data[item]

    def obtain_data(self, data):
        """Return ``(imp_ids, users, times, behaviors)`` from the loaded object."""
        return data[0], data[1], data[2], data[3]

    def _template(self):
        """Assemble the prompt template: candidate part, mask part, user part."""
        user_part = ''.join(self.conti_tokens[0]) + self.USER_SLOT
        candidate_part = ''.join(self.conti_tokens[1]) + self.CANDIDATE_SLOT
        mask_part = ''.join(self.conti_tokens[2]) + "[MASK]"
        return candidate_part + mask_part + user_part

    @classmethod
    def _clean_title(cls, title, max_words):
        """Strip non-alphanumeric characters and keep the first ``max_words`` words."""
        return ' '.join(cls._NON_ALNUM.sub('', title).split(' ')[:max_words])

    def _base_sentence(self, template, history, news_dict, max_his, max_title_len, max_his_len):
        """Fill the user slot of ``template`` with the truncated click history."""
        # Keep the latest ``max_his`` clicks, most recent first.
        recent = history[-max_his:][::-1]
        his_titles = [self._clean_title(news_dict[news]['title'], max_title_len)
                      for news in recent]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles)
        # Truncate at token level, then go back to text for templating.
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        return template.replace(self.USER_SLOT, self.tokenizer.decode(his_sen_ids))

    def _add_samples(self, base_sentence, candidates, target, impid, news_dict, max_candi_len):
        """Append one sample per candidate news id with the given ``target``."""
        for news in candidates:
            title = self._clean_title(news_dict[news]['title'], max_candi_len)
            self.data.append({'sentence': base_sentence.replace(self.CANDIDATE_SLOT, title),
                              'target': target, 'imp': impid})

    def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                     max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build training samples: all positives plus ``K_samples`` negatives.

        Negatives are drawn without replacement when the impression has at
        least ``K_samples`` of them, otherwise with replacement.
        NOTE(review): ``np.random.choice`` raises ``ValueError`` when an
        impression has no negatives at all and ``K_samples > 0``.
        """
        template = self._template()
        for impid, behav in zip(imp_ids, behaviors):
            base_sentence = self._base_sentence(template, behav[0], news_dict,
                                                max_his, max_title_len, max_his_len)
            positives, negatives = behav[1], behav[2]
            self._add_samples(base_sentence, positives, 1, impid, news_dict, max_candi_len)

            if len(negatives) >= K_samples:
                sample_negs = random.sample(negatives, k=K_samples)
            else:
                sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
            self._add_samples(base_sentence, sample_negs, 0, impid, news_dict, max_candi_len)

    def prepro_dev(self, imp_ids, behaviors, news_dict,
                   max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build evaluation samples for every candidate; skip empty histories."""
        template = self._template()
        for impid, behav in zip(imp_ids, behaviors):
            if len(behav[0]) == 0:
                continue
            base_sentence = self._base_sentence(template, behav[0], news_dict,
                                                max_his, max_title_len, max_his_len)
            self._add_samples(base_sentence, behav[1], 1, impid, news_dict, max_candi_len)
            self._add_samples(base_sentence, behav[2], 0, impid, news_dict, max_candi_len)

    def load_data(self):
        """Load the pickled split at ``self.data_path`` and preprocess it."""
        # NOTE(review): pickle can execute arbitrary code; only load trusted files.
        with open(self.data_path, 'rb') as handle:
            data = pickle.load(handle)
        imps, _users, _times, behaviors = self.obtain_data(data)
        if self.status == 'train':
            self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs,
                              self.args.max_his, max_his_len=self.args.max_his_len)
        else:
            self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                            max_his_len=self.args.max_his_len)

    def collate_fn(self, batch):
        """Tokenize a list of samples into fixed-length tensors.

        Returns:
            ``(input_ids, attention_mask, target, imp_id)`` where ``target`` is a
            long tensor and ``imp_id`` the plain list of impression ids.
        """
        sentences = [sample['sentence'] for sample in batch]
        target = [sample['target'] for sample in batch]
        imp_id = [sample['imp'] for sample in batch]

        # ``padding='max_length'`` already pads to ``max_length``; the deprecated
        # ``pad_to_max_length`` flag is therefore not passed.
        encode_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.args.max_tokens,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        batch_enc = encode_dict['input_ids']
        batch_attn = encode_dict['attention_mask']
        target = torch.LongTensor(target)

        return batch_enc, batch_attn, target, imp_id
def prepro_dev(self, imp_ids, behaviors, news_dict,
               max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
    """Build one evaluation sample per candidate of every impression.

    The prompt is ``conti_tokens[1]`` plus the candidate title, then
    ``conti_tokens[2] + [MASK]``, then ``conti_tokens[0]`` plus the click
    history. Appends dicts ``{'sentence', 'target', 'imp'}`` to ``self.data``:
    positives (``behav[1]``) first with target 1, then every negative
    (``behav[2]``) with target 0. Impressions with an empty click history
    (``behav[0]``) are skipped.

    Args:
        imp_ids: Impression ids, parallel to ``behaviors``.
        behaviors: Per impression ``(history, positives, negatives)`` news ids.
        news_dict: Maps a news id to a dict holding at least ``'title'``.
        max_his: Number of most recent history clicks to keep.
        max_title_len: Max space-separated words kept per history title.
        max_candi_len: Max space-separated words kept per candidate title.
        max_his_len: Max tokenizer tokens kept for the whole history text.
    """
    # The slot markers must be non-empty: ``str.replace('', x)`` would insert
    # ``x`` between every character of the template.
    user_slot = "<user_sentence>"
    candidate_slot = "<candidate_news>"
    template1 = ''.join(self.conti_tokens[0]) + user_slot
    template2 = ''.join(self.conti_tokens[1]) + candidate_slot
    template3 = ''.join(self.conti_tokens[2]) + "[MASK]"
    template = template2 + template3 + template1
    non_alnum = re.compile(r'[^A-Za-z0-9 ]+')

    def clean(title, max_words):
        # Drop everything but ASCII letters/digits/spaces, then cap the words.
        return ' '.join(non_alnum.sub('', title).split(' ')[:max_words])

    for impid, behav in zip(imp_ids, behaviors):
        if len(behav[0]) == 0:
            continue
        # Most recent click first.
        recent = behav[0][-max_his:][::-1]
        his_titles = [clean(news_dict[news]['title'], max_title_len) for news in recent]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles)
        # Truncate at token level, then go back to text for templating.
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        his_sen = self.tokenizer.decode(his_sen_ids)
        base_sentence = template.replace(user_slot, his_sen)

        for target, candidates in ((1, behav[1]), (0, behav[2])):
            for news in candidates:
                title = clean(news_dict[news]['title'], max_candi_len)
                self.data.append({'sentence': base_sentence.replace(candidate_slot, title),
                                  'target': target, 'imp': impid})


def load_data(self):
    """Load the pickled split at ``self.data_path`` and preprocess it."""
    # NOTE(review): pickle can execute arbitrary code; only load trusted files.
    with open(self.data_path, 'rb') as handle:
        data = pickle.load(handle)
    imps, _users, _times, behaviors = self.obtain_data(data)
    if self.status == 'train':
        self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                          max_his_len=self.args.max_his_len)
    else:
        self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                        max_his_len=self.args.max_his_len)
class MyDataset(Dataset):
    """Prompt-based click-prediction samples for the Hybrid-Emotion template.

    The prompt is ``conti_tokens[0]`` plus the click history, ``[SEP]``,
    ``conti_tokens[1]`` plus the candidate title, ``[SEP]``, followed by the
    natural-language question containing ``[MASK]``.
    Every sample is a dict with keys ``'sentence'``, ``'target'`` (1 for a
    clicked candidate, 0 otherwise) and ``'imp'`` (impression id).
    """

    # Slots filled in by the preprocessing methods. They must be non-empty:
    # ``str.replace('', x)`` would insert ``x`` between every character.
    USER_SLOT = "<user_sentence>"
    CANDIDATE_SLOT = "<candidate_news>"
    QUESTION = "The user feels [MASK] about the news"
    # Anything that is not an ASCII letter, digit or space is removed from titles.
    _NON_ALNUM = re.compile(r'[^A-Za-z0-9 ]+')

    def __init__(self, args, tokenizer, news_dict, conti_tokens, status='train'):
        """Store the configuration and eagerly load the requested split.

        Args:
            args: Namespace read for ``data_path``, ``max_his``, ``max_his_len``,
                ``max_tokens`` and (for training) ``num_negs``.
            tokenizer: Object providing ``encode``, ``decode`` and
                ``batch_encode_plus``.
            news_dict: Maps a news id to a dict holding at least ``'title'``.
            conti_tokens: Sequences of token strings; index 0 prefixes the user
                history and index 1 the candidate title.
            status: ``'train'``, ``'val'``; any other value selects the test file.
        """
        self.tokenizer = tokenizer
        self.news_dict = news_dict
        self.args = args
        self.status = status
        self.conti_tokens = conti_tokens

        self.data = []
        self.imp_lens = []
        split = self.status if self.status in ('train', 'val') else 'test'
        self.data_path = os.path.join(args.data_path, split + '.txt')
        self.load_data()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return self.data[item]

    def obtain_data(self, data):
        """Return ``(imp_ids, users, times, behaviors)`` from the loaded object."""
        return data[0], data[1], data[2], data[3]

    def _template(self):
        """Assemble the prompt template: user part, candidate part, question."""
        user_part = ''.join(self.conti_tokens[0]) + self.USER_SLOT
        candidate_part = ''.join(self.conti_tokens[1]) + self.CANDIDATE_SLOT
        return user_part + "[SEP]" + candidate_part + "[SEP]" + self.QUESTION

    @classmethod
    def _clean_title(cls, title, max_words):
        """Strip non-alphanumeric characters and keep the first ``max_words`` words."""
        return ' '.join(cls._NON_ALNUM.sub('', title).split(' ')[:max_words])

    def _base_sentence(self, template, history, news_dict, max_his, max_title_len, max_his_len):
        """Fill the user slot of ``template`` with the truncated click history."""
        # Keep the latest ``max_his`` clicks, most recent first.
        recent = history[-max_his:][::-1]
        his_titles = [self._clean_title(news_dict[news]['title'], max_title_len)
                      for news in recent]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles)
        # Truncate at token level, then go back to text for templating.
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        return template.replace(self.USER_SLOT, self.tokenizer.decode(his_sen_ids))

    def _add_samples(self, base_sentence, candidates, target, impid, news_dict, max_candi_len):
        """Append one sample per candidate news id with the given ``target``."""
        for news in candidates:
            title = self._clean_title(news_dict[news]['title'], max_candi_len)
            self.data.append({'sentence': base_sentence.replace(self.CANDIDATE_SLOT, title),
                              'target': target, 'imp': impid})

    def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                     max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build training samples: all positives plus ``K_samples`` negatives.

        Negatives are drawn without replacement when the impression has at
        least ``K_samples`` of them, otherwise with replacement.
        NOTE(review): ``np.random.choice`` raises ``ValueError`` when an
        impression has no negatives at all and ``K_samples > 0``.
        """
        template = self._template()
        for impid, behav in zip(imp_ids, behaviors):
            base_sentence = self._base_sentence(template, behav[0], news_dict,
                                                max_his, max_title_len, max_his_len)
            positives, negatives = behav[1], behav[2]
            self._add_samples(base_sentence, positives, 1, impid, news_dict, max_candi_len)

            if len(negatives) >= K_samples:
                sample_negs = random.sample(negatives, k=K_samples)
            else:
                sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
            self._add_samples(base_sentence, sample_negs, 0, impid, news_dict, max_candi_len)

    def prepro_dev(self, imp_ids, behaviors, news_dict,
                   max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build evaluation samples for every candidate; skip empty histories."""
        template = self._template()
        for impid, behav in zip(imp_ids, behaviors):
            if len(behav[0]) == 0:
                continue
            base_sentence = self._base_sentence(template, behav[0], news_dict,
                                                max_his, max_title_len, max_his_len)
            self._add_samples(base_sentence, behav[1], 1, impid, news_dict, max_candi_len)
            self._add_samples(base_sentence, behav[2], 0, impid, news_dict, max_candi_len)

    def load_data(self):
        """Load the pickled split at ``self.data_path`` and preprocess it."""
        # NOTE(review): pickle can execute arbitrary code; only load trusted files.
        with open(self.data_path, 'rb') as handle:
            data = pickle.load(handle)
        imps, _users, _times, behaviors = self.obtain_data(data)
        if self.status == 'train':
            self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs,
                              self.args.max_his, max_his_len=self.args.max_his_len)
        else:
            self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                            max_his_len=self.args.max_his_len)

    def collate_fn(self, batch):
        """Tokenize a list of samples into fixed-length tensors.

        Returns:
            ``(input_ids, attention_mask, target, imp_id)`` where ``target`` is a
            long tensor and ``imp_id`` the plain list of impression ids.
        """
        sentences = [sample['sentence'] for sample in batch]
        target = [sample['target'] for sample in batch]
        imp_id = [sample['imp'] for sample in batch]

        # ``padding='max_length'`` already pads to ``max_length``; the deprecated
        # ``pad_to_max_length`` flag is therefore not passed.
        encode_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.args.max_tokens,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        batch_enc = encode_dict['input_ids']
        batch_attn = encode_dict['attention_mask']
        target = torch.LongTensor(target)

        return batch_enc, batch_attn, target, imp_id
template3 48 | 49 | for impid, behav in zip(imp_ids, behaviors): 50 | his_clicks = behav[0][-max_his:] 51 | his_clicks.reverse() 52 | his_titles = [] 53 | for news in his_clicks: 54 | title = news_dict[news]['title'] 55 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title) 56 | 57 | title = ' '.join(title.split(' ')[:max_title_len]) 58 | 59 | his_titles.append(title) 60 | his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles) 61 | his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len] 62 | his_sen = self.tokenizer.decode(his_sen_ids) 63 | base_sentence = template.replace("", his_sen) 64 | 65 | positives = behav[1] 66 | negatives = behav[2] 67 | 68 | for news in positives: 69 | title = news_dict[news]['title'] 70 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title) 71 | 72 | title = ' '.join(title.split(' ')[:max_candi_len]) 73 | 74 | sentence = base_sentence.replace("", title) 75 | self.data.append({'sentence': sentence, 'target': 1, 'imp': impid}) 76 | 77 | if len(negatives) >= K_samples: 78 | sample_negs = random.sample(negatives, k=K_samples) 79 | else: 80 | sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist() 81 | 82 | for neg in sample_negs: 83 | neg_title = news_dict[neg]['title'] 84 | neg_title = re.sub(r'[^A-Za-z0-9 ]+', '', neg_title) 85 | 86 | neg_title = ' '.join(neg_title.split(' ')[:max_candi_len]) 87 | 88 | sentence = base_sentence.replace("", neg_title) 89 | self.data.append({'sentence': sentence, 'target': 0, 'imp': impid}) 90 | 91 | def prepro_dev(self, imp_ids, behaviors, news_dict, 92 | max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450): 93 | template1 = ''.join(self.conti_tokens[0]) + "" 94 | template2 = ''.join(self.conti_tokens[1]) + "" 95 | template3 = ''.join(self.conti_tokens[2]) + "[MASK]" 96 | template = template1 + "[SEP]" + template2 + "[SEP]" + template3 97 | 98 | for impid, behav in zip(imp_ids, behaviors): 99 | if len(behav[0]) == 0: 100 | continue 101 | his_clicks = 
behav[0][-max_his:] 102 | his_clicks.reverse() 103 | his_titles = [] 104 | for news in his_clicks: 105 | title = news_dict[news]['title'] 106 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title) 107 | 108 | title = ' '.join(title.split(' ')[:max_title_len]) 109 | 110 | his_titles.append(title) 111 | his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles) 112 | his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len] 113 | his_sen = self.tokenizer.decode(his_sen_ids) 114 | base_sentence = template.replace("", his_sen) 115 | 116 | positives = behav[1] 117 | negatives = behav[2] 118 | for news in positives: 119 | title = news_dict[news]['title'] 120 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title) 121 | 122 | title = ' '.join(title.split(' ')[:max_candi_len]) 123 | 124 | sentence = base_sentence.replace("", title) 125 | self.data.append({'sentence': sentence, 'target': 1, 'imp': impid}) 126 | 127 | for neg in negatives: 128 | neg_title = news_dict[neg]['title'] 129 | neg_title = re.sub(r'[^A-Za-z0-9 ]+', '', neg_title) 130 | 131 | neg_title = ' '.join(neg_title.split(' ')[:max_candi_len]) 132 | 133 | sentence = base_sentence.replace("", neg_title) 134 | self.data.append({'sentence': sentence, 'target': 0, 'imp': impid}) 135 | 136 | def load_data(self): 137 | data = pickle.load(open(self.data_path, 'rb')) 138 | imps, users, times, behaviors = self.obtain_data(data) 139 | if self.status == 'train': 140 | self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his, 141 | max_his_len=self.args.max_his_len) 142 | else: 143 | self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his, 144 | max_his_len=self.args.max_his_len) 145 | 146 | def collate_fn(self, batch): 147 | sentences = [x['sentence'] for x in batch] 148 | target = [x['target'] for x in batch] 149 | imp_id = [x['imp'] for x in batch] 150 | 151 | encode_dict = self.tokenizer.batch_encode_plus( 152 | sentences, 153 | add_special_tokens=True, 154 | 
padding='max_length', 155 | max_length=self.args.max_tokens, 156 | truncation=True, 157 | pad_to_max_length=True, 158 | return_attention_mask=True, 159 | return_tensors='pt' 160 | ) 161 | 162 | batch_enc = encode_dict['input_ids'] 163 | batch_attn = encode_dict['attention_mask'] 164 | target = torch.LongTensor(target) 165 | 166 | return batch_enc, batch_attn, target, imp_id 167 | 168 | 169 | 170 | 171 | 172 | -------------------------------------------------------------------------------- /Hybrid-Action/prepro_data.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | import random 4 | import numpy as np 5 | from torch.utils.data import Dataset 6 | import pickle 7 | import os 8 | import torch 9 | 10 | 11 | class MyDataset(Dataset): 12 | def __init__(self, args, tokenizer, news_dict, conti_tokens, status='train'): 13 | self.tokenizer = tokenizer 14 | self.news_dict = news_dict 15 | self.args = args 16 | self.status = status 17 | self.conti_tokens = conti_tokens 18 | 19 | self.data = [] 20 | self.imp_lens = [] 21 | if self.status == 'train': 22 | self.data_path = os.path.join(args.train_data_path, 'train.txt') 23 | elif self.status == 'val': 24 | self.data_path = os.path.join(args.data_path, 'val.txt') 25 | else: 26 | self.data_path = os.path.join(args.data_path, 'test.txt') 27 | self.load_data() 28 | 29 | def __len__(self): 30 | return len(self.data) 31 | 32 | def __getitem__(self, item): 33 | return self.data[item] 34 | 35 | def obtain_data(self, data): 36 | # if self.status == 'train': 37 | # return data[0][:20], data[1][:20], data[2][:20], data[3][:20] 38 | # else: 39 | # return data[0], data[1], data[2], data[3] 40 | return data[0], data[1], data[2], data[3] 41 | 42 | def prepro_train(self, imp_ids, behaviors, news_dict, K_samples, 43 | max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450): 44 | template1 = ''.join(self.conti_tokens[0]) + "" 45 | template2 = ''.join(self.conti_tokens[1]) + "" 46 | 
def prepro_dev(self, imp_ids, behaviors, news_dict,
               max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
    """Build evaluation samples: all positives, then all negatives, per impression.

    For each impression the latest *max_his* clicked titles (newest first)
    form the history text, truncated to *max_his_len* tokenizer ids.  Each
    candidate yields the prompt::

        conti_tokens[0] + history + "[SEP]"
        + conti_tokens[1] + candidate + "[SEP]"
        + "Does the user click the news? [MASK]"

    The parts are concatenated directly.  ``str.replace`` on an empty
    placeholder would insert the text between every character of the
    template.  Impressions with an empty click history are skipped.
    Samples are appended to ``self.data`` as
    ``{'sentence', 'target', 'imp'}`` dicts.
    """
    def clean(raw_title, max_words):
        # Keep ASCII letters, digits and spaces only, then cap the word count.
        stripped = re.sub(r'[^A-Za-z0-9 ]+', '', raw_title)
        return ' '.join(stripped.split(' ')[:max_words])

    user_prompt = ''.join(self.conti_tokens[0])
    news_prompt = ''.join(self.conti_tokens[1])
    tail = "[SEP]" + "Does the user click the news? [MASK]"

    for impid, behav in zip(imp_ids, behaviors):
        if len(behav[0]) == 0:
            continue
        his_clicks = behav[0][-max_his:]
        his_clicks.reverse()
        his_titles = [clean(news_dict[news]['title'], max_title_len) for news in his_clicks]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles)
        # Truncate on token ids, then decode back to text for the prompt.
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        his_sen = self.tokenizer.decode(his_sen_ids)
        head = user_prompt + his_sen + "[SEP]" + news_prompt

        # behav[1] holds the positives (target 1), behav[2] the negatives (target 0).
        for target, candidates in ((1, behav[1]), (0, behav[2])):
            for news in candidates:
                title = clean(news_dict[news]['title'], max_candi_len)
                self.data.append({'sentence': head + title + tail, 'target': target, 'imp': impid})
import re
import random
import numpy as np
from torch.utils.data import Dataset
import pickle
import os
import torch


class MyDataset(Dataset):
    """Hybrid-prompt click-prediction samples built from a pickled behavior split.

    Every sample is a dict ``{'sentence', 'target', 'imp'}``: the prompt
    text, the label (1 for ids in ``behav[1]``, 0 for ids in ``behav[2]``)
    and the impression id.  A prompt is assembled as::

        conti_tokens[0] + history + "[SEP]"
        + conti_tokens[1] + candidate + "[SEP]"
        + "Recommending the news to the user is a [MASK] choice"

    The parts are concatenated directly.  ``str.replace`` on an empty
    placeholder would insert the text between every character of the
    template.
    """

    # Everything except ASCII letters, digits and spaces is stripped from titles.
    _TITLE_JUNK = re.compile(r'[^A-Za-z0-9 ]+')
    # Natural-language part of the prompt that carries the mask position.
    _QUESTION = "Recommending the news to the user is a [MASK] choice"

    def __init__(self, args, tokenizer, news_dict, conti_tokens, status='train'):
        """Store the configuration and immediately load the split for *status*.

        ``'train'`` reads ``train.txt``, ``'val'`` reads ``val.txt`` and any
        other status reads ``test.txt``, all located under ``args.data_path``.
        """
        self.tokenizer = tokenizer
        self.news_dict = news_dict
        self.args = args
        self.status = status
        self.conti_tokens = conti_tokens

        self.data = []
        self.imp_lens = []
        if self.status == 'train':
            split_file = 'train.txt'
        elif self.status == 'val':
            split_file = 'val.txt'
        else:
            split_file = 'test.txt'
        self.data_path = os.path.join(args.data_path, split_file)
        self.load_data()

    def __len__(self):
        """Return the number of prompt samples."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the sample dict stored at position *item*."""
        return self.data[item]

    def obtain_data(self, data):
        """Return the first four entries of *data* as a tuple."""
        return data[0], data[1], data[2], data[3]

    def _clean_title(self, title, max_words):
        """Strip non-alphanumeric characters and keep the first *max_words* words."""
        title = self._TITLE_JUNK.sub('', title)
        return ' '.join(title.split(' ')[:max_words])

    def _prompt_frame(self, clicks, news_dict, max_his, max_title_len, max_his_len):
        """Return the text that goes before and after a candidate title.

        The history is the latest *max_his* clicks, newest first, truncated
        to *max_his_len* tokenizer ids and decoded back to text.
        """
        recent = clicks[-max_his:]
        recent.reverse()
        titles = [self._clean_title(news_dict[news]['title'], max_title_len) for news in recent]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(titles)
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        his_sen = self.tokenizer.decode(his_sen_ids)

        head = ''.join(self.conti_tokens[0]) + his_sen + "[SEP]" + ''.join(self.conti_tokens[1])
        tail = "[SEP]" + self._QUESTION
        return head, tail

    def _append_samples(self, frame, news_ids, news_dict, target, impid, max_candi_len):
        """Append one sample per id in *news_ids*, all labelled *target*."""
        head, tail = frame
        for news in news_ids:
            title = self._clean_title(news_dict[news]['title'], max_candi_len)
            self.data.append({'sentence': head + title + tail, 'target': target, 'imp': impid})

    def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                     max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build training samples: every positive plus *K_samples* sampled negatives.

        Negatives are drawn without replacement when the impression has at
        least *K_samples* of them and with replacement otherwise.  An
        impression without any negative contributes only its positives.
        """
        for impid, behav in zip(imp_ids, behaviors):
            frame = self._prompt_frame(behav[0], news_dict, max_his, max_title_len, max_his_len)

            positives = behav[1]
            negatives = behav[2]
            self._append_samples(frame, positives, news_dict, 1, impid, max_candi_len)

            if len(negatives) == 0:
                # np.random.choice raises on an empty population.
                continue
            if len(negatives) >= K_samples:
                sample_negs = random.sample(negatives, k=K_samples)
            else:
                sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
            self._append_samples(frame, sample_negs, news_dict, 0, impid, max_candi_len)

    def prepro_dev(self, imp_ids, behaviors, news_dict,
                   max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build evaluation samples: all positives, then all negatives.

        Impressions whose click history is empty are skipped.
        """
        for impid, behav in zip(imp_ids, behaviors):
            if len(behav[0]) == 0:
                continue
            frame = self._prompt_frame(behav[0], news_dict, max_his, max_title_len, max_his_len)
            self._append_samples(frame, behav[1], news_dict, 1, impid, max_candi_len)
            self._append_samples(frame, behav[2], news_dict, 0, impid, max_candi_len)

    def load_data(self):
        """Unpickle ``self.data_path`` and run the preprocessing matching ``self.status``."""
        # Context manager: the handle is closed as soon as the data is read.
        with open(self.data_path, 'rb') as handle:
            data = pickle.load(handle)
        imps, users, times, behaviors = self.obtain_data(data)
        if self.status == 'train':
            self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                              max_his_len=self.args.max_his_len)
        else:
            self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                            max_his_len=self.args.max_his_len)

    def collate_fn(self, batch):
        """Tokenize a list of samples into padded tensors.

        Returns ``(input_ids, attention_mask, targets, impression_ids)``; the
        targets are a ``LongTensor`` and the impression ids stay a plain list.
        """
        sentences = [x['sentence'] for x in batch]
        target = [x['target'] for x in batch]
        imp_id = [x['imp'] for x in batch]

        encode_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.args.max_tokens,
            truncation=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        batch_enc = encode_dict['input_ids']
        batch_attn = encode_dict['attention_mask']
        target = torch.LongTensor(target)

        return batch_enc, batch_attn, target, imp_id
def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                 max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
    """Build training samples: every positive plus *K_samples* sampled negatives.

    For each impression the latest *max_his* clicked titles (newest first)
    form the history text, truncated to *max_his_len* tokenizer ids.  Each
    candidate yields the prompt::

        conti_tokens[0] + history + "[SEP]"
        + conti_tokens[1] + candidate + "[SEP]"
        + "This news is [MASK] to the user's area of interest"

    The parts are concatenated directly.  ``str.replace`` on an empty
    placeholder would insert the text between every character of the
    template.  Negatives are drawn without replacement when at least
    *K_samples* exist and with replacement otherwise.  An impression
    without any negative contributes only its positives.  Samples are
    appended to ``self.data`` as ``{'sentence', 'target', 'imp'}`` dicts.
    """
    def clean(raw_title, max_words):
        # Keep ASCII letters, digits and spaces only, then cap the word count.
        stripped = re.sub(r'[^A-Za-z0-9 ]+', '', raw_title)
        return ' '.join(stripped.split(' ')[:max_words])

    user_prompt = ''.join(self.conti_tokens[0])
    news_prompt = ''.join(self.conti_tokens[1])
    tail = "[SEP]" + "This news is [MASK] to the user's area of interest"

    for impid, behav in zip(imp_ids, behaviors):
        his_clicks = behav[0][-max_his:]
        his_clicks.reverse()
        his_titles = [clean(news_dict[news]['title'], max_title_len) for news in his_clicks]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles)
        # Truncate on token ids, then decode back to text for the prompt.
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        his_sen = self.tokenizer.decode(his_sen_ids)
        head = user_prompt + his_sen + "[SEP]" + news_prompt

        positives = behav[1]
        negatives = behav[2]

        for news in positives:
            title = clean(news_dict[news]['title'], max_candi_len)
            self.data.append({'sentence': head + title + tail, 'target': 1, 'imp': impid})

        if len(negatives) == 0:
            # np.random.choice raises on an empty population.
            continue
        if len(negatives) >= K_samples:
            sample_negs = random.sample(negatives, k=K_samples)
        else:
            sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()

        for neg in sample_negs:
            neg_title = clean(news_dict[neg]['title'], max_candi_len)
            self.data.append({'sentence': head + neg_title + tail, 'target': 0, 'imp': impid})
def collate_fn(self, batch):
    """Turn a list of sample dicts into model-ready tensors.

    Each sample must provide ``'sentence'``, ``'target'`` and ``'imp'``.
    The sentences are tokenized together and padded/truncated to
    ``self.args.max_tokens``.  Returns ``(input_ids, attention_mask,
    targets, impression_ids)`` where the targets are a ``LongTensor`` and
    the impression ids remain a plain list.
    """
    sentences, labels, impressions = [], [], []
    for sample in batch:
        sentences.append(sample['sentence'])
        labels.append(sample['target'])
        impressions.append(sample['imp'])

    tokenizer_options = dict(
        add_special_tokens=True,
        padding='max_length',
        max_length=self.args.max_tokens,
        truncation=True,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoded = self.tokenizer.batch_encode_plus(sentences, **tokenizer_options)

    return encoded['input_ids'], encoded['attention_mask'], torch.LongTensor(labels), impressions
class Logger(object):
    """Duplicate everything written to it onto a stream and into a log file.

    Meant as a drop-in replacement for ``sys.stdout``: it offers the
    ``write``/``flush`` pair that ``print`` relies on.
    """

    def __init__(self, filename, stream=sys.stdout):
        """Open *filename* for writing (truncating it) and remember *stream*."""
        self.terminal = stream
        self.log = open(filename, 'w')

    def write(self, message):
        """Write *message* to the stream and to the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both targets.

        A no-op here would leave log text in the file buffer and drop it if
        the process ends without closing the file.
        """
        self.terminal.flush()
        self.log.flush()
def ddp_main(rank, world_size, args):
    """Per-process entry point: evaluate a saved model on the test split.

    Seeds the process with ``rank + 1``, joins the process group, builds the
    model and the test ``DataLoader`` (sharded by a ``DistributedSampler``),
    loads the weights from ``args.model_file`` and runs ``eval``.  The
    scores gathered from all ranks are grouped by impression id and ranked
    by descending score, then passed to ``evaluate``.  Every rank computes
    the metrics, but only rank 0 prints them.  When ``args.log`` is set,
    rank 0 also mirrors its stdout into ``args.log_file``.
    """
    args.rank = rank
    args.world_size = world_size
    init_seed(rank + 1)
    if rank == 0 and args.log:
        sys.stdout = Logger(args.log_file, sys.stdout)
    setup(rank, world_size)

    print('| distributed init rank {}'.format(rank))
    dist.barrier()

    # load model
    net, tokenizer = load_model(args.model_name, args)

    # load data
    # Context manager: the handle is closed as soon as the dict is read.
    with open(os.path.join(args.data_path, 'news.txt'), 'rb') as handle:
        news_dict = pickle.load(handle)
    test_dataset = MyDataset(args, tokenizer, news_dict, status='test')

    if rank == 0:
        print(args)
        print('Vocabulary size of tokenizer after adding new tokens : %d' % args.vocab_size)
        print(test_dataset[0]['sentence'])
        print('num test: %d' % len(test_dataset))

    test_sampler = DistributedSampler(test_dataset,
                                      rank=rank,
                                      num_replicas=world_size)
    nw = 2
    test_kwargs = {'batch_size': args.test_batch_size, 'sampler': test_sampler,
                   'shuffle': False, 'pin_memory': False,
                   'num_workers': nw, 'collate_fn': test_dataset.collate_fn}

    test_loader = DataLoader(test_dataset, **test_kwargs)

    net = net.to(rank)
    net = DDP(net, device_ids=[rank])

    dist.barrier()

    # Remap tensors stored for cuda:0 onto this rank's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    net.module.load_state_dict(torch.load(args.model_file, map_location=map_location))

    with torch.no_grad():
        st_test = time.time()
        test_scores, acc_test, acc_pos_test, pos_ratio_test, test_impids, test_labels = \
            eval(net, rank, world_size, test_loader)

        # Regroup the per-rank results by impression: {imp: {'score': [...], 'lab': [...]}}
        impressions = {}
        for i in range(world_size):
            scores, imp_id, labs = test_scores[i], test_impids[i], test_labels[i]
            assert scores.size() == imp_id.size() == labs.size()
            scores = scores.cpu().numpy().tolist()
            imp_id = imp_id.cpu().numpy().tolist()
            labs = labs.cpu().numpy().tolist()
            for sco, imp, lab in zip(scores, imp_id, labs):
                entry = impressions.setdefault(imp, {'score': [], 'lab': []})
                entry['score'].append(sco)
                entry['lab'].append(lab)

        predicts, truths = [], []
        for imp in impressions:
            sims, labs = impressions[imp]['score'], impressions[imp]['lab']
            # Rank the candidates of one impression by descending score.
            sl_zip = sorted(zip(sims, labs), key=lambda x: x[0], reverse=True)
            sort_sims, sort_labs = zip(*sl_zip)
            predicts.append(list(range(1, len(sort_labs) + 1, 1)))
            truths.append(sort_labs)

        auc_test, mrr_test, ndcg5_test, ndcg10_test = evaluate(predicts, truths)
        end_test = time.time()
        test_spend = (end_test - st_test) / 60

        if rank == 0:
            print("Test: AUC: %0.4f\tMRR: %0.4f\tnDCG@5: %0.4f\tnDCG@10: %0.4f\t[Test-Time: %0.2f mim]" %
                  (auc_test, mrr_test, ndcg5_test, ndcg10_test, test_spend))
    cleanup()
def init_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and CUDA) with *seed*."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
def eval(model, rank, world_size, data_loader):
    """Score every batch of ``data_loader`` and gather the results from all processes.

    Args:
        model: Called as ``model(batch_enc, batch_attn, batch_labs)`` and expected
            to return ``(loss, scores)``; column 1 of ``scores`` is used as the
            ranking score and the arg-max over dim 1 as the predicted label.
        rank: Device argument handed to ``Tensor.to`` for every tensor used here.
        world_size: Number of processes taking part in the collectives.
        data_loader: Iterable of ``(batch_enc, batch_attn, batch_labs, batch_imp)``.

    Returns:
        ``(scores_list, acc, acc_pos, pos_ratio, impids_list, labels_list)``.
        The three lists hold one gathered tensor per process; the three floats
        are derived from counts summed over all processes.
    """
    model.eval()
    val_scores = []
    imp_ids = []
    labels = []
    # Each counter is [number correct, number seen]: one for all samples,
    # one restricted to samples whose label is 1.
    acc_cnt = torch.zeros(2).to(rank)
    acc_cnt_pos = torch.zeros(2).to(rank)
    for batch_enc, batch_attn, batch_labs, batch_imp in tqdm(data_loader):
        # extend() keeps accumulation linear; rebuilding the list with
        # ``a = a + b`` on every batch is quadratic in the number of samples.
        imp_ids.extend(batch_imp)
        labels.extend(batch_labs.cpu().numpy().tolist())

        batch_enc = batch_enc.to(rank)
        batch_attn = batch_attn.to(rank)
        batch_labs = batch_labs.to(rank)

        # The loss is not needed for evaluation.
        _, scores = model(batch_enc, batch_attn, batch_labs)
        scores = scores.detach()

        val_scores.append(scores[:, 1])

        predict = torch.argmax(scores, dim=1)
        acc_cnt[0] += (predict == batch_labs).sum()
        acc_cnt[1] += predict.size(0)

        positive_idx = torch.where(batch_labs == 1)[0]
        acc_cnt_pos[0] += (predict[positive_idx] == batch_labs[positive_idx]).sum()
        acc_cnt_pos[1] += positive_idx.size(0)

    # Sum the counters over all processes before turning them into ratios.
    dist.all_reduce(acc_cnt, op=dist.ReduceOp.SUM)
    dist.all_reduce(acc_cnt_pos, op=dist.ReduceOp.SUM)

    acc = acc_cnt[0] / acc_cnt[1]
    acc_pos = acc_cnt_pos[0] / acc_cnt_pos[1]
    pos_ratio = acc_cnt_pos[1] / acc_cnt[1]

    val_scores = torch.cat(val_scores, dim=0)
    val_impids = torch.IntTensor(imp_ids).to(rank)
    val_labels = torch.IntTensor(labels).to(rank)

    # Output buffers for all_gather: one tensor per process, shaped like the
    # local tensor.
    val_scores_list = [torch.zeros_like(val_scores).to(rank) for _ in range(world_size)]
    val_impids_list = [torch.zeros_like(val_impids).to(rank) for _ in range(world_size)]
    val_labels_list = [torch.zeros_like(val_labels).to(rank) for _ in range(world_size)]

    dist.all_gather(val_scores_list, val_scores)
    dist.all_gather(val_impids_list, val_impids)
    dist.all_gather(val_labels_list, val_labels)

    return val_scores_list, acc.item(), acc_pos.item(), pos_ratio.item(), val_impids_list, val_labels_list
def str2bool(value):
    """Convert a command-line token to a real boolean.

    ``argparse`` with ``type=bool`` applies ``bool()`` to the raw string, so
    every non-empty value -- including ``"False"`` -- comes out as ``True``.
    This converter accepts the usual spellings instead.

    Args:
        value: The raw argument string (a ``bool`` is returned unchanged).

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))


if __name__ == '__main__':
    t0 = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='../DATA/MIND-Small', type=str, help='Path')
    parser.add_argument('--model_name', default='bert-base-uncased', type=str)

    parser.add_argument('--test_batch_size', default=15, type=int, help='test batch_size')
    parser.add_argument('--max_his', default=50, type=int, help='max number of history')
    parser.add_argument('--max_tokens', default=500, type=int, help='max number of tokens')

    parser.add_argument('--max_his_len', default=450, type=int, help='max number of history')

    parser.add_argument('--device', default='cuda', help='device id')
    parser.add_argument('--world_size', default=2, type=int, help='number of distributed processes')

    parser.add_argument('--model_file', default='', type=str, help='model file')
    # str2bool (not bool) so that "--log False" really disables logging.
    parser.add_argument('--log', default=True, type=str2bool, help='whether write log file')

    args = parser.parse_args()

    if args.log:
        # Demo runs log to ./log-Test, everything else (Mind-Small) to ./log-Test-Small.
        log_dir = './log-Test' if args.data_path == '../DATA/MIND-Demo' else './log-Test-Small'
        os.makedirs(log_dir, exist_ok=True)
        # File name: test batch size plus the last five characters of the
        # current timestamp string.
        args.log_file = log_dir + '/' + 'Tbs' + str(args.test_batch_size) + '-' + \
            str(datetime.now())[-5:] + '.txt'

    # One worker process per visible CUDA device; --world_size is not consulted here.
    WORLD_SIZE = torch.cuda.device_count()
    mp.spawn(ddp_main,
             args=(WORLD_SIZE, args),
             nprocs=WORLD_SIZE,
             join=True)
    t1 = time.time()
    run_time = (t1 - t0) / 3600  # hours
    print('Running time: %0.4f' % run_time)
class Logger(object):
    """File-like object that duplicates everything written to it.

    Each message goes to ``stream`` (standard output by default) and to a
    text file opened for writing at ``filename``, so it can stand in for
    ``sys.stdout``.
    """

    def __init__(self, filename, stream=sys.stdout):
        self.terminal = stream
        self.log = open(filename, 'w')

    def write(self, message):
        """Write ``message`` to both the wrapped stream and the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both targets so buffered output is not lost if the process dies."""
        self.terminal.flush()
        self.log.flush()
def ddp_main(rank, world_size, args):
    """Per-process entry point: load the checkpoint, score the test split, print metrics.

    Args:
        rank: Index of this process; also used as its device index.
        world_size: Total number of processes.
        args: Parsed command-line namespace. ``rank`` and ``world_size`` are
            written onto it; ``log``, ``log_file`` (only when ``log`` is set),
            ``model_name``, ``data_path``, ``test_batch_size`` and
            ``model_file`` are read here.
    """
    args.rank = rank
    args.world_size = world_size
    init_seed(rank + 1)
    if rank == 0 and args.log:
        # Only rank 0 mirrors its stdout into the log file.
        sys.stdout = Logger(args.log_file, sys.stdout)
    setup(rank, world_size)

    print('| distributed init rank {}'.format(rank))
    dist.barrier()

    # load model
    net, tokenizer = load_model(args.model_name, args)

    # load data
    # NOTE(review): pickle can execute arbitrary code while loading; only point
    # --data_path at files you trust.
    with open(os.path.join(args.data_path, 'news.txt'), 'rb') as news_file:
        news_dict = pickle.load(news_file)
    test_dataset = MyDataset(args, tokenizer, news_dict, status='test')

    if rank == 0:
        print(args)
        print('Vocabulary size of tokenizer after adding new tokens : %d' % args.vocab_size)
        print(test_dataset[0]['sentence'])
        print('num test: %d' % len(test_dataset))

    test_sampler = DistributedSampler(test_dataset,
                                      rank=rank,
                                      num_replicas=world_size)
    nw = 2
    test_kwargs = {'batch_size': args.test_batch_size, 'sampler': test_sampler,
                   'shuffle': False, 'pin_memory': False,
                   'num_workers': nw, 'collate_fn': test_dataset.collate_fn}

    test_loader = DataLoader(test_dataset, **test_kwargs)

    net = net.to(rank)
    net = DDP(net, device_ids=[rank])

    dist.barrier()

    # Remap tensors saved on cuda:0 onto this process's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    net.module.load_state_dict(torch.load(args.model_file, map_location=map_location))

    with torch.no_grad():
        st_test = time.time()
        test_scores, _acc, _acc_pos, _pos_ratio, test_impids, test_labels = \
            eval(net, rank, world_size, test_loader)

        # Regroup the gathered samples by impression id:
        # {imp_id: {'score': [...], 'lab': [...]}}
        impressions = {}
        for scores, imp_id, labs in zip(test_scores, test_impids, test_labels):
            assert scores.size() == imp_id.size() == labs.size()
            scores = scores.cpu().numpy().tolist()
            imp_id = imp_id.cpu().numpy().tolist()
            labs = labs.cpu().numpy().tolist()
            for sco, imp, lab in zip(scores, imp_id, labs):
                entry = impressions.setdefault(imp, {'score': [], 'lab': []})
                entry['score'].append(sco)
                entry['lab'].append(lab)

        predicts, truths = [], []
        for entry in impressions.values():
            # Order each impression's labels by descending score; the
            # prediction passed on is simply the positions 1..n.
            ranked = sorted(zip(entry['score'], entry['lab']), key=lambda x: x[0], reverse=True)
            _, sort_labs = zip(*ranked)
            predicts.append(list(range(1, len(sort_labs) + 1)))
            truths.append(sort_labs)

        auc_test, mrr_test, ndcg5_test, ndcg10_test = evaluate(predicts, truths)
        end_test = time.time()
        test_spend = (end_test - st_test) / 60  # minutes

    if rank == 0:
        print("Test: AUC: %0.4f\tMRR: %0.4f\tnDCG@5: %0.4f\tnDCG@10: %0.4f\t[Test-Time: %0.2f mim]" %
              (auc_test, mrr_test, ndcg5_test, ndcg10_test, test_spend))
    cleanup()
def init_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and CUDA) with ``seed``."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    # Applied in the order listed above.
    for apply_seed in seeders:
        apply_seed(seed)
def load_tokenizer(model_name, args):
    """Create the BERT tokenizer and register the extra prompt tokens.

    ``'[NSEP]'`` is added first, followed by ``args.num_conti1`` tokens named
    ``[P1]``, ``[P2]``, ... and ``args.num_conti2`` tokens named ``[Q1]``,
    ``[Q2]``, .... The resulting tokenizer length is stored in
    ``args.vocab_size``.

    Returns:
        ``(tokenizer, p_tokens, q_tokens)`` where the last two are the lists
        of ``[P*]`` and ``[Q*]`` token strings.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)

    p_tokens = ['[P' + str(idx) + ']' for idx in range(1, args.num_conti1 + 1)]
    q_tokens = ['[Q' + str(idx) + ']' for idx in range(1, args.num_conti2 + 1)]

    tokenizer.add_tokens(['[NSEP]'])
    tokenizer.add_tokens(p_tokens + q_tokens)

    args.vocab_size = len(tokenizer)
    return tokenizer, p_tokens, q_tokens
def ddp_main(rank, world_size, args):
    """Per-process entry point: load the checkpoint, score the test split, print metrics.

    Args:
        rank: Index of this process; also used as its device index.
        world_size: Total number of processes.
        args: Parsed command-line namespace. ``rank`` and ``world_size`` are
            written onto it; ``log``, ``log_file`` (only when ``log`` is set),
            ``model_name``, ``data_path``, ``test_batch_size`` and
            ``model_file`` are read here.
    """
    args.rank = rank
    args.world_size = world_size
    init_seed(rank + 1)
    if rank == 0 and args.log:
        # Only rank 0 mirrors its stdout into the log file.
        sys.stdout = Logger(args.log_file, sys.stdout)
    setup(rank, world_size)

    print('| distributed init rank {}'.format(rank))
    dist.barrier()

    # load tokenizer
    tokenizer, conti_tokens1, conti_tokens2 = load_tokenizer(args.model_name, args)
    conti_tokens = [conti_tokens1, conti_tokens2]

    # load model
    net = load_model(args.model_name, tokenizer, args)

    # load data
    # NOTE(review): pickle can execute arbitrary code while loading; only point
    # --data_path at files you trust.
    with open(os.path.join(args.data_path, 'news.txt'), 'rb') as news_file:
        news_dict = pickle.load(news_file)
    test_dataset = MyDataset(args, tokenizer, news_dict, conti_tokens, status='test')

    if rank == 0:
        print(args)
        print('Vocabulary size of tokenizer after adding new tokens : %d' % args.vocab_size)
        print(test_dataset[0]['sentence'])
        print('num test: %d' % len(test_dataset))

    test_sampler = DistributedSampler(test_dataset,
                                      rank=rank,
                                      num_replicas=world_size)
    nw = 2
    test_kwargs = {'batch_size': args.test_batch_size, 'sampler': test_sampler,
                   'shuffle': False, 'pin_memory': False,
                   'num_workers': nw, 'collate_fn': test_dataset.collate_fn}

    test_loader = DataLoader(test_dataset, **test_kwargs)

    net = net.to(rank)
    net = DDP(net, device_ids=[rank])

    dist.barrier()

    # Remap tensors saved on cuda:0 onto this process's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    net.module.load_state_dict(torch.load(args.model_file, map_location=map_location))

    with torch.no_grad():
        st_test = time.time()
        test_scores, _acc, _acc_pos, _pos_ratio, test_impids, test_labels = \
            eval(net, rank, world_size, test_loader)

        # Regroup the gathered samples by impression id:
        # {imp_id: {'score': [...], 'lab': [...]}}
        impressions = {}
        for scores, imp_id, labs in zip(test_scores, test_impids, test_labels):
            assert scores.size() == imp_id.size() == labs.size()
            scores = scores.cpu().numpy().tolist()
            imp_id = imp_id.cpu().numpy().tolist()
            labs = labs.cpu().numpy().tolist()
            for sco, imp, lab in zip(scores, imp_id, labs):
                entry = impressions.setdefault(imp, {'score': [], 'lab': []})
                entry['score'].append(sco)
                entry['lab'].append(lab)

        predicts, truths = [], []
        for entry in impressions.values():
            # Order each impression's labels by descending score; the
            # prediction passed on is simply the positions 1..n.
            ranked = sorted(zip(entry['score'], entry['lab']), key=lambda x: x[0], reverse=True)
            _, sort_labs = zip(*ranked)
            predicts.append(list(range(1, len(sort_labs) + 1)))
            truths.append(sort_labs)

        auc_test, mrr_test, ndcg5_test, ndcg10_test = evaluate(predicts, truths)
        end_test = time.time()
        test_spend = (end_test - st_test) / 60  # minutes

    if rank == 0:
        print("Test: AUC: %0.4f\tMRR: %0.4f\tnDCG@5: %0.4f\tnDCG@10: %0.4f\t[Test-Time: %0.2f mim]" %
              (auc_test, mrr_test, ndcg5_test, ndcg10_test, test_spend))
    cleanup()
def setup(rank, world_size):
    """Join the NCCL process group as process ``rank`` out of ``world_size``.

    The rendezvous address is fixed to ``localhost:23342`` through the
    ``MASTER_ADDR`` / ``MASTER_PORT`` environment variables.
    """
    rendezvous = {'MASTER_ADDR': 'localhost', 'MASTER_PORT': '23342'}
    os.environ.update(rendezvous)
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
def load_model(model_name, tokenizer, args):
    """Build a ``BERTPrompt4NR`` whose answer words are 'unrelated' and 'related'.

    The two words are encoded with ``tokenizer`` (no special tokens added) and
    the resulting ids are passed to the model constructor together with
    ``model_name`` and ``args``.
    """
    answer_words = ['unrelated', 'related']
    encoded_answers = tokenizer.encode(answer_words, add_special_tokens=False)
    return BERTPrompt4NR(model_name, encoded_answers, args)
def ddp_main(rank, world_size, args):
    """Per-process entry point: load the checkpoint, score the test split, print metrics.

    Args:
        rank: Index of this process; also used as its device index.
        world_size: Total number of processes.
        args: Parsed command-line namespace. ``rank`` and ``world_size`` are
            written onto it; ``log``, ``log_file`` (only when ``log`` is set),
            ``model_name``, ``data_path``, ``test_batch_size`` and
            ``model_file`` are read here.
    """
    args.rank = rank
    args.world_size = world_size
    init_seed(rank + 1)
    if rank == 0 and args.log:
        # Only rank 0 mirrors its stdout into the log file.
        sys.stdout = Logger(args.log_file, sys.stdout)
    setup(rank, world_size)

    print('| distributed init rank {}'.format(rank))
    dist.barrier()

    # load tokenizer
    tokenizer, conti_tokens1, conti_tokens2 = load_tokenizer(args.model_name, args)
    conti_tokens = [conti_tokens1, conti_tokens2]

    # load model
    net = load_model(args.model_name, tokenizer, args)

    # load data
    # NOTE(review): pickle can execute arbitrary code while loading; only point
    # --data_path at files you trust.
    with open(os.path.join(args.data_path, 'news.txt'), 'rb') as news_file:
        news_dict = pickle.load(news_file)
    test_dataset = MyDataset(args, tokenizer, news_dict, conti_tokens, status='test')

    if rank == 0:
        print(args)
        print('Vocabulary size of tokenizer after adding new tokens : %d' % args.vocab_size)
        print(test_dataset[0]['sentence'])
        print('num test: %d' % len(test_dataset))

    test_sampler = DistributedSampler(test_dataset,
                                      rank=rank,
                                      num_replicas=world_size)
    nw = 2
    test_kwargs = {'batch_size': args.test_batch_size, 'sampler': test_sampler,
                   'shuffle': False, 'pin_memory': False,
                   'num_workers': nw, 'collate_fn': test_dataset.collate_fn}

    test_loader = DataLoader(test_dataset, **test_kwargs)

    net = net.to(rank)
    net = DDP(net, device_ids=[rank])

    dist.barrier()

    # Remap tensors saved on cuda:0 onto this process's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    net.module.load_state_dict(torch.load(args.model_file, map_location=map_location))

    with torch.no_grad():
        st_test = time.time()
        test_scores, _acc, _acc_pos, _pos_ratio, test_impids, test_labels = \
            eval(net, rank, world_size, test_loader)

        # Regroup the gathered samples by impression id:
        # {imp_id: {'score': [...], 'lab': [...]}}
        impressions = {}
        for scores, imp_id, labs in zip(test_scores, test_impids, test_labels):
            assert scores.size() == imp_id.size() == labs.size()
            scores = scores.cpu().numpy().tolist()
            imp_id = imp_id.cpu().numpy().tolist()
            labs = labs.cpu().numpy().tolist()
            for sco, imp, lab in zip(scores, imp_id, labs):
                entry = impressions.setdefault(imp, {'score': [], 'lab': []})
                entry['score'].append(sco)
                entry['lab'].append(lab)

        predicts, truths = [], []
        for entry in impressions.values():
            # Order each impression's labels by descending score; the
            # prediction passed on is simply the positions 1..n.
            ranked = sorted(zip(entry['score'], entry['lab']), key=lambda x: x[0], reverse=True)
            _, sort_labs = zip(*ranked)
            predicts.append(list(range(1, len(sort_labs) + 1)))
            truths.append(sort_labs)

        auc_test, mrr_test, ndcg5_test, ndcg10_test = evaluate(predicts, truths)
        end_test = time.time()
        test_spend = (end_test - st_test) / 60  # minutes

    if rank == 0:
        print("Test: AUC: %0.4f\tMRR: %0.4f\tnDCG@5: %0.4f\tnDCG@10: %0.4f\t[Test-Time: %0.2f mim]" %
              (auc_test, mrr_test, ndcg5_test, ndcg10_test, test_spend))
    cleanup()
def setup(rank, world_size):
    """Join the NCCL process group used by this evaluation run.

    The rendezvous endpoint defaults to ``localhost:23342``. Values already
    present in ``MASTER_ADDR`` / ``MASTER_PORT`` are kept, so a second job on
    the same machine can pick a different port instead of colliding.

    Args:
        rank: index of the calling process within the group.
        world_size: total number of processes in the group.
    """
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '23342')
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
def eval(model, rank, world_size, data_loader):
    """Score every batch of ``data_loader`` and gather the results of all ranks.

    Args:
        model: network called as ``model(batch_enc, batch_attn, batch_labs)``
            and returning ``(loss, scores)``; column 1 of ``scores`` is used
            as the ranking score.
        rank: device the tensors are moved to.
        world_size: number of processes taking part in the gather.
        data_loader: iterable of ``(batch_enc, batch_attn, batch_labs,
            batch_imp)`` batches.

    Returns:
        ``(scores_list, acc, acc_pos, pos_ratio, impids_list, labels_list)``
        where the three lists hold one gathered tensor per rank and the three
        scalars are Python floats summed over all ranks before division.
    """
    model.eval()
    val_scores = []
    imp_ids = []
    labels = []
    acc_cnt = torch.zeros(2).to(rank)      # [correct predictions, samples]
    acc_cnt_pos = torch.zeros(2).to(rank)  # [correct positives, positives]

    for batch_enc, batch_attn, batch_labs, batch_imp in tqdm(data_loader):
        # extend() keeps accumulation linear; rebuilding the lists with `+`
        # on every batch copied everything collected so far.
        imp_ids.extend(batch_imp)
        labels.extend(batch_labs.cpu().numpy().tolist())

        batch_enc = batch_enc.to(rank)
        batch_attn = batch_attn.to(rank)
        batch_labs = batch_labs.to(rank)

        _, scores = model(batch_enc, batch_attn, batch_labs)
        scores = scores.detach()

        val_scores.append(scores[:, 1])

        predict = torch.argmax(scores, dim=1)
        acc_cnt[0] += (predict == batch_labs).sum()
        acc_cnt[1] += predict.size(0)

        positive_idx = torch.where(batch_labs == 1)[0]
        acc_cnt_pos[0] += (predict[positive_idx] == batch_labs[positive_idx]).sum()
        acc_cnt_pos[1] += positive_idx.size(0)

    dist.all_reduce(acc_cnt, op=dist.ReduceOp.SUM)
    dist.all_reduce(acc_cnt_pos, op=dist.ReduceOp.SUM)

    acc = acc_cnt[0] / acc_cnt[1]
    acc_pos = acc_cnt_pos[0] / acc_cnt_pos[1]
    pos_ratio = acc_cnt_pos[1] / acc_cnt[1]

    val_scores = torch.cat(val_scores, dim=0)
    val_impids = torch.IntTensor(imp_ids).to(rank)
    val_labels = torch.IntTensor(labels).to(rank)

    # One receive buffer per rank; zeros_like already allocates on the same
    # device as its argument.
    val_scores_list = [torch.zeros_like(val_scores) for _ in range(world_size)]
    val_impids_list = [torch.zeros_like(val_impids) for _ in range(world_size)]
    val_labels_list = [torch.zeros_like(val_labels) for _ in range(world_size)]

    dist.all_gather(val_scores_list, val_scores)
    dist.all_gather(val_impids_list, val_impids)
    dist.all_gather(val_labels_list, val_labels)

    return val_scores_list, acc.item(), acc_pos.item(), pos_ratio.item(), val_impids_list, val_labels_list
'news.txt'), 'rb')) 160 | test_dataset = MyDataset(args, tokenizer, news_dict, conti_tokens, status='test') 161 | 162 | if rank == 0: 163 | print(args) 164 | print('Vocabulary size of tokenizer after adding new tokens : %d' % args.vocab_size) 165 | print(test_dataset[0]['sentence']) 166 | print('num test: %d' % len(test_dataset)) 167 | 168 | test_sampler = DistributedSampler(test_dataset, 169 | rank=rank, 170 | num_replicas=world_size) 171 | nw = 2 172 | test_kwargs = {'batch_size': args.test_batch_size, 'sampler': test_sampler, 173 | 'shuffle': False, 'pin_memory': False, 174 | 'num_workers': nw, 'collate_fn': test_dataset.collate_fn} 175 | 176 | test_loader = DataLoader(test_dataset, **test_kwargs) 177 | 178 | net = net.to(rank) 179 | net = DDP(net, device_ids=[rank]) 180 | 181 | dist.barrier() 182 | 183 | map_location = {'cuda:%d' % 0: 'cuda:%d' % rank} 184 | net.module.load_state_dict(torch.load(args.model_file, map_location=map_location)) 185 | 186 | with torch.no_grad(): 187 | st_test = time.time() 188 | test_scores, acc_test, acc_pos_test, pos_ratio_test, test_impids, test_labels = \ 189 | eval(net, rank, world_size, test_loader) 190 | impressions = {} # {1: {'score': [], 'lab': []}} 191 | for i in range(world_size): 192 | scores, imp_id, labs = test_scores[i], test_impids[i], test_labels[i] 193 | assert scores.size() == imp_id.size() == labs.size() 194 | scores = scores.cpu().numpy().tolist() 195 | imp_id = imp_id.cpu().numpy().tolist() 196 | labs = labs.cpu().numpy().tolist() 197 | for j in range(len(scores)): 198 | sco, imp, lab = scores[j], imp_id[j], labs[j] 199 | if imp not in impressions: 200 | impressions[imp] = {'score': [], 'lab': []} 201 | impressions[imp]['score'].append(sco) 202 | impressions[imp]['lab'].append(lab) 203 | else: 204 | impressions[imp]['score'].append(sco) 205 | impressions[imp]['lab'].append(lab) 206 | predicts, truths = [], [] 207 | for imp in impressions: 208 | sims, labs = impressions[imp]['score'], impressions[imp]['lab'] 209 
def str2bool(value):
    """Convert a command-line token such as ``True``/``false``/``0`` to bool.

    ``argparse`` with ``type=bool`` treats every non-empty string, including
    ``"False"``, as true; this converter parses the text instead.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % value)


if __name__ == '__main__':
    t0 = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='../DATA/MIND-Small', type=str, help='Path')
    parser.add_argument('--model_name', default='bert-base-uncased', type=str)

    parser.add_argument('--test_batch_size', default=15, type=int, help='test batch_size')
    parser.add_argument('--max_his', default=50, type=int, help='max number of history')
    parser.add_argument('--max_tokens', default=500, type=int, help='max number of tokens')

    parser.add_argument('--max_his_len', default=450, type=int, help='max number of history')

    parser.add_argument('--num_conti1', default=3, type=int, help='number of continuous tokens')
    parser.add_argument('--num_conti2', default=3, type=int, help='number of continuous tokens')

    parser.add_argument('--device', default='cuda', help='device id')
    # Kept for compatibility: the value is replaced by the number of visible
    # CUDA devices below.
    parser.add_argument('--world_size', default=2, type=int, help='number of distributed processes')

    parser.add_argument('--model_file', default='', type=str, help='model file')
    parser.add_argument('--log', default=True, type=str2bool, help='whether write log file')

    args = parser.parse_args()

    # Create log file
    if args.log:
        if args.data_path == '../DATA/MIND-Demo':
            log_dir = './log-Test'
        else:
            log_dir = './log-Test-Small'
        os.makedirs(log_dir, exist_ok=True)
        args.log_file = log_dir + '/' + 'Tbs' + str(args.test_batch_size) + \
            '-n' + str(args.num_conti1) + str(args.num_conti2) + \
            '-' + str(datetime.now())[-5:] + '.txt'

    # One worker process per visible GPU.
    WORLD_SIZE = torch.cuda.device_count()
    mp.spawn(ddp_main,
             args=(WORLD_SIZE, args),
             nprocs=WORLD_SIZE,
             join=True)
    t1 = time.time()
    run_time = (t1 - t0) / 3600  # hours
    print('Running time: %0.4f' % run_time)
class Logger(object):
    """File-like object that duplicates everything written to it.

    Each message goes both to ``stream`` (the terminal by default) and to a
    freshly truncated log file, so it can stand in for ``sys.stdout``.
    """

    def __init__(self, filename, stream=sys.stdout):
        """Open ``filename`` for writing and remember the original stream."""
        self.terminal = stream
        # Explicit encoding so non-ASCII text logs the same on every platform.
        self.log = open(filename, 'w', encoding='utf-8')

    def write(self, message):
        """Write ``message`` to the terminal stream and to the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both targets so a flush request really reaches the disk."""
        self.terminal.flush()
        self.log.flush()

    def close(self):
        """Close the log file; the wrapped stream is left open."""
        if not self.log.closed:
            self.log.close()
def _rank_labels_by_impression(score_parts, impid_parts, label_parts):
    """Group gathered predictions by impression and order labels by score.

    Args:
        score_parts: one tensor of ranking scores per rank.
        impid_parts: one tensor of impression ids per rank, aligned
            element-wise with ``score_parts``.
        label_parts: one tensor of labels per rank, aligned likewise.

    Returns:
        A ``(predicts, truths)`` pair with one entry per impression:
        ``predicts`` holds the rank positions ``1..k`` and ``truths`` the
        labels of that impression sorted by descending score.
    """
    impressions = {}  # {imp_id: {'score': [...], 'lab': [...]}}
    for scores, imp_ids, labs in zip(score_parts, impid_parts, label_parts):
        assert scores.size() == imp_ids.size() == labs.size()
        rows = zip(scores.cpu().numpy().tolist(),
                   imp_ids.cpu().numpy().tolist(),
                   labs.cpu().numpy().tolist())
        for sco, imp, lab in rows:
            entry = impressions.setdefault(imp, {'score': [], 'lab': []})
            entry['score'].append(sco)
            entry['lab'].append(lab)

    predicts, truths = [], []
    for entry in impressions.values():
        ranked = sorted(zip(entry['score'], entry['lab']),
                        key=lambda pair: pair[0], reverse=True)
        sort_labs = tuple(lab for _, lab in ranked)
        predicts.append(list(range(1, len(sort_labs) + 1)))
        truths.append(sort_labs)
    return predicts, truths


def ddp_main(rank, world_size, args):
    """Evaluate a saved checkpoint on the test split inside one DDP worker.

    Args:
        rank: index of this process; also used as its device id.
        world_size: total number of processes taking part in the run.
        args: parsed command-line namespace; ``rank`` and ``world_size``
            are written onto it.
    """
    args.rank = rank
    args.world_size = world_size
    init_seed(rank + 1)
    if rank == 0 and args.log:
        # Only rank 0 mirrors its stdout into the log file.
        sys.stdout = Logger(args.log_file, sys.stdout)
    setup(rank, world_size)

    print('| distributed init rank {}'.format(rank))
    dist.barrier()

    # load model
    net, tokenizer = load_model(args.model_name, args)

    # load data
    # NOTE: unpickling can execute arbitrary code; only point --data_path at
    # files you produced yourself.
    with open(os.path.join(args.data_path, 'news.txt'), 'rb') as news_file:
        news_dict = pickle.load(news_file)
    test_dataset = MyDataset(args, tokenizer, news_dict, status='test')

    if rank == 0:
        print(args)
        print('Vocabulary size of tokenizer after adding new tokens : %d' % args.vocab_size)
        print(test_dataset[0]['sentence'])
        print('num test: %d' % len(test_dataset))

    # NOTE(review): DistributedSampler shuffles by default and pads the index
    # list so every rank gets the same number of samples; the DataLoader's
    # shuffle=False below does not change that -- confirm this is intended.
    test_sampler = DistributedSampler(test_dataset,
                                      rank=rank,
                                      num_replicas=world_size)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.test_batch_size,
                             sampler=test_sampler,
                             shuffle=False,
                             pin_memory=False,
                             num_workers=2,
                             collate_fn=test_dataset.collate_fn)

    net = net.to(rank)
    net = DDP(net, device_ids=[rank])

    dist.barrier()

    # Remap tensors that were saved from cuda:0 onto this worker's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    net.module.load_state_dict(torch.load(args.model_file, map_location=map_location))

    st_test = time.time()
    with torch.no_grad():
        test_scores, _acc, _acc_pos, _pos_ratio, test_impids, test_labels = \
            eval(net, rank, world_size, test_loader)

    predicts, truths = _rank_labels_by_impression(test_scores, test_impids, test_labels)
    auc_test, mrr_test, ndcg5_test, ndcg10_test = evaluate(predicts, truths)
    test_spend = (time.time() - st_test) / 60

    if rank == 0:
        print("Test: AUC: %0.4f\tMRR: %0.4f\tnDCG@5: %0.4f\tnDCG@10: %0.4f\t[Test-Time: %0.2f mim]" %
              (auc_test, mrr_test, ndcg5_test, ndcg10_test, test_spend))
    cleanup()
str(args.test_batch_size) + '-' + str(datetime.now())[-5:]+'.txt' 243 | args.log_file = log_file 244 | else: 245 | if args.data_path == '../DATA/MIND-Demo': 246 | if args.log: 247 | if not os.path.exists('./log-Test-Few'): 248 | os.makedirs('./log-Test-Few') 249 | log_file = './log-Test-Few/' + 'Tbs' + str(args.test_batch_size) + \ 250 | '-ratio' + str(args.ratio) + \ 251 | '-' + str(datetime.now())[-5:]+'.txt' 252 | args.log_file = log_file 253 | else: 254 | if args.log: 255 | if not os.path.exists('./log-Test-Small-Few'): 256 | os.makedirs('./log-Test-Small-Few') 257 | log_file = './log-Test-Small-Few/' + 'Tbs' + str(args.test_batch_size) + \ 258 | '-ratio' + str(args.ratio) + \ 259 | '-' + str(datetime.now())[-5:]+'.txt' 260 | args.log_file = log_file 261 | 262 | WORLD_SIZE = torch.cuda.device_count() 263 | mp.spawn(ddp_main, 264 | args=(WORLD_SIZE, args), 265 | nprocs=WORLD_SIZE, 266 | join=True) 267 | t1 = time.time() 268 | run_time = (t1 - t0) / 3600 269 | print('Running time: %0.4f' % run_time) -------------------------------------------------------------------------------- /Continuous-Action/predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pickle 4 | import time 5 | import sys 6 | 7 | from tqdm import tqdm 8 | from datetime import datetime 9 | import torch.cuda 10 | from torch.utils.data import DataLoader 11 | 12 | from transformers import BertTokenizer 13 | from transformers import AdamW 14 | 15 | import torch.distributed as dist 16 | import torch.multiprocessing as mp 17 | from torch.nn.parallel import DistributedDataParallel as DDP 18 | from torch.utils.data.distributed import DistributedSampler 19 | 20 | from model import BERTPrompt4NR 21 | from prepro_data import * 22 | from utils import evaluate 23 | 24 | 25 | def setup(rank, world_size): 26 | os.environ['MASTER_ADDR'] = 'localhost' 27 | os.environ['MASTER_PORT'] = '23342' 28 | dist.init_process_group("nccl", 
def load_tokenizer(model_name, args):
    """Build the BERT tokenizer extended with the prompt-specific tokens.

    ``[NSEP]`` is added first, followed by the three families of continuous
    prompt tokens (``[P*]``, ``[Q*]``, ``[M*]``) whose sizes come from
    ``args.num_conti1`` / ``num_conti2`` / ``num_conti3``. The resulting
    vocabulary size is stored in ``args.vocab_size``.

    Returns:
        ``(tokenizer, conti_tokens1, conti_tokens2, conti_tokens3)``.
    """
    def numbered(prefix, count):
        # '[P1]', '[P2]', ... up to `count`
        return ['[' + prefix + str(idx) + ']' for idx in range(1, count + 1)]

    tokenizer = BertTokenizer.from_pretrained(model_name)

    conti_tokens1 = numbered('P', args.num_conti1)
    conti_tokens2 = numbered('Q', args.num_conti2)
    conti_tokens3 = numbered('M', args.num_conti3)

    # Insertion order determines the new token ids, so keep [NSEP] first.
    tokenizer.add_tokens(['[NSEP]'])
    tokenizer.add_tokens(conti_tokens1 + conti_tokens2 + conti_tokens3)

    args.vocab_size = len(tokenizer)

    return tokenizer, conti_tokens1, conti_tokens2, conti_tokens3
def _rank_labels_by_impression(score_parts, impid_parts, label_parts):
    """Group gathered predictions by impression and order labels by score.

    Args:
        score_parts: one tensor of ranking scores per rank.
        impid_parts: one tensor of impression ids per rank, aligned
            element-wise with ``score_parts``.
        label_parts: one tensor of labels per rank, aligned likewise.

    Returns:
        A ``(predicts, truths)`` pair with one entry per impression:
        ``predicts`` holds the rank positions ``1..k`` and ``truths`` the
        labels of that impression sorted by descending score.
    """
    impressions = {}  # {imp_id: {'score': [...], 'lab': [...]}}
    for scores, imp_ids, labs in zip(score_parts, impid_parts, label_parts):
        assert scores.size() == imp_ids.size() == labs.size()
        rows = zip(scores.cpu().numpy().tolist(),
                   imp_ids.cpu().numpy().tolist(),
                   labs.cpu().numpy().tolist())
        for sco, imp, lab in rows:
            entry = impressions.setdefault(imp, {'score': [], 'lab': []})
            entry['score'].append(sco)
            entry['lab'].append(lab)

    predicts, truths = [], []
    for entry in impressions.values():
        ranked = sorted(zip(entry['score'], entry['lab']),
                        key=lambda pair: pair[0], reverse=True)
        sort_labs = tuple(lab for _, lab in ranked)
        predicts.append(list(range(1, len(sort_labs) + 1)))
        truths.append(sort_labs)
    return predicts, truths


def ddp_main(rank, world_size, args):
    """Evaluate a saved checkpoint on the test split inside one DDP worker.

    Args:
        rank: index of this process; also used as its device id.
        world_size: total number of processes taking part in the run.
        args: parsed command-line namespace; ``rank`` and ``world_size``
            are written onto it.
    """
    args.rank = rank
    args.world_size = world_size
    init_seed(rank + 1)
    if rank == 0 and args.log:
        # Only rank 0 mirrors its stdout into the log file.
        sys.stdout = Logger(args.log_file, sys.stdout)
    setup(rank, world_size)

    print('| distributed init rank {}'.format(rank))
    dist.barrier()

    # load tokenizer
    tokenizer, conti_tokens1, conti_tokens2, conti_tokens3 = load_tokenizer(args.model_name, args)
    conti_tokens = [conti_tokens1, conti_tokens2, conti_tokens3]

    # load model
    net = load_model(args.model_name, tokenizer, args)

    # load data
    # NOTE: unpickling can execute arbitrary code; only point --data_path at
    # files you produced yourself.
    with open(os.path.join(args.data_path, 'news.txt'), 'rb') as news_file:
        news_dict = pickle.load(news_file)
    test_dataset = MyDataset(args, tokenizer, news_dict, conti_tokens, status='test')

    if rank == 0:
        print(args)
        print('Vocabulary size of tokenizer after adding new tokens : %d' % args.vocab_size)
        print(test_dataset[0]['sentence'])
        print('num test: %d' % len(test_dataset))

    # NOTE(review): DistributedSampler shuffles by default and pads the index
    # list so every rank gets the same number of samples; the DataLoader's
    # shuffle=False below does not change that -- confirm this is intended.
    test_sampler = DistributedSampler(test_dataset,
                                      rank=rank,
                                      num_replicas=world_size)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.test_batch_size,
                             sampler=test_sampler,
                             shuffle=False,
                             pin_memory=False,
                             num_workers=2,
                             collate_fn=test_dataset.collate_fn)

    net = net.to(rank)
    net = DDP(net, device_ids=[rank])

    dist.barrier()

    # Remap tensors that were saved from cuda:0 onto this worker's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    net.module.load_state_dict(torch.load(args.model_file, map_location=map_location))

    st_test = time.time()
    with torch.no_grad():
        test_scores, _acc, _acc_pos, _pos_ratio, test_impids, test_labels = \
            eval(net, rank, world_size, test_loader)

    predicts, truths = _rank_labels_by_impression(test_scores, test_impids, test_labels)
    auc_test, mrr_test, ndcg5_test, ndcg10_test = evaluate(predicts, truths)
    test_spend = (time.time() - st_test) / 60

    if rank == 0:
        print("Test: AUC: %0.4f\tMRR: %0.4f\tnDCG@5: %0.4f\tnDCG@10: %0.4f\t[Test-Time: %0.2f mim]" %
              (auc_test, mrr_test, ndcg5_test, ndcg10_test, test_spend))
    cleanup()
default='cuda', help='device id') 244 | parser.add_argument('--world_size', default=2, type=int, help='number of distributed processes') 245 | 246 | parser.add_argument('--model_file', default='', type=str, help='model file') 247 | # parser.add_argument('--log', default=False, type=bool, help='whether write log file') 248 | parser.add_argument('--log', default=True, type=bool, help='whether write log file') 249 | 250 | args = parser.parse_args() 251 | 252 | if args.data_path == '../DATA/MIND-Demo': 253 | if args.log: 254 | if not os.path.exists('./log-Test'): 255 | os.makedirs('./log-Test') 256 | log_file = './log-Test/' + 'Tbs' + str(args.test_batch_size) + \ 257 | '-n' + str(args.num_conti1) + str(args.num_conti2) + str(args.num_conti3) + \ 258 | '-' + str(datetime.now())[-5:]+'.txt' 259 | args.log_file = log_file 260 | else: 261 | if args.log: 262 | if not os.path.exists('./log-Test-Small'): 263 | os.makedirs('./log-Test-Small') 264 | log_file = './log-Test-Small/' + 'Tbs' + str(args.test_batch_size) + \ 265 | '-n' + str(args.num_conti1) + str(args.num_conti2) + str(args.num_conti3) + \ 266 | '-' + str(datetime.now())[-5:]+'.txt' 267 | args.log_file = log_file 268 | 269 | WORLD_SIZE = torch.cuda.device_count() 270 | mp.spawn(ddp_main, 271 | args=(WORLD_SIZE, args), 272 | nprocs=WORLD_SIZE, 273 | join=True) 274 | t1 = time.time() 275 | run_time = (t1 - t0) / 3600 276 | print('Running time: %0.4f' % run_time) --------------------------------------------------------------------------------