├── Imgs
│   ├── Prompt4NR.png
│   └── templates_table.png
├── Discrete-Action
│   ├── run.sh
│   ├── model.py
│   ├── utils.py
│   ├── prepro_data.py
│   └── predict.py
├── Discrete-Emotion
│   ├── run.sh
│   ├── model.py
│   ├── utils.py
│   ├── prepro_data.py
│   └── predict.py
├── Discrete-Relevance
│   ├── run.sh
│   ├── model.py
│   ├── utils.py
│   ├── prepro_data.py
│   └── predict.py
├── Hybrid-Relevance
│   ├── run.sh
│   ├── model.py
│   ├── utils.py
│   ├── prepro_data.py
│   └── predict.py
├── Hybrid-Utility
│   ├── run.sh
│   ├── model.py
│   ├── utils.py
│   ├── prepro_data.py
│   └── predict.py
├── Hybrid-Emotion
│   ├── run.sh
│   ├── model.py
│   ├── utils.py
│   ├── prepro_data.py
│   └── predict.py
├── Continuous-Action
│   ├── run.sh
│   ├── model.py
│   ├── utils.py
│   ├── prepro_data.py
│   └── predict.py
├── Continuous-Emotion
│   ├── run.sh
│   ├── model.py
│   ├── utils.py
│   └── prepro_data.py
├── Continuous-Relevance
│   ├── run.sh
│   ├── model.py
│   ├── utils.py
│   └── prepro_data.py
├── Hybrid-Action
│   ├── model.py
│   ├── utils.py
│   ├── run.sh
│   └── prepro_data.py
├── Continuous-Utility
│   ├── model.py
│   ├── utils.py
│   ├── run.sh
│   └── prepro_data.py
├── Discrete-Utility
│   ├── model.py
│   ├── utils.py
│   ├── run.sh
│   ├── prepro_data.py
│   └── predict.py
└── README.md
/Imgs/Prompt4NR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/resistzzz/Prompt4NR/HEAD/Imgs/Prompt4NR.png
--------------------------------------------------------------------------------
/Imgs/templates_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/resistzzz/Prompt4NR/HEAD/Imgs/templates_table.png
--------------------------------------------------------------------------------
/Discrete-Action/run.sh:
--------------------------------------------------------------------------------
# Train on MIND-Small, then score the test set with the saved checkpoint.
# Stop at the first failing command so predict.py never runs against a stale
# or missing ./temp/BestModel.pt (assumes training writes it there -- TODO confirm).
set -e
python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 3 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
--------------------------------------------------------------------------------
/Discrete-Emotion/run.sh:
--------------------------------------------------------------------------------
# Train on MIND-Small, then score the test set with the saved checkpoint.
# Stop at the first failing command so predict.py never runs against a stale
# or missing ./temp/BestModel.pt (assumes training writes it there -- TODO confirm).
set -e
python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
--------------------------------------------------------------------------------
/Discrete-Relevance/run.sh:
--------------------------------------------------------------------------------
# Train on MIND-Small, then score the test set with the saved checkpoint.
# Stop at the first failing command so predict.py never runs against a stale
# or missing ./temp/BestModel.pt (assumes training writes it there -- TODO confirm).
set -e
python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
--------------------------------------------------------------------------------
/Hybrid-Relevance/run.sh:
--------------------------------------------------------------------------------
# Train on MIND-Small, then score the test set with the saved checkpoint.
# Stop at the first failing command so predict.py never runs against a stale
# or missing ./temp/BestModel.pt (assumes training writes it there -- TODO confirm).
set -e
python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --num_conti1 3 --num_conti2 3 --wd 1e-3 --max_tokens 500 --log True --model_save True
python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --num_conti1 3 --num_conti2 3 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
--------------------------------------------------------------------------------
/Hybrid-Utility/run.sh:
--------------------------------------------------------------------------------
# Train on MIND-Small, then score the test set with the saved checkpoint.
# Stop at the first failing command so predict.py never runs against a stale
# or missing ./temp/BestModel.pt (assumes training writes it there -- TODO confirm).
set -e
python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --num_conti1 3 --num_conti2 3 --wd 1e-3 --max_tokens 500 --log True --model_save True
python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --num_conti1 3 --num_conti2 3 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
--------------------------------------------------------------------------------
/Hybrid-Emotion/run.sh:
--------------------------------------------------------------------------------
# Train on MIND-Small, then score the test set with the saved checkpoint.
# Stop at the first failing command so predict.py never runs against a stale
# or missing ./temp/BestModel.pt (assumes training writes it there -- TODO confirm).
set -e
python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --num_conti1 3 --num_conti2 3 --wd 1e-3 --max_tokens 500 --log True --model_save True
python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --num_conti1 3 --num_conti2 3 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
3 |
--------------------------------------------------------------------------------
/Continuous-Action/run.sh:
--------------------------------------------------------------------------------
# Train on MIND-Small, then score the test set with the saved checkpoint.
# Stop at the first failing command so predict.py never runs against a stale
# or missing ./temp/BestModel.pt (assumes training writes it there -- TODO confirm).
set -e
python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --num_conti1 3 --num_conti2 3 --num_conti3 3 --wd 1e-3 --max_tokens 500 --log True --model_save True
python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --num_conti1 3 --num_conti2 3 --num_conti3 3 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
--------------------------------------------------------------------------------
/Continuous-Emotion/run.sh:
--------------------------------------------------------------------------------
# Train on MIND-Small, then score the test set with the saved checkpoint.
# Stop at the first failing command so predict.py never runs against a stale
# or missing ./temp/BestModel.pt (assumes training writes it there -- TODO confirm).
set -e
python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --num_conti1 3 --num_conti2 3 --num_conti3 3 --wd 1e-3 --max_tokens 500 --log True --model_save True
python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --num_conti1 3 --num_conti2 3 --num_conti3 3 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
--------------------------------------------------------------------------------
/Continuous-Relevance/run.sh:
--------------------------------------------------------------------------------
# Train on MIND-Small, then score the test set with the saved checkpoint.
# Stop at the first failing command so predict.py never runs against a stale
# or missing ./temp/BestModel.pt (assumes training writes it there -- TODO confirm).
set -e
python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --num_conti1 3 --num_conti2 3 --num_conti3 3 --wd 1e-3 --max_tokens 500 --log True --model_save True
python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --num_conti1 3 --num_conti2 3 --num_conti3 3 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
--------------------------------------------------------------------------------
/Hybrid-Action/model.py:
--------------------------------------------------------------------------------
1 |
2 | from transformers import BertForMaskedLM
3 | import torch.nn as nn
4 |
5 |
class BERTPrompt4NR(nn.Module):
    """Masked-LM prompt model that scores a fixed set of answer tokens.

    Wraps ``BertForMaskedLM``. For each input row, the logits at the last
    mask-token position are restricted to ``answer_ids`` and trained with
    cross-entropy against the label.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and resize its token embeddings.

        Args:
            model_name: Identifier passed to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Indices selected along the vocabulary axis of the
                mask logits (one per answer class).
            args: Object providing ``vocab_size`` for the embedding resize.
        """
        super().__init__()
        backbone = BertForMaskedLM.from_pretrained(model_name)
        backbone.resize_token_embeddings(args.vocab_size)

        # Explicitly mark every backbone weight as trainable.
        for weight in backbone.parameters():
            weight.requires_grad = True

        self.BERT = backbone
        self.answer_ids = answer_ids
        # NOTE(review): presumably the [MASK] id of the tokenizer in use -- confirm.
        self.mask_token_id = 103
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Return ``(loss, probabilities)`` over the answer tokens.

        Args:
            batch_enc: Token ids fed to the model as ``input_ids``.
            batch_attn: Attention mask matching ``batch_enc``.
            batch_labs: Target indices into ``answer_ids`` for cross-entropy.

        Returns:
            The cross-entropy loss and the softmax over the answer logits
            (softmax taken along dim 1).
        """
        logits = self.BERT(input_ids=batch_enc,
                           attention_mask=batch_attn).logits
        n_rows, vocab_dim = logits.size(0), logits.size(-1)

        is_mask = batch_enc.eq(self.mask_token_id)
        # Regroup the selected mask logits per row; the view only succeeds
        # when the mask count divides evenly across rows. Keep the last mask.
        per_row = logits[is_mask, :].view(n_rows, -1, vocab_dim)
        last_mask_logits = per_row[:, -1, :]

        answer_logits = last_mask_logits[:, self.answer_ids]
        loss = self.loss_func(answer_logits, batch_labs)
        return loss, answer_logits.softmax(dim=1)
32 |
--------------------------------------------------------------------------------
/Hybrid-Emotion/model.py:
--------------------------------------------------------------------------------
1 |
2 | from transformers import BertForMaskedLM
3 | import torch.nn as nn
4 |
5 |
class BERTPrompt4NR(nn.Module):
    """Masked-LM prompt model that scores a fixed set of answer tokens.

    Wraps ``BertForMaskedLM``. For each input row, the logits at the last
    mask-token position are restricted to ``answer_ids`` and trained with
    cross-entropy against the label.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and resize its token embeddings.

        Args:
            model_name: Identifier passed to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Indices selected along the vocabulary axis of the
                mask logits (one per answer class).
            args: Object providing ``vocab_size`` for the embedding resize.
        """
        super().__init__()
        backbone = BertForMaskedLM.from_pretrained(model_name)
        backbone.resize_token_embeddings(args.vocab_size)

        # Explicitly mark every backbone weight as trainable.
        for weight in backbone.parameters():
            weight.requires_grad = True

        self.BERT = backbone
        self.answer_ids = answer_ids
        # NOTE(review): presumably the [MASK] id of the tokenizer in use -- confirm.
        self.mask_token_id = 103
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Return ``(loss, probabilities)`` over the answer tokens.

        Args:
            batch_enc: Token ids fed to the model as ``input_ids``.
            batch_attn: Attention mask matching ``batch_enc``.
            batch_labs: Target indices into ``answer_ids`` for cross-entropy.

        Returns:
            The cross-entropy loss and the softmax over the answer logits
            (softmax taken along dim 1).
        """
        logits = self.BERT(input_ids=batch_enc,
                           attention_mask=batch_attn).logits
        n_rows, vocab_dim = logits.size(0), logits.size(-1)

        is_mask = batch_enc.eq(self.mask_token_id)
        # Regroup the selected mask logits per row; the view only succeeds
        # when the mask count divides evenly across rows. Keep the last mask.
        per_row = logits[is_mask, :].view(n_rows, -1, vocab_dim)
        last_mask_logits = per_row[:, -1, :]

        answer_logits = last_mask_logits[:, self.answer_ids]
        loss = self.loss_func(answer_logits, batch_labs)
        return loss, answer_logits.softmax(dim=1)
32 |
--------------------------------------------------------------------------------
/Hybrid-Utility/model.py:
--------------------------------------------------------------------------------
1 |
2 | from transformers import BertForMaskedLM
3 | import torch.nn as nn
4 |
5 |
class BERTPrompt4NR(nn.Module):
    """Masked-LM prompt model that scores a fixed set of answer tokens.

    Wraps ``BertForMaskedLM``. For each input row, the logits at the last
    mask-token position are restricted to ``answer_ids`` and trained with
    cross-entropy against the label.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and resize its token embeddings.

        Args:
            model_name: Identifier passed to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Indices selected along the vocabulary axis of the
                mask logits (one per answer class).
            args: Object providing ``vocab_size`` for the embedding resize.
        """
        super().__init__()
        backbone = BertForMaskedLM.from_pretrained(model_name)
        backbone.resize_token_embeddings(args.vocab_size)

        # Explicitly mark every backbone weight as trainable.
        for weight in backbone.parameters():
            weight.requires_grad = True

        self.BERT = backbone
        self.answer_ids = answer_ids
        # NOTE(review): presumably the [MASK] id of the tokenizer in use -- confirm.
        self.mask_token_id = 103
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Return ``(loss, probabilities)`` over the answer tokens.

        Args:
            batch_enc: Token ids fed to the model as ``input_ids``.
            batch_attn: Attention mask matching ``batch_enc``.
            batch_labs: Target indices into ``answer_ids`` for cross-entropy.

        Returns:
            The cross-entropy loss and the softmax over the answer logits
            (softmax taken along dim 1).
        """
        logits = self.BERT(input_ids=batch_enc,
                           attention_mask=batch_attn).logits
        n_rows, vocab_dim = logits.size(0), logits.size(-1)

        is_mask = batch_enc.eq(self.mask_token_id)
        # Regroup the selected mask logits per row; the view only succeeds
        # when the mask count divides evenly across rows. Keep the last mask.
        per_row = logits[is_mask, :].view(n_rows, -1, vocab_dim)
        last_mask_logits = per_row[:, -1, :]

        answer_logits = last_mask_logits[:, self.answer_ids]
        loss = self.loss_func(answer_logits, batch_labs)
        return loss, answer_logits.softmax(dim=1)
32 |
--------------------------------------------------------------------------------
/Continuous-Action/model.py:
--------------------------------------------------------------------------------
1 |
2 | from transformers import BertForMaskedLM
3 | import torch.nn as nn
4 |
5 |
class BERTPrompt4NR(nn.Module):
    """Masked-LM prompt model that scores a fixed set of answer tokens.

    Wraps ``BertForMaskedLM``. For each input row, the logits at the last
    mask-token position are restricted to ``answer_ids`` and trained with
    cross-entropy against the label.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and resize its token embeddings.

        Args:
            model_name: Identifier passed to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Indices selected along the vocabulary axis of the
                mask logits (one per answer class).
            args: Object providing ``vocab_size`` for the embedding resize.
        """
        super().__init__()
        backbone = BertForMaskedLM.from_pretrained(model_name)
        backbone.resize_token_embeddings(args.vocab_size)

        # Explicitly mark every backbone weight as trainable.
        for weight in backbone.parameters():
            weight.requires_grad = True

        self.BERT = backbone
        self.answer_ids = answer_ids
        # NOTE(review): presumably the [MASK] id of the tokenizer in use -- confirm.
        self.mask_token_id = 103
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Return ``(loss, probabilities)`` over the answer tokens.

        Args:
            batch_enc: Token ids fed to the model as ``input_ids``.
            batch_attn: Attention mask matching ``batch_enc``.
            batch_labs: Target indices into ``answer_ids`` for cross-entropy.

        Returns:
            The cross-entropy loss and the softmax over the answer logits
            (softmax taken along dim 1).
        """
        logits = self.BERT(input_ids=batch_enc,
                           attention_mask=batch_attn).logits
        n_rows, vocab_dim = logits.size(0), logits.size(-1)

        is_mask = batch_enc.eq(self.mask_token_id)
        # Regroup the selected mask logits per row; the view only succeeds
        # when the mask count divides evenly across rows. Keep the last mask.
        per_row = logits[is_mask, :].view(n_rows, -1, vocab_dim)
        last_mask_logits = per_row[:, -1, :]

        answer_logits = last_mask_logits[:, self.answer_ids]
        loss = self.loss_func(answer_logits, batch_labs)
        return loss, answer_logits.softmax(dim=1)
32 |
--------------------------------------------------------------------------------
/Continuous-Emotion/model.py:
--------------------------------------------------------------------------------
1 |
2 | from transformers import BertForMaskedLM
3 | import torch.nn as nn
4 |
5 |
class BERTPrompt4NR(nn.Module):
    """Masked-LM prompt model that scores a fixed set of answer tokens.

    Wraps ``BertForMaskedLM``. For each input row, the logits at the last
    mask-token position are restricted to ``answer_ids`` and trained with
    cross-entropy against the label.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and resize its token embeddings.

        Args:
            model_name: Identifier passed to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Indices selected along the vocabulary axis of the
                mask logits (one per answer class).
            args: Object providing ``vocab_size`` for the embedding resize.
        """
        super().__init__()
        backbone = BertForMaskedLM.from_pretrained(model_name)
        backbone.resize_token_embeddings(args.vocab_size)

        # Explicitly mark every backbone weight as trainable.
        for weight in backbone.parameters():
            weight.requires_grad = True

        self.BERT = backbone
        self.answer_ids = answer_ids
        # NOTE(review): presumably the [MASK] id of the tokenizer in use -- confirm.
        self.mask_token_id = 103
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Return ``(loss, probabilities)`` over the answer tokens.

        Args:
            batch_enc: Token ids fed to the model as ``input_ids``.
            batch_attn: Attention mask matching ``batch_enc``.
            batch_labs: Target indices into ``answer_ids`` for cross-entropy.

        Returns:
            The cross-entropy loss and the softmax over the answer logits
            (softmax taken along dim 1).
        """
        logits = self.BERT(input_ids=batch_enc,
                           attention_mask=batch_attn).logits
        n_rows, vocab_dim = logits.size(0), logits.size(-1)

        is_mask = batch_enc.eq(self.mask_token_id)
        # Regroup the selected mask logits per row; the view only succeeds
        # when the mask count divides evenly across rows. Keep the last mask.
        per_row = logits[is_mask, :].view(n_rows, -1, vocab_dim)
        last_mask_logits = per_row[:, -1, :]

        answer_logits = last_mask_logits[:, self.answer_ids]
        loss = self.loss_func(answer_logits, batch_labs)
        return loss, answer_logits.softmax(dim=1)
32 |
--------------------------------------------------------------------------------
/Continuous-Utility/model.py:
--------------------------------------------------------------------------------
1 |
2 | from transformers import BertForMaskedLM
3 | import torch.nn as nn
4 |
5 |
class BERTPrompt4NR(nn.Module):
    """Masked-LM prompt model that scores a fixed set of answer tokens.

    Wraps ``BertForMaskedLM``. For each input row, the logits at the last
    mask-token position are restricted to ``answer_ids`` and trained with
    cross-entropy against the label.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and resize its token embeddings.

        Args:
            model_name: Identifier passed to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Indices selected along the vocabulary axis of the
                mask logits (one per answer class).
            args: Object providing ``vocab_size`` for the embedding resize.
        """
        super().__init__()
        backbone = BertForMaskedLM.from_pretrained(model_name)
        backbone.resize_token_embeddings(args.vocab_size)

        # Explicitly mark every backbone weight as trainable.
        for weight in backbone.parameters():
            weight.requires_grad = True

        self.BERT = backbone
        self.answer_ids = answer_ids
        # NOTE(review): presumably the [MASK] id of the tokenizer in use -- confirm.
        self.mask_token_id = 103
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Return ``(loss, probabilities)`` over the answer tokens.

        Args:
            batch_enc: Token ids fed to the model as ``input_ids``.
            batch_attn: Attention mask matching ``batch_enc``.
            batch_labs: Target indices into ``answer_ids`` for cross-entropy.

        Returns:
            The cross-entropy loss and the softmax over the answer logits
            (softmax taken along dim 1).
        """
        logits = self.BERT(input_ids=batch_enc,
                           attention_mask=batch_attn).logits
        n_rows, vocab_dim = logits.size(0), logits.size(-1)

        is_mask = batch_enc.eq(self.mask_token_id)
        # Regroup the selected mask logits per row; the view only succeeds
        # when the mask count divides evenly across rows. Keep the last mask.
        per_row = logits[is_mask, :].view(n_rows, -1, vocab_dim)
        last_mask_logits = per_row[:, -1, :]

        answer_logits = last_mask_logits[:, self.answer_ids]
        loss = self.loss_func(answer_logits, batch_labs)
        return loss, answer_logits.softmax(dim=1)
32 |
--------------------------------------------------------------------------------
/Discrete-Action/model.py:
--------------------------------------------------------------------------------
1 |
2 | from transformers import BertForMaskedLM
3 | import torch.nn as nn
4 |
5 |
class BERTPrompt4NR(nn.Module):
    """Masked-LM prompt model that scores a fixed set of answer tokens.

    Wraps ``BertForMaskedLM``. For each input row, the logits at the last
    mask-token position are restricted to ``answer_ids`` and trained with
    cross-entropy against the label.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and resize its token embeddings.

        Args:
            model_name: Identifier passed to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Indices selected along the vocabulary axis of the
                mask logits (one per answer class).
            args: Object providing ``vocab_size`` for the embedding resize.
        """
        super().__init__()
        backbone = BertForMaskedLM.from_pretrained(model_name)
        backbone.resize_token_embeddings(args.vocab_size)

        # Explicitly mark every backbone weight as trainable.
        for weight in backbone.parameters():
            weight.requires_grad = True

        self.BERT = backbone
        self.answer_ids = answer_ids
        # NOTE(review): presumably the [MASK] id of the tokenizer in use -- confirm.
        self.mask_token_id = 103
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Return ``(loss, probabilities)`` over the answer tokens.

        Args:
            batch_enc: Token ids fed to the model as ``input_ids``.
            batch_attn: Attention mask matching ``batch_enc``.
            batch_labs: Target indices into ``answer_ids`` for cross-entropy.

        Returns:
            The cross-entropy loss and the softmax over the answer logits
            (softmax taken along dim 1).
        """
        logits = self.BERT(input_ids=batch_enc,
                           attention_mask=batch_attn).logits
        n_rows, vocab_dim = logits.size(0), logits.size(-1)

        is_mask = batch_enc.eq(self.mask_token_id)
        # Regroup the selected mask logits per row; the view only succeeds
        # when the mask count divides evenly across rows. Keep the last mask.
        per_row = logits[is_mask, :].view(n_rows, -1, vocab_dim)
        last_mask_logits = per_row[:, -1, :]

        answer_logits = last_mask_logits[:, self.answer_ids]
        loss = self.loss_func(answer_logits, batch_labs)
        return loss, answer_logits.softmax(dim=1)
32 |
--------------------------------------------------------------------------------
/Discrete-Emotion/model.py:
--------------------------------------------------------------------------------
1 |
2 | from transformers import BertForMaskedLM
3 | import torch.nn as nn
4 |
5 |
class BERTPrompt4NR(nn.Module):
    """Masked-LM prompt model that scores a fixed set of answer tokens.

    Wraps ``BertForMaskedLM``. For each input row, the logits at the last
    mask-token position are restricted to ``answer_ids`` and trained with
    cross-entropy against the label.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and resize its token embeddings.

        Args:
            model_name: Identifier passed to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Indices selected along the vocabulary axis of the
                mask logits (one per answer class).
            args: Object providing ``vocab_size`` for the embedding resize.
        """
        super().__init__()
        backbone = BertForMaskedLM.from_pretrained(model_name)
        backbone.resize_token_embeddings(args.vocab_size)

        # Explicitly mark every backbone weight as trainable.
        for weight in backbone.parameters():
            weight.requires_grad = True

        self.BERT = backbone
        self.answer_ids = answer_ids
        # NOTE(review): presumably the [MASK] id of the tokenizer in use -- confirm.
        self.mask_token_id = 103
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Return ``(loss, probabilities)`` over the answer tokens.

        Args:
            batch_enc: Token ids fed to the model as ``input_ids``.
            batch_attn: Attention mask matching ``batch_enc``.
            batch_labs: Target indices into ``answer_ids`` for cross-entropy.

        Returns:
            The cross-entropy loss and the softmax over the answer logits
            (softmax taken along dim 1).
        """
        logits = self.BERT(input_ids=batch_enc,
                           attention_mask=batch_attn).logits
        n_rows, vocab_dim = logits.size(0), logits.size(-1)

        is_mask = batch_enc.eq(self.mask_token_id)
        # Regroup the selected mask logits per row; the view only succeeds
        # when the mask count divides evenly across rows. Keep the last mask.
        per_row = logits[is_mask, :].view(n_rows, -1, vocab_dim)
        last_mask_logits = per_row[:, -1, :]

        answer_logits = last_mask_logits[:, self.answer_ids]
        loss = self.loss_func(answer_logits, batch_labs)
        return loss, answer_logits.softmax(dim=1)
32 |
--------------------------------------------------------------------------------
/Discrete-Relevance/model.py:
--------------------------------------------------------------------------------
1 |
2 | from transformers import BertForMaskedLM
3 | import torch.nn as nn
4 |
5 |
class BERTPrompt4NR(nn.Module):
    """Masked-LM prompt model that scores a fixed set of answer tokens.

    Wraps ``BertForMaskedLM``. For each input row, the logits at the last
    mask-token position are restricted to ``answer_ids`` and trained with
    cross-entropy against the label.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and resize its token embeddings.

        Args:
            model_name: Identifier passed to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Indices selected along the vocabulary axis of the
                mask logits (one per answer class).
            args: Object providing ``vocab_size`` for the embedding resize.
        """
        super().__init__()
        backbone = BertForMaskedLM.from_pretrained(model_name)
        backbone.resize_token_embeddings(args.vocab_size)

        # Explicitly mark every backbone weight as trainable.
        for weight in backbone.parameters():
            weight.requires_grad = True

        self.BERT = backbone
        self.answer_ids = answer_ids
        # NOTE(review): presumably the [MASK] id of the tokenizer in use -- confirm.
        self.mask_token_id = 103
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Return ``(loss, probabilities)`` over the answer tokens.

        Args:
            batch_enc: Token ids fed to the model as ``input_ids``.
            batch_attn: Attention mask matching ``batch_enc``.
            batch_labs: Target indices into ``answer_ids`` for cross-entropy.

        Returns:
            The cross-entropy loss and the softmax over the answer logits
            (softmax taken along dim 1).
        """
        logits = self.BERT(input_ids=batch_enc,
                           attention_mask=batch_attn).logits
        n_rows, vocab_dim = logits.size(0), logits.size(-1)

        is_mask = batch_enc.eq(self.mask_token_id)
        # Regroup the selected mask logits per row; the view only succeeds
        # when the mask count divides evenly across rows. Keep the last mask.
        per_row = logits[is_mask, :].view(n_rows, -1, vocab_dim)
        last_mask_logits = per_row[:, -1, :]

        answer_logits = last_mask_logits[:, self.answer_ids]
        loss = self.loss_func(answer_logits, batch_labs)
        return loss, answer_logits.softmax(dim=1)
32 |
--------------------------------------------------------------------------------
/Discrete-Utility/model.py:
--------------------------------------------------------------------------------
1 |
2 | from transformers import BertForMaskedLM
3 | import torch.nn as nn
4 |
5 |
class BERTPrompt4NR(nn.Module):
    """Masked-LM prompt model that scores a fixed set of answer tokens.

    Wraps ``BertForMaskedLM``. For each input row, the logits at the last
    mask-token position are restricted to ``answer_ids`` and trained with
    cross-entropy against the label.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and resize its token embeddings.

        Args:
            model_name: Identifier passed to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Indices selected along the vocabulary axis of the
                mask logits (one per answer class).
            args: Object providing ``vocab_size`` for the embedding resize.
        """
        super().__init__()
        backbone = BertForMaskedLM.from_pretrained(model_name)
        backbone.resize_token_embeddings(args.vocab_size)

        # Explicitly mark every backbone weight as trainable.
        for weight in backbone.parameters():
            weight.requires_grad = True

        self.BERT = backbone
        self.answer_ids = answer_ids
        # NOTE(review): presumably the [MASK] id of the tokenizer in use -- confirm.
        self.mask_token_id = 103
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Return ``(loss, probabilities)`` over the answer tokens.

        Args:
            batch_enc: Token ids fed to the model as ``input_ids``.
            batch_attn: Attention mask matching ``batch_enc``.
            batch_labs: Target indices into ``answer_ids`` for cross-entropy.

        Returns:
            The cross-entropy loss and the softmax over the answer logits
            (softmax taken along dim 1).
        """
        logits = self.BERT(input_ids=batch_enc,
                           attention_mask=batch_attn).logits
        n_rows, vocab_dim = logits.size(0), logits.size(-1)

        is_mask = batch_enc.eq(self.mask_token_id)
        # Regroup the selected mask logits per row; the view only succeeds
        # when the mask count divides evenly across rows. Keep the last mask.
        per_row = logits[is_mask, :].view(n_rows, -1, vocab_dim)
        last_mask_logits = per_row[:, -1, :]

        answer_logits = last_mask_logits[:, self.answer_ids]
        loss = self.loss_func(answer_logits, batch_labs)
        return loss, answer_logits.softmax(dim=1)
32 |
--------------------------------------------------------------------------------
/Hybrid-Relevance/model.py:
--------------------------------------------------------------------------------
1 |
2 | from transformers import BertForMaskedLM
3 | import torch.nn as nn
4 |
5 |
class BERTPrompt4NR(nn.Module):
    """Masked-LM prompt model that scores a fixed set of answer tokens.

    Wraps ``BertForMaskedLM``. For each input row, the logits at the last
    mask-token position are restricted to ``answer_ids`` and trained with
    cross-entropy against the label.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and resize its token embeddings.

        Args:
            model_name: Identifier passed to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Indices selected along the vocabulary axis of the
                mask logits (one per answer class).
            args: Object providing ``vocab_size`` for the embedding resize.
        """
        super().__init__()
        backbone = BertForMaskedLM.from_pretrained(model_name)
        backbone.resize_token_embeddings(args.vocab_size)

        # Explicitly mark every backbone weight as trainable.
        for weight in backbone.parameters():
            weight.requires_grad = True

        self.BERT = backbone
        self.answer_ids = answer_ids
        # NOTE(review): presumably the [MASK] id of the tokenizer in use -- confirm.
        self.mask_token_id = 103
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Return ``(loss, probabilities)`` over the answer tokens.

        Args:
            batch_enc: Token ids fed to the model as ``input_ids``.
            batch_attn: Attention mask matching ``batch_enc``.
            batch_labs: Target indices into ``answer_ids`` for cross-entropy.

        Returns:
            The cross-entropy loss and the softmax over the answer logits
            (softmax taken along dim 1).
        """
        logits = self.BERT(input_ids=batch_enc,
                           attention_mask=batch_attn).logits
        n_rows, vocab_dim = logits.size(0), logits.size(-1)

        is_mask = batch_enc.eq(self.mask_token_id)
        # Regroup the selected mask logits per row; the view only succeeds
        # when the mask count divides evenly across rows. Keep the last mask.
        per_row = logits[is_mask, :].view(n_rows, -1, vocab_dim)
        last_mask_logits = per_row[:, -1, :]

        answer_logits = last_mask_logits[:, self.answer_ids]
        loss = self.loss_func(answer_logits, batch_labs)
        return loss, answer_logits.softmax(dim=1)
32 |
--------------------------------------------------------------------------------
/Continuous-Relevance/model.py:
--------------------------------------------------------------------------------
1 |
2 | from transformers import BertForMaskedLM
3 | import torch.nn as nn
4 |
5 |
class BERTPrompt4NR(nn.Module):
    """Masked-LM prompt model that scores a fixed set of answer tokens.

    Wraps ``BertForMaskedLM``. For each input row, the logits at the last
    mask-token position are restricted to ``answer_ids`` and trained with
    cross-entropy against the label.
    """

    def __init__(self, model_name, answer_ids, args):
        """Load the pretrained masked LM and resize its token embeddings.

        Args:
            model_name: Identifier passed to ``BertForMaskedLM.from_pretrained``.
            answer_ids: Indices selected along the vocabulary axis of the
                mask logits (one per answer class).
            args: Object providing ``vocab_size`` for the embedding resize.
        """
        super().__init__()
        backbone = BertForMaskedLM.from_pretrained(model_name)
        backbone.resize_token_embeddings(args.vocab_size)

        # Explicitly mark every backbone weight as trainable.
        for weight in backbone.parameters():
            weight.requires_grad = True

        self.BERT = backbone
        self.answer_ids = answer_ids
        # NOTE(review): presumably the [MASK] id of the tokenizer in use -- confirm.
        self.mask_token_id = 103
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, batch_enc, batch_attn, batch_labs):
        """Return ``(loss, probabilities)`` over the answer tokens.

        Args:
            batch_enc: Token ids fed to the model as ``input_ids``.
            batch_attn: Attention mask matching ``batch_enc``.
            batch_labs: Target indices into ``answer_ids`` for cross-entropy.

        Returns:
            The cross-entropy loss and the softmax over the answer logits
            (softmax taken along dim 1).
        """
        logits = self.BERT(input_ids=batch_enc,
                           attention_mask=batch_attn).logits
        n_rows, vocab_dim = logits.size(0), logits.size(-1)

        is_mask = batch_enc.eq(self.mask_token_id)
        # Regroup the selected mask logits per row; the view only succeeds
        # when the mask count divides evenly across rows. Keep the last mask.
        per_row = logits[is_mask, :].view(n_rows, -1, vocab_dim)
        last_mask_logits = per_row[:, -1, :]

        answer_logits = last_mask_logits[:, self.answer_ids]
        loss = self.loss_func(answer_logits, batch_labs)
        return loss, answer_logits.softmax(dim=1)
32 |
--------------------------------------------------------------------------------
/Hybrid-Action/utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import roc_auc_score
2 | import numpy as np
3 |
4 |
def dcg_score(y_true, y_score, k=10):
    """Return the discounted cumulative gain of the top-``k`` ranked items.

    Items are ranked by descending ``y_score``; each contributes a gain of
    ``2 ** label - 1`` discounted by ``log2(position + 2)`` (0-based position).
    """
    top_k = np.argsort(y_score)[::-1][:k]
    ranked_labels = np.take(y_true, top_k)
    positions = np.arange(len(ranked_labels))
    return np.sum((2 ** ranked_labels - 1) / np.log2(positions + 2))
11 |
12 |
def ndcg_score(y_true, y_score, k=10):
    """Return DCG@k of ``y_score`` normalised by the ideal DCG@k.

    The ideal DCG is obtained by ranking the items by their own labels.
    When it is zero (no relevant item among the labels) the score is
    defined as 0.0 rather than dividing by zero and returning nan.
    """
    ideal = dcg_score(y_true, y_true, k)
    if ideal == 0:
        return 0.0
    return dcg_score(y_true, y_score, k) / ideal
17 |
18 |
def mrr_score(y_true, y_score):
    """Return the label-weighted mean reciprocal rank.

    Items are ranked by descending ``y_score``; the reciprocal ranks
    (1-based) are weighted by the labels and divided by the label sum.
    Returns 0.0 when the labels sum to zero instead of producing nan.
    """
    ranking = np.argsort(y_score)[::-1]
    ranked_labels = np.take(y_true, ranking)
    label_total = np.sum(ranked_labels)
    if label_total == 0:
        # No relevant item: the ratio below would be 0 / 0.
        return 0.0
    reciprocal = ranked_labels / (np.arange(len(ranked_labels)) + 1)
    return np.sum(reciprocal) / label_total
24 |
25 |
def evaluate(predicts, truths):
    """Return mean AUC, MRR, nDCG@5 and nDCG@10 over paired lists.

    ``predicts`` and ``truths`` are iterated in lockstep; each pair is
    scored independently and the four metrics are averaged.
    """
    collected = {"auc": [], "mrr": [], "ndcg5": [], "ndcg10": []}
    for pred, truth in zip(predicts, truths):
        labels = np.array(truth, dtype='float32')
        # The predictions are inverted before scoring.
        # NOTE(review): presumably `pred` holds rank positions (smaller is
        # better) -- confirm against the caller.
        scores = 1.0 / np.array(pred, dtype='float32')
        collected["auc"].append(roc_auc_score(labels, scores))
        collected["mrr"].append(mrr_score(labels, scores))
        collected["ndcg5"].append(ndcg_score(labels, scores, 5))
        collected["ndcg10"].append(ndcg_score(labels, scores, 10))
    return tuple(np.mean(collected[key])
                 for key in ("auc", "mrr", "ndcg5", "ndcg10"))
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/Hybrid-Emotion/utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import roc_auc_score
2 | import numpy as np
3 |
4 |
def dcg_score(y_true, y_score, k=10):
    """Return the discounted cumulative gain of the top-``k`` ranked items.

    Items are ranked by descending ``y_score``; each contributes a gain of
    ``2 ** label - 1`` discounted by ``log2(position + 2)`` (0-based position).
    """
    top_k = np.argsort(y_score)[::-1][:k]
    ranked_labels = np.take(y_true, top_k)
    positions = np.arange(len(ranked_labels))
    return np.sum((2 ** ranked_labels - 1) / np.log2(positions + 2))
11 |
12 |
def ndcg_score(y_true, y_score, k=10):
    """Return DCG@k of ``y_score`` normalised by the ideal DCG@k.

    The ideal DCG is obtained by ranking the items by their own labels.
    When it is zero (no relevant item among the labels) the score is
    defined as 0.0 rather than dividing by zero and returning nan.
    """
    ideal = dcg_score(y_true, y_true, k)
    if ideal == 0:
        return 0.0
    return dcg_score(y_true, y_score, k) / ideal
17 |
18 |
def mrr_score(y_true, y_score):
    """Return the label-weighted mean reciprocal rank.

    Items are ranked by descending ``y_score``; the reciprocal ranks
    (1-based) are weighted by the labels and divided by the label sum.
    Returns 0.0 when the labels sum to zero instead of producing nan.
    """
    ranking = np.argsort(y_score)[::-1]
    ranked_labels = np.take(y_true, ranking)
    label_total = np.sum(ranked_labels)
    if label_total == 0:
        # No relevant item: the ratio below would be 0 / 0.
        return 0.0
    reciprocal = ranked_labels / (np.arange(len(ranked_labels)) + 1)
    return np.sum(reciprocal) / label_total
24 |
25 |
def evaluate(predicts, truths):
    """Return mean AUC, MRR, nDCG@5 and nDCG@10 over paired lists.

    ``predicts`` and ``truths`` are iterated in lockstep; each pair is
    scored independently and the four metrics are averaged.
    """
    collected = {"auc": [], "mrr": [], "ndcg5": [], "ndcg10": []}
    for pred, truth in zip(predicts, truths):
        labels = np.array(truth, dtype='float32')
        # The predictions are inverted before scoring.
        # NOTE(review): presumably `pred` holds rank positions (smaller is
        # better) -- confirm against the caller.
        scores = 1.0 / np.array(pred, dtype='float32')
        collected["auc"].append(roc_auc_score(labels, scores))
        collected["mrr"].append(mrr_score(labels, scores))
        collected["ndcg5"].append(ndcg_score(labels, scores, 5))
        collected["ndcg10"].append(ndcg_score(labels, scores, 10))
    return tuple(np.mean(collected[key])
                 for key in ("auc", "mrr", "ndcg5", "ndcg10"))
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/Hybrid-Utility/utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import roc_auc_score
2 | import numpy as np
3 |
4 |
def dcg_score(y_true, y_score, k=10):
    """Return the discounted cumulative gain of the top-``k`` ranked items.

    Items are ranked by descending ``y_score``; each contributes a gain of
    ``2 ** label - 1`` discounted by ``log2(position + 2)`` (0-based position).
    """
    top_k = np.argsort(y_score)[::-1][:k]
    ranked_labels = np.take(y_true, top_k)
    positions = np.arange(len(ranked_labels))
    return np.sum((2 ** ranked_labels - 1) / np.log2(positions + 2))
11 |
12 |
def ndcg_score(y_true, y_score, k=10):
    """Return DCG@k of ``y_score`` normalised by the ideal DCG@k.

    The ideal DCG is obtained by ranking the items by their own labels.
    When it is zero (no relevant item among the labels) the score is
    defined as 0.0 rather than dividing by zero and returning nan.
    """
    ideal = dcg_score(y_true, y_true, k)
    if ideal == 0:
        return 0.0
    return dcg_score(y_true, y_score, k) / ideal
17 |
18 |
def mrr_score(y_true, y_score):
    """Return the label-weighted mean reciprocal rank.

    Items are ranked by descending ``y_score``; the reciprocal ranks
    (1-based) are weighted by the labels and divided by the label sum.
    Returns 0.0 when the labels sum to zero instead of producing nan.
    """
    ranking = np.argsort(y_score)[::-1]
    ranked_labels = np.take(y_true, ranking)
    label_total = np.sum(ranked_labels)
    if label_total == 0:
        # No relevant item: the ratio below would be 0 / 0.
        return 0.0
    reciprocal = ranked_labels / (np.arange(len(ranked_labels)) + 1)
    return np.sum(reciprocal) / label_total
24 |
25 |
def evaluate(predicts, truths):
    """Return mean AUC, MRR, nDCG@5 and nDCG@10 over paired lists.

    ``predicts`` and ``truths`` are iterated in lockstep; each pair is
    scored independently and the four metrics are averaged.
    """
    collected = {"auc": [], "mrr": [], "ndcg5": [], "ndcg10": []}
    for pred, truth in zip(predicts, truths):
        labels = np.array(truth, dtype='float32')
        # The predictions are inverted before scoring.
        # NOTE(review): presumably `pred` holds rank positions (smaller is
        # better) -- confirm against the caller.
        scores = 1.0 / np.array(pred, dtype='float32')
        collected["auc"].append(roc_auc_score(labels, scores))
        collected["mrr"].append(mrr_score(labels, scores))
        collected["ndcg5"].append(ndcg_score(labels, scores, 5))
        collected["ndcg10"].append(ndcg_score(labels, scores, 10))
    return tuple(np.mean(collected[key])
                 for key in ("auc", "mrr", "ndcg5", "ndcg10"))
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/Continuous-Action/utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import roc_auc_score
2 | import numpy as np
3 |
4 |
def dcg_score(y_true, y_score, k=10):
    """Return the discounted cumulative gain of the top-``k`` ranked items.

    Items are ranked by descending ``y_score``; each contributes a gain of
    ``2 ** label - 1`` discounted by ``log2(position + 2)`` (0-based position).
    """
    top_k = np.argsort(y_score)[::-1][:k]
    ranked_labels = np.take(y_true, top_k)
    positions = np.arange(len(ranked_labels))
    return np.sum((2 ** ranked_labels - 1) / np.log2(positions + 2))
11 |
12 |
def ndcg_score(y_true, y_score, k=10):
    """Return DCG@k of ``y_score`` normalised by the ideal DCG@k.

    The ideal DCG is obtained by ranking the items by their own labels.
    When it is zero (no relevant item among the labels) the score is
    defined as 0.0 rather than dividing by zero and returning nan.
    """
    ideal = dcg_score(y_true, y_true, k)
    if ideal == 0:
        return 0.0
    return dcg_score(y_true, y_score, k) / ideal
17 |
18 |
def mrr_score(y_true, y_score):
    """Return the label-weighted mean reciprocal rank.

    Items are ranked by descending ``y_score``; the reciprocal ranks
    (1-based) are weighted by the labels and divided by the label sum.
    Returns 0.0 when the labels sum to zero instead of producing nan.
    """
    ranking = np.argsort(y_score)[::-1]
    ranked_labels = np.take(y_true, ranking)
    label_total = np.sum(ranked_labels)
    if label_total == 0:
        # No relevant item: the ratio below would be 0 / 0.
        return 0.0
    reciprocal = ranked_labels / (np.arange(len(ranked_labels)) + 1)
    return np.sum(reciprocal) / label_total
24 |
25 |
def evaluate(predicts, truths):
    """Return mean AUC, MRR, nDCG@5 and nDCG@10 over paired lists.

    ``predicts`` and ``truths`` are iterated in lockstep; each pair is
    scored independently and the four metrics are averaged.
    """
    collected = {"auc": [], "mrr": [], "ndcg5": [], "ndcg10": []}
    for pred, truth in zip(predicts, truths):
        labels = np.array(truth, dtype='float32')
        # The predictions are inverted before scoring.
        # NOTE(review): presumably `pred` holds rank positions (smaller is
        # better) -- confirm against the caller.
        scores = 1.0 / np.array(pred, dtype='float32')
        collected["auc"].append(roc_auc_score(labels, scores))
        collected["mrr"].append(mrr_score(labels, scores))
        collected["ndcg5"].append(ndcg_score(labels, scores, 5))
        collected["ndcg10"].append(ndcg_score(labels, scores, 10))
    return tuple(np.mean(collected[key])
                 for key in ("auc", "mrr", "ndcg5", "ndcg10"))
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/Continuous-Emotion/utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import roc_auc_score
2 | import numpy as np
3 |
4 |
def dcg_score(y_true, y_score, k=10):
    """Return the discounted cumulative gain of the top-``k`` ranked items.

    Items are ranked by descending ``y_score``; each contributes a gain of
    ``2 ** label - 1`` discounted by ``log2(position + 2)`` (0-based position).
    """
    top_k = np.argsort(y_score)[::-1][:k]
    ranked_labels = np.take(y_true, top_k)
    positions = np.arange(len(ranked_labels))
    return np.sum((2 ** ranked_labels - 1) / np.log2(positions + 2))
11 |
12 |
def ndcg_score(y_true, y_score, k=10):
    """Return DCG@k of ``y_score`` normalised by the ideal DCG@k.

    The ideal DCG is obtained by ranking the items by their own labels.
    When it is zero (no relevant item among the labels) the score is
    defined as 0.0 rather than dividing by zero and returning nan.
    """
    ideal = dcg_score(y_true, y_true, k)
    if ideal == 0:
        return 0.0
    return dcg_score(y_true, y_score, k) / ideal
17 |
18 |
def mrr_score(y_true, y_score):
    """Return the label-weighted mean reciprocal rank.

    Items are ranked by descending ``y_score``; the reciprocal ranks
    (1-based) are weighted by the labels and divided by the label sum.
    Returns 0.0 when the labels sum to zero instead of producing nan.
    """
    ranking = np.argsort(y_score)[::-1]
    ranked_labels = np.take(y_true, ranking)
    label_total = np.sum(ranked_labels)
    if label_total == 0:
        # No relevant item: the ratio below would be 0 / 0.
        return 0.0
    reciprocal = ranked_labels / (np.arange(len(ranked_labels)) + 1)
    return np.sum(reciprocal) / label_total
24 |
25 |
def evaluate(predicts, truths):
    """Return mean AUC, MRR, nDCG@5 and nDCG@10 over paired lists.

    ``predicts`` and ``truths`` are iterated in lockstep; each pair is
    scored independently and the four metrics are averaged.
    """
    collected = {"auc": [], "mrr": [], "ndcg5": [], "ndcg10": []}
    for pred, truth in zip(predicts, truths):
        labels = np.array(truth, dtype='float32')
        # The predictions are inverted before scoring.
        # NOTE(review): presumably `pred` holds rank positions (smaller is
        # better) -- confirm against the caller.
        scores = 1.0 / np.array(pred, dtype='float32')
        collected["auc"].append(roc_auc_score(labels, scores))
        collected["mrr"].append(mrr_score(labels, scores))
        collected["ndcg5"].append(ndcg_score(labels, scores, 5))
        collected["ndcg10"].append(ndcg_score(labels, scores, 10))
    return tuple(np.mean(collected[key])
                 for key in ("auc", "mrr", "ndcg5", "ndcg10"))
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/Continuous-Utility/utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import roc_auc_score
2 | import numpy as np
3 |
4 |
def dcg_score(y_true, y_score, k=10):
    """Return the discounted cumulative gain of the top-``k`` ranked items.

    Items are ranked by descending ``y_score``; each contributes a gain of
    ``2 ** label - 1`` discounted by ``log2(position + 2)`` (0-based position).
    """
    top_k = np.argsort(y_score)[::-1][:k]
    ranked_labels = np.take(y_true, top_k)
    positions = np.arange(len(ranked_labels))
    return np.sum((2 ** ranked_labels - 1) / np.log2(positions + 2))
11 |
12 |
def ndcg_score(y_true, y_score, k=10):
    """Return DCG@k of ``y_score`` normalised by the ideal DCG@k.

    The ideal DCG is obtained by ranking the items by their own labels.
    When it is zero (no relevant item among the labels) the score is
    defined as 0.0 rather than dividing by zero and returning nan.
    """
    ideal = dcg_score(y_true, y_true, k)
    if ideal == 0:
        return 0.0
    return dcg_score(y_true, y_score, k) / ideal
17 |
18 |
def mrr_score(y_true, y_score):
    """Return the label-weighted mean reciprocal rank.

    Items are ranked by descending ``y_score``; the reciprocal ranks
    (1-based) are weighted by the labels and divided by the label sum.
    Returns 0.0 when the labels sum to zero instead of producing nan.
    """
    ranking = np.argsort(y_score)[::-1]
    ranked_labels = np.take(y_true, ranking)
    label_total = np.sum(ranked_labels)
    if label_total == 0:
        # No relevant item: the ratio below would be 0 / 0.
        return 0.0
    reciprocal = ranked_labels / (np.arange(len(ranked_labels)) + 1)
    return np.sum(reciprocal) / label_total
24 |
25 |
def evaluate(predicts, truths):
    """Return mean AUC, MRR, nDCG@5 and nDCG@10 over paired lists.

    ``predicts`` and ``truths`` are iterated in lockstep; each pair is
    scored independently and the four metrics are averaged.
    """
    collected = {"auc": [], "mrr": [], "ndcg5": [], "ndcg10": []}
    for pred, truth in zip(predicts, truths):
        labels = np.array(truth, dtype='float32')
        # The predictions are inverted before scoring.
        # NOTE(review): presumably `pred` holds rank positions (smaller is
        # better) -- confirm against the caller.
        scores = 1.0 / np.array(pred, dtype='float32')
        collected["auc"].append(roc_auc_score(labels, scores))
        collected["mrr"].append(mrr_score(labels, scores))
        collected["ndcg5"].append(ndcg_score(labels, scores, 5))
        collected["ndcg10"].append(ndcg_score(labels, scores, 10))
    return tuple(np.mean(collected[key])
                 for key in ("auc", "mrr", "ndcg5", "ndcg10"))
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/Discrete-Action/utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import roc_auc_score
2 | import numpy as np
3 |
4 |
def dcg_score(y_true, y_score, k=10):
    """Return the discounted cumulative gain of the top-``k`` ranked items.

    Items are ranked by descending ``y_score``; each contributes a gain of
    ``2 ** label - 1`` discounted by ``log2(position + 2)`` (0-based position).
    """
    top_k = np.argsort(y_score)[::-1][:k]
    ranked_labels = np.take(y_true, top_k)
    positions = np.arange(len(ranked_labels))
    return np.sum((2 ** ranked_labels - 1) / np.log2(positions + 2))
11 |
12 |
def ndcg_score(y_true, y_score, k=10):
    """Return DCG@k of ``y_score`` normalised by the ideal DCG@k.

    The ideal DCG is obtained by ranking the items by their own labels.
    When it is zero (no relevant item among the labels) the score is
    defined as 0.0 rather than dividing by zero and returning nan.
    """
    ideal = dcg_score(y_true, y_true, k)
    if ideal == 0:
        return 0.0
    return dcg_score(y_true, y_score, k) / ideal
17 |
18 |
def mrr_score(y_true, y_score):
    """Return the label-weighted mean reciprocal rank.

    Items are ranked by descending ``y_score``; the reciprocal ranks
    (1-based) are weighted by the labels and divided by the label sum.
    Returns 0.0 when the labels sum to zero instead of producing nan.
    """
    ranking = np.argsort(y_score)[::-1]
    ranked_labels = np.take(y_true, ranking)
    label_total = np.sum(ranked_labels)
    if label_total == 0:
        # No relevant item: the ratio below would be 0 / 0.
        return 0.0
    reciprocal = ranked_labels / (np.arange(len(ranked_labels)) + 1)
    return np.sum(reciprocal) / label_total
24 |
25 |
def evaluate(predicts, truths):
    """Return mean AUC, MRR, nDCG@5 and nDCG@10 over paired lists.

    ``predicts`` and ``truths`` are iterated in lockstep; each pair is
    scored independently and the four metrics are averaged.
    """
    collected = {"auc": [], "mrr": [], "ndcg5": [], "ndcg10": []}
    for pred, truth in zip(predicts, truths):
        labels = np.array(truth, dtype='float32')
        # The predictions are inverted before scoring.
        # NOTE(review): presumably `pred` holds rank positions (smaller is
        # better) -- confirm against the caller.
        scores = 1.0 / np.array(pred, dtype='float32')
        collected["auc"].append(roc_auc_score(labels, scores))
        collected["mrr"].append(mrr_score(labels, scores))
        collected["ndcg5"].append(ndcg_score(labels, scores, 5))
        collected["ndcg10"].append(ndcg_score(labels, scores, 10))
    return tuple(np.mean(collected[key])
                 for key in ("auc", "mrr", "ndcg5", "ndcg10"))
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/Discrete-Emotion/utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import roc_auc_score
2 | import numpy as np
3 |
4 |
def dcg_score(y_true, y_score, k=10):
    """Return the discounted cumulative gain of the top-``k`` ranked items.

    Items are ranked by descending ``y_score``. The item at 1-based rank
    ``r`` contributes ``(2 ** label - 1) / log2(r + 1)``.

    Args:
        y_true: relevance labels, one per item.
        y_score: predicted scores, one per item (higher ranks first).
        k: number of top-ranked items taken into account.
    """
    top_k = np.argsort(y_score)[::-1][:k]
    labels = np.take(y_true, top_k)
    ranks = np.arange(1, len(labels) + 1)
    return np.sum((2 ** labels - 1) / np.log2(ranks + 1))
11 |
12 |
def ndcg_score(y_true, y_score, k=10):
    """Return the normalized DCG@``k`` of ``y_score`` against ``y_true``.

    The DCG of the ranking induced by ``y_score`` is divided by the ideal
    DCG, i.e. the DCG obtained when the items are ranked by their labels.

    Args:
        y_true: relevance labels, one per item.
        y_score: predicted scores, one per item (higher ranks first).
        k: cut-off passed to ``dcg_score``.

    Returns:
        The ratio ``actual / ideal``, or ``0.0`` when the ideal DCG is zero
        (previously this case divided by zero and produced ``nan``).
    """
    best = dcg_score(y_true, y_true, k)
    if best == 0:
        # No gain is achievable, so the ratio is undefined; report 0 rather
        # than a nan that would contaminate any average taken over it.
        return 0.0
    actual = dcg_score(y_true, y_score, k)
    return actual / best
17 |
18 |
def mrr_score(y_true, y_score):
    """Return the mean reciprocal rank of the positive items.

    Items are ranked by descending ``y_score``; each label is weighted by
    ``1 / rank`` (rank counted from 1) and the weighted sum is divided by
    the sum of the labels.

    Args:
        y_true: relevance labels, one per item.
        y_score: predicted scores, one per item (higher ranks first).

    Returns:
        The reciprocal-rank average, or ``0.0`` when the labels sum to zero
        (previously this case divided by zero and produced ``nan``).
    """
    order = np.argsort(y_score)[::-1]
    ranked = np.take(y_true, order)
    total = np.sum(ranked)
    if total == 0:
        # No positive item to locate: avoid the 0 / 0 division.
        return 0.0
    rr_score = ranked / (np.arange(len(ranked)) + 1)
    return np.sum(rr_score) / total
24 |
25 |
def evaluate(predicts, truths):
    """Average AUC, MRR, nDCG@5 and nDCG@10 over all impressions.

    Args:
        predicts: one sequence of predicted values per impression. The
            ranking score is the reciprocal of each value, so smaller values
            rank higher (presumably rank positions -- confirm with caller).
        truths: one sequence of labels per impression, aligned with
            ``predicts``.

    Returns:
        Tuple ``(auc, mrr, ndcg5, ndcg10)`` of means over the impressions.
    """
    rows = []
    for pre, tru in zip(predicts, truths):
        labels = np.array(tru, dtype='float32')
        scores = 1.0 / np.array(pre, dtype='float32')
        rows.append((
            roc_auc_score(labels, scores),
            mrr_score(labels, scores),
            ndcg_score(labels, scores, 5),
            ndcg_score(labels, scores, 10),
        ))

    aucs = [row[0] for row in rows]
    mrrs = [row[1] for row in rows]
    ndcg5s = [row[2] for row in rows]
    ndcg10s = [row[3] for row in rows]
    return np.mean(aucs), np.mean(mrrs), np.mean(ndcg5s), np.mean(ndcg10s)
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/Discrete-Relevance/utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import roc_auc_score
2 | import numpy as np
3 |
4 |
def dcg_score(y_true, y_score, k=10):
    """Return the discounted cumulative gain of the top-``k`` ranked items.

    Items are ranked by descending ``y_score``. The item at 1-based rank
    ``r`` contributes ``(2 ** label - 1) / log2(r + 1)``.

    Args:
        y_true: relevance labels, one per item.
        y_score: predicted scores, one per item (higher ranks first).
        k: number of top-ranked items taken into account.
    """
    top_k = np.argsort(y_score)[::-1][:k]
    labels = np.take(y_true, top_k)
    ranks = np.arange(1, len(labels) + 1)
    return np.sum((2 ** labels - 1) / np.log2(ranks + 1))
11 |
12 |
def ndcg_score(y_true, y_score, k=10):
    """Return the normalized DCG@``k`` of ``y_score`` against ``y_true``.

    The DCG of the ranking induced by ``y_score`` is divided by the ideal
    DCG, i.e. the DCG obtained when the items are ranked by their labels.

    Args:
        y_true: relevance labels, one per item.
        y_score: predicted scores, one per item (higher ranks first).
        k: cut-off passed to ``dcg_score``.

    Returns:
        The ratio ``actual / ideal``, or ``0.0`` when the ideal DCG is zero
        (previously this case divided by zero and produced ``nan``).
    """
    best = dcg_score(y_true, y_true, k)
    if best == 0:
        # No gain is achievable, so the ratio is undefined; report 0 rather
        # than a nan that would contaminate any average taken over it.
        return 0.0
    actual = dcg_score(y_true, y_score, k)
    return actual / best
17 |
18 |
def mrr_score(y_true, y_score):
    """Return the mean reciprocal rank of the positive items.

    Items are ranked by descending ``y_score``; each label is weighted by
    ``1 / rank`` (rank counted from 1) and the weighted sum is divided by
    the sum of the labels.

    Args:
        y_true: relevance labels, one per item.
        y_score: predicted scores, one per item (higher ranks first).

    Returns:
        The reciprocal-rank average, or ``0.0`` when the labels sum to zero
        (previously this case divided by zero and produced ``nan``).
    """
    order = np.argsort(y_score)[::-1]
    ranked = np.take(y_true, order)
    total = np.sum(ranked)
    if total == 0:
        # No positive item to locate: avoid the 0 / 0 division.
        return 0.0
    rr_score = ranked / (np.arange(len(ranked)) + 1)
    return np.sum(rr_score) / total
24 |
25 |
def evaluate(predicts, truths):
    """Average AUC, MRR, nDCG@5 and nDCG@10 over all impressions.

    Args:
        predicts: one sequence of predicted values per impression. The
            ranking score is the reciprocal of each value, so smaller values
            rank higher (presumably rank positions -- confirm with caller).
        truths: one sequence of labels per impression, aligned with
            ``predicts``.

    Returns:
        Tuple ``(auc, mrr, ndcg5, ndcg10)`` of means over the impressions.
    """
    rows = []
    for pre, tru in zip(predicts, truths):
        labels = np.array(tru, dtype='float32')
        scores = 1.0 / np.array(pre, dtype='float32')
        rows.append((
            roc_auc_score(labels, scores),
            mrr_score(labels, scores),
            ndcg_score(labels, scores, 5),
            ndcg_score(labels, scores, 10),
        ))

    aucs = [row[0] for row in rows]
    mrrs = [row[1] for row in rows]
    ndcg5s = [row[2] for row in rows]
    ndcg10s = [row[3] for row in rows]
    return np.mean(aucs), np.mean(mrrs), np.mean(ndcg5s), np.mean(ndcg10s)
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/Discrete-Utility/utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import roc_auc_score
2 | import numpy as np
3 |
4 |
def dcg_score(y_true, y_score, k=10):
    """Return the discounted cumulative gain of the top-``k`` ranked items.

    Items are ranked by descending ``y_score``. The item at 1-based rank
    ``r`` contributes ``(2 ** label - 1) / log2(r + 1)``.

    Args:
        y_true: relevance labels, one per item.
        y_score: predicted scores, one per item (higher ranks first).
        k: number of top-ranked items taken into account.
    """
    top_k = np.argsort(y_score)[::-1][:k]
    labels = np.take(y_true, top_k)
    ranks = np.arange(1, len(labels) + 1)
    return np.sum((2 ** labels - 1) / np.log2(ranks + 1))
11 |
12 |
def ndcg_score(y_true, y_score, k=10):
    """Return the normalized DCG@``k`` of ``y_score`` against ``y_true``.

    The DCG of the ranking induced by ``y_score`` is divided by the ideal
    DCG, i.e. the DCG obtained when the items are ranked by their labels.

    Args:
        y_true: relevance labels, one per item.
        y_score: predicted scores, one per item (higher ranks first).
        k: cut-off passed to ``dcg_score``.

    Returns:
        The ratio ``actual / ideal``, or ``0.0`` when the ideal DCG is zero
        (previously this case divided by zero and produced ``nan``).
    """
    best = dcg_score(y_true, y_true, k)
    if best == 0:
        # No gain is achievable, so the ratio is undefined; report 0 rather
        # than a nan that would contaminate any average taken over it.
        return 0.0
    actual = dcg_score(y_true, y_score, k)
    return actual / best
17 |
18 |
def mrr_score(y_true, y_score):
    """Return the mean reciprocal rank of the positive items.

    Items are ranked by descending ``y_score``; each label is weighted by
    ``1 / rank`` (rank counted from 1) and the weighted sum is divided by
    the sum of the labels.

    Args:
        y_true: relevance labels, one per item.
        y_score: predicted scores, one per item (higher ranks first).

    Returns:
        The reciprocal-rank average, or ``0.0`` when the labels sum to zero
        (previously this case divided by zero and produced ``nan``).
    """
    order = np.argsort(y_score)[::-1]
    ranked = np.take(y_true, order)
    total = np.sum(ranked)
    if total == 0:
        # No positive item to locate: avoid the 0 / 0 division.
        return 0.0
    rr_score = ranked / (np.arange(len(ranked)) + 1)
    return np.sum(rr_score) / total
24 |
25 |
def evaluate(predicts, truths):
    """Average AUC, MRR, nDCG@5 and nDCG@10 over all impressions.

    Args:
        predicts: one sequence of predicted values per impression. The
            ranking score is the reciprocal of each value, so smaller values
            rank higher (presumably rank positions -- confirm with caller).
        truths: one sequence of labels per impression, aligned with
            ``predicts``.

    Returns:
        Tuple ``(auc, mrr, ndcg5, ndcg10)`` of means over the impressions.
    """
    rows = []
    for pre, tru in zip(predicts, truths):
        labels = np.array(tru, dtype='float32')
        scores = 1.0 / np.array(pre, dtype='float32')
        rows.append((
            roc_auc_score(labels, scores),
            mrr_score(labels, scores),
            ndcg_score(labels, scores, 5),
            ndcg_score(labels, scores, 10),
        ))

    aucs = [row[0] for row in rows]
    mrrs = [row[1] for row in rows]
    ndcg5s = [row[2] for row in rows]
    ndcg10s = [row[3] for row in rows]
    return np.mean(aucs), np.mean(mrrs), np.mean(ndcg5s), np.mean(ndcg10s)
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/Hybrid-Relevance/utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import roc_auc_score
2 | import numpy as np
3 |
4 |
def dcg_score(y_true, y_score, k=10):
    """Return the discounted cumulative gain of the top-``k`` ranked items.

    Items are ranked by descending ``y_score``. The item at 1-based rank
    ``r`` contributes ``(2 ** label - 1) / log2(r + 1)``.

    Args:
        y_true: relevance labels, one per item.
        y_score: predicted scores, one per item (higher ranks first).
        k: number of top-ranked items taken into account.
    """
    top_k = np.argsort(y_score)[::-1][:k]
    labels = np.take(y_true, top_k)
    ranks = np.arange(1, len(labels) + 1)
    return np.sum((2 ** labels - 1) / np.log2(ranks + 1))
11 |
12 |
def ndcg_score(y_true, y_score, k=10):
    """Return the normalized DCG@``k`` of ``y_score`` against ``y_true``.

    The DCG of the ranking induced by ``y_score`` is divided by the ideal
    DCG, i.e. the DCG obtained when the items are ranked by their labels.

    Args:
        y_true: relevance labels, one per item.
        y_score: predicted scores, one per item (higher ranks first).
        k: cut-off passed to ``dcg_score``.

    Returns:
        The ratio ``actual / ideal``, or ``0.0`` when the ideal DCG is zero
        (previously this case divided by zero and produced ``nan``).
    """
    best = dcg_score(y_true, y_true, k)
    if best == 0:
        # No gain is achievable, so the ratio is undefined; report 0 rather
        # than a nan that would contaminate any average taken over it.
        return 0.0
    actual = dcg_score(y_true, y_score, k)
    return actual / best
17 |
18 |
def mrr_score(y_true, y_score):
    """Return the mean reciprocal rank of the positive items.

    Items are ranked by descending ``y_score``; each label is weighted by
    ``1 / rank`` (rank counted from 1) and the weighted sum is divided by
    the sum of the labels.

    Args:
        y_true: relevance labels, one per item.
        y_score: predicted scores, one per item (higher ranks first).

    Returns:
        The reciprocal-rank average, or ``0.0`` when the labels sum to zero
        (previously this case divided by zero and produced ``nan``).
    """
    order = np.argsort(y_score)[::-1]
    ranked = np.take(y_true, order)
    total = np.sum(ranked)
    if total == 0:
        # No positive item to locate: avoid the 0 / 0 division.
        return 0.0
    rr_score = ranked / (np.arange(len(ranked)) + 1)
    return np.sum(rr_score) / total
24 |
25 |
def evaluate(predicts, truths):
    """Average AUC, MRR, nDCG@5 and nDCG@10 over all impressions.

    Args:
        predicts: one sequence of predicted values per impression. The
            ranking score is the reciprocal of each value, so smaller values
            rank higher (presumably rank positions -- confirm with caller).
        truths: one sequence of labels per impression, aligned with
            ``predicts``.

    Returns:
        Tuple ``(auc, mrr, ndcg5, ndcg10)`` of means over the impressions.
    """
    rows = []
    for pre, tru in zip(predicts, truths):
        labels = np.array(tru, dtype='float32')
        scores = 1.0 / np.array(pre, dtype='float32')
        rows.append((
            roc_auc_score(labels, scores),
            mrr_score(labels, scores),
            ndcg_score(labels, scores, 5),
            ndcg_score(labels, scores, 10),
        ))

    aucs = [row[0] for row in rows]
    mrrs = [row[1] for row in rows]
    ndcg5s = [row[2] for row in rows]
    ndcg10s = [row[3] for row in rows]
    return np.mean(aucs), np.mean(mrrs), np.mean(ndcg5s), np.mean(ndcg10s)
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/Continuous-Relevance/utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import roc_auc_score
2 | import numpy as np
3 |
4 |
def dcg_score(y_true, y_score, k=10):
    """Return the discounted cumulative gain of the top-``k`` ranked items.

    Items are ranked by descending ``y_score``. The item at 1-based rank
    ``r`` contributes ``(2 ** label - 1) / log2(r + 1)``.

    Args:
        y_true: relevance labels, one per item.
        y_score: predicted scores, one per item (higher ranks first).
        k: number of top-ranked items taken into account.
    """
    top_k = np.argsort(y_score)[::-1][:k]
    labels = np.take(y_true, top_k)
    ranks = np.arange(1, len(labels) + 1)
    return np.sum((2 ** labels - 1) / np.log2(ranks + 1))
11 |
12 |
def ndcg_score(y_true, y_score, k=10):
    """Return the normalized DCG@``k`` of ``y_score`` against ``y_true``.

    The DCG of the ranking induced by ``y_score`` is divided by the ideal
    DCG, i.e. the DCG obtained when the items are ranked by their labels.

    Args:
        y_true: relevance labels, one per item.
        y_score: predicted scores, one per item (higher ranks first).
        k: cut-off passed to ``dcg_score``.

    Returns:
        The ratio ``actual / ideal``, or ``0.0`` when the ideal DCG is zero
        (previously this case divided by zero and produced ``nan``).
    """
    best = dcg_score(y_true, y_true, k)
    if best == 0:
        # No gain is achievable, so the ratio is undefined; report 0 rather
        # than a nan that would contaminate any average taken over it.
        return 0.0
    actual = dcg_score(y_true, y_score, k)
    return actual / best
17 |
18 |
def mrr_score(y_true, y_score):
    """Return the mean reciprocal rank of the positive items.

    Items are ranked by descending ``y_score``; each label is weighted by
    ``1 / rank`` (rank counted from 1) and the weighted sum is divided by
    the sum of the labels.

    Args:
        y_true: relevance labels, one per item.
        y_score: predicted scores, one per item (higher ranks first).

    Returns:
        The reciprocal-rank average, or ``0.0`` when the labels sum to zero
        (previously this case divided by zero and produced ``nan``).
    """
    order = np.argsort(y_score)[::-1]
    ranked = np.take(y_true, order)
    total = np.sum(ranked)
    if total == 0:
        # No positive item to locate: avoid the 0 / 0 division.
        return 0.0
    rr_score = ranked / (np.arange(len(ranked)) + 1)
    return np.sum(rr_score) / total
24 |
25 |
def evaluate(predicts, truths):
    """Average AUC, MRR, nDCG@5 and nDCG@10 over all impressions.

    Args:
        predicts: one sequence of predicted values per impression. The
            ranking score is the reciprocal of each value, so smaller values
            rank higher (presumably rank positions -- confirm with caller).
        truths: one sequence of labels per impression, aligned with
            ``predicts``.

    Returns:
        Tuple ``(auc, mrr, ndcg5, ndcg10)`` of means over the impressions.
    """
    rows = []
    for pre, tru in zip(predicts, truths):
        labels = np.array(tru, dtype='float32')
        scores = 1.0 / np.array(pre, dtype='float32')
        rows.append((
            roc_auc_score(labels, scores),
            mrr_score(labels, scores),
            ndcg_score(labels, scores, 5),
            ndcg_score(labels, scores, 10),
        ))

    aucs = [row[0] for row in rows]
    mrrs = [row[1] for row in rows]
    ndcg5s = [row[2] for row in rows]
    ndcg10s = [row[3] for row in rows]
    return np.mean(aucs), np.mean(mrrs), np.mean(ndcg5s), np.mean(ndcg10s)
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/Hybrid-Action/run.sh:
--------------------------------------------------------------------------------
1 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small --epochs 5 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
2 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.5 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
3 |
4 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.5 --epochs 5 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
5 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.5 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
6 |
7 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.3 --epochs 5 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
8 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.3 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
9 |
10 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.2 --epochs 5 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
11 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.2 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
12 |
13 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.1 --epochs 5 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
14 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.1 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
15 |
16 |
--------------------------------------------------------------------------------
/Continuous-Utility/run.sh:
--------------------------------------------------------------------------------
1 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small --epochs 3 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
2 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.5 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
3 |
4 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.5 --epochs 3 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
5 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.5 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
6 |
7 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.3 --epochs 3 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
8 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.3 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
9 |
10 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.2 --epochs 3 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
11 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.2 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
12 |
13 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.1 --epochs 3 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
14 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.1 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
15 |
16 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.05 --epochs 5 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
17 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.05 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
18 |
19 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.01 --epochs 5 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
20 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.01 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
--------------------------------------------------------------------------------
/Discrete-Utility/run.sh:
--------------------------------------------------------------------------------
1 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
2 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.5 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
3 |
4 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.5 --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
5 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.5 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
6 |
7 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.3 --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
8 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.3 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
9 |
10 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.2 --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
11 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.2 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
12 |
13 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.1 --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
14 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.1 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
15 |
16 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.05 --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
17 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.05 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
18 |
19 | python main-multigpu.py --data_path ../DATA/MIND-Small --train_data_path ../DATA/MIND-Small-0.01 --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
20 | python predict.py --data_path ../DATA/MIND-Small --ratio 0.01 --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Prompt4NR: Prompt Learning for News Recommendation
2 | Source code for SIGIR 2023 paper: Prompt Learning for News Recommendation
3 |
4 | ### The Prompt4NR Framework
5 |
6 |
7 |
8 |
9 |
10 | ### Directory Structure:
11 | The 12 directories correspond to the 12 prompt templates: three types of templates (Discrete, Continuous, Hybrid) from four perspectives (Relevance, Emotion, Action, Utility).
12 | - Discrete-Relevance, Discrete-Emotion, Discrete-Action, Discrete-Utility
13 | - Continuous-Relevance, Continuous-Emotion, Continuous-Action, Continuous-Utility
14 | - Hybrid-Relevance, Hybrid-Emotion, Hybrid-Action, Hybrid-Utility
15 |
16 | ### Details of the 12 templates are provided as follows:
17 |
18 |
19 |
20 | ### Dataset
21 |
22 | The experiments are based on the public MIND dataset; we use the small version, MIND-Small.
23 |
24 | For our paper, we preprocessed the original dataset and stored it as binary files via "pickle". Even though the files use ".txt" as the file extension, they are binary files written by pickle, and you can load them directly with the pickle package. They include:
25 |
26 | - train.txt: training set
27 | - val.txt: validation set
28 | - test.txt: testing set
29 | - news.txt: containing information of all news
30 |
31 | I have shared our preprocessed dataset on Google Drive as follows:
32 |
33 |
34 |
35 | ### Model Checkpoints
36 |
37 | I have shared our trained model checkpoints on Google Drive as follows:
38 |
39 |
40 |
41 | ### How to Run These codes
42 | In each directory, there is a script called ``run.sh`` that can run the codes for the corresponding template.
43 | Taking the “Discrete-Relevance” template as an example, the ``run.sh`` file is as follows:
44 | ```
45 | python main-multigpu.py --data_path ../DATA/MIND-Small --epochs 4 --batch_size 16 --test_batch_size 100 --wd 1e-3 --max_tokens 500 --log True --model_save True
46 | python predict.py --data_path ../DATA/MIND-Small --test_batch_size 100 --max_tokens 500 --model_file ./temp/BestModel.pt --log True
47 | ```
48 | - The first line is used to train the model on the training set and evaluate it on the validation set at each epoch. During this process, the model with the best performance on the validation set will be stored.
49 | - The second line is used to evaluate the "best" model on the testing set to obtain the performance evaluation.
50 |
51 | We implement the source code with the Distributed Data Parallel (DDP) technology provided by PyTorch. Hence, our code is a multi-GPU version. We encourage you to adapt our code to obtain a single-GPU version.
52 |
53 | ### Environments
54 | - python==3.7
55 | - pytorch==1.13.0
56 | - cuda==116
57 | - transformers==4.27.0
58 |
59 | ### Citation
60 | If you use this code, please cite our paper!
61 | ```
62 | @inproceedings{zhang2023prompt,
63 | author = {Zhang, Zizhuo and Wang, Bang},
64 | title = {Prompt Learning for News Recommendation},
65 | year = {2023},
66 | booktitle = {Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval},
67 | pages = {227–237},
68 | numpages = {11},
69 | location = {Taipei, Taiwan},
70 | series = {SIGIR '23}
71 | }
72 | ```
73 |
--------------------------------------------------------------------------------
/Discrete-Action/prepro_data.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | import random
4 | import numpy as np
5 | from torch.utils.data import Dataset
6 | import pickle
7 | import os
8 | import torch
9 |
10 |
class MyDataset(Dataset):
    """Turn pickled impression logs into prompt sentences for click prediction.

    Every sample is a dict ``{'sentence', 'target', 'imp'}``: ``sentence`` is
    the filled prompt template, ``target`` is 1 for a clicked candidate and 0
    otherwise, and ``imp`` is the impression id the candidate belongs to.

    Args:
        args: namespace read for ``data_path``, ``num_negs``, ``max_his``,
            ``max_his_len`` and ``max_tokens``.
        tokenizer: object providing ``encode``, ``decode`` and
            ``batch_encode_plus`` (presumably a Hugging Face tokenizer).
        news_dict: mapping from news id to a dict with a ``'title'`` entry.
        status: ``'train'`` loads ``train.txt``, ``'val'`` loads ``val.txt``
            and any other value loads ``test.txt``.
    """

    # Slots substituted into TEMPLATE for every sample. The search string must
    # be non-empty: ``str.replace("", x)`` inserts ``x`` between all characters.
    USER_SLOT = "<user_sentence>"
    NEWS_SLOT = "<candidate_news>"
    # The wording (including the "Dose" spelling) is model input and is kept
    # verbatim. NOTE(review): the slot positions were reconstructed from the
    # sentence structure -- confirm against the published template.
    TEMPLATE = "User: <user_sentence> [SEP] News: <candidate_news> [SEP] Dose the user click the news? [MASK]"
    # Everything except ASCII letters, digits and spaces is removed from titles.
    _NON_ALNUM = re.compile(r'[^A-Za-z0-9 ]+')

    def __init__(self, args, tokenizer, news_dict, status='train'):
        self.tokenizer = tokenizer
        self.news_dict = news_dict
        self.args = args
        self.status = status

        self.data = []
        self.imp_lens = []
        if self.status == 'train':
            file_name = 'train.txt'
        elif self.status == 'val':
            file_name = 'val.txt'
        else:
            file_name = 'test.txt'
        self.data_path = os.path.join(args.data_path, file_name)
        self.load_data()

    def __len__(self):
        """Return the number of prompt samples."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the sample dict stored at position ``item``."""
        return self.data[item]

    def obtain_data(self, data):
        """Return the first four entries of the unpickled ``data``.

        ``load_data`` unpacks them as impression ids, users, times, behaviors.
        """
        return data[0], data[1], data[2], data[3]

    def _clean_title(self, title, max_words):
        """Strip non-alphanumeric characters and keep the first ``max_words`` words."""
        title = self._NON_ALNUM.sub('', title)
        return ' '.join(title.split(' ')[:max_words])

    def _user_prompt(self, clicked, news_dict, max_his, max_title_len, max_his_len):
        """Fill the user slot of the template from the click history.

        The last ``max_his`` clicks are used in reverse order, each title is
        cut to ``max_title_len`` words, and the joined history is capped at
        ``max_his_len`` tokenizer tokens before being decoded back to text.
        """
        recent_first = clicked[-max_his:][::-1]
        his_titles = [self._clean_title(news_dict[news]['title'], max_title_len)
                      for news in recent_first]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles)
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        his_sen = self.tokenizer.decode(his_sen_ids)
        return self.TEMPLATE.replace(self.USER_SLOT, his_sen)

    def _append_candidates(self, base_sentence, news_ids, news_dict, max_candi_len, target, impid):
        """Append one sample per candidate in ``news_ids`` with the given label."""
        for news in news_ids:
            title = self._clean_title(news_dict[news]['title'], max_candi_len)
            sentence = base_sentence.replace(self.NEWS_SLOT, title)
            self.data.append({'sentence': sentence, 'target': target, 'imp': impid})

    def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                     max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build training samples: every positive plus ``K_samples`` negatives.

        Each behavior is indexed as ``[history, positives, negatives]``.
        Negatives are sampled without replacement when at least ``K_samples``
        are available and with replacement otherwise.
        """
        for impid, behav in zip(imp_ids, behaviors):
            base_sentence = self._user_prompt(behav[0], news_dict, max_his,
                                              max_title_len, max_his_len)
            positives = behav[1]
            negatives = behav[2]
            self._append_candidates(base_sentence, positives, news_dict,
                                    max_candi_len, 1, impid)

            if len(negatives) >= K_samples:
                sample_negs = random.sample(negatives, k=K_samples)
            else:
                sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
            self._append_candidates(base_sentence, sample_negs, news_dict,
                                    max_candi_len, 0, impid)

    def prepro_dev(self, imp_ids, behaviors, news_dict,
                   max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build evaluation samples: every positive and every negative.

        Impressions whose click history is empty are skipped.
        """
        for impid, behav in zip(imp_ids, behaviors):
            if len(behav[0]) == 0:
                continue
            base_sentence = self._user_prompt(behav[0], news_dict, max_his,
                                              max_title_len, max_his_len)
            self._append_candidates(base_sentence, behav[1], news_dict,
                                    max_candi_len, 1, impid)
            self._append_candidates(base_sentence, behav[2], news_dict,
                                    max_candi_len, 0, impid)

    def load_data(self):
        """Unpickle ``self.data_path`` and fill ``self.data`` for the split."""
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(self.data_path, 'rb') as f:
            data = pickle.load(f)
        imps, users, times, behaviors = self.obtain_data(data)
        if self.status == 'train':
            self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                              max_his_len=self.args.max_his_len)
        else:
            self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                            max_his_len=self.args.max_his_len)

    def collate_fn(self, batch):
        """Tokenize a list of samples into padded tensors.

        Returns:
            ``(input_ids, attention_mask, target, imp_id)`` where ``target``
            is a ``LongTensor`` and ``imp_id`` is a plain list.
        """
        sentences = [x['sentence'] for x in batch]
        target = [x['target'] for x in batch]
        imp_id = [x['imp'] for x in batch]

        # padding='max_length' already pads to max_length; the deprecated
        # pad_to_max_length flag that used to accompany it is not needed.
        encode_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.args.max_tokens,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        batch_enc = encode_dict['input_ids']
        batch_attn = encode_dict['attention_mask']
        target = torch.LongTensor(target)

        return batch_enc, batch_attn, target, imp_id
154 |
155 |
156 |
157 |
158 |
159 |
--------------------------------------------------------------------------------
/Discrete-Relevance/prepro_data.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | import random
4 | import numpy as np
5 | from torch.utils.data import Dataset
6 | import pickle
7 | import os
8 | import torch
9 |
10 |
class MyDataset(Dataset):
    """Turn pickled impression logs into relevance-style prompt sentences.

    Every sample is a dict ``{'sentence', 'target', 'imp'}``: ``sentence`` is
    the filled prompt template, ``target`` is 1 for a clicked candidate and 0
    otherwise, and ``imp`` is the impression id the candidate belongs to.

    Args:
        args: namespace read for ``data_path``, ``num_negs``, ``max_his``,
            ``max_his_len`` and ``max_tokens``.
        tokenizer: object providing ``encode``, ``decode`` and
            ``batch_encode_plus`` (presumably a Hugging Face tokenizer).
        news_dict: mapping from news id to a dict with a ``'title'`` entry.
        status: ``'train'`` loads ``train.txt``, ``'val'`` loads ``val.txt``
            and any other value loads ``test.txt``.
    """

    # Slots substituted into TEMPLATE for every sample. The search string must
    # be non-empty: ``str.replace("", x)`` inserts ``x`` between all characters.
    USER_SLOT = "<user_sentence>"
    NEWS_SLOT = "<candidate_news>"
    # NOTE(review): the slot order (candidate first, user history last) was
    # reconstructed from the sentence structure -- confirm against the
    # published template before training or loading released checkpoints.
    TEMPLATE = "<candidate_news> is [MASK] to <user_sentence>"
    # Everything except ASCII letters, digits and spaces is removed from titles.
    _NON_ALNUM = re.compile(r'[^A-Za-z0-9 ]+')

    def __init__(self, args, tokenizer, news_dict, status='train'):
        self.tokenizer = tokenizer
        self.news_dict = news_dict
        self.args = args
        self.status = status

        self.data = []
        self.imp_lens = []
        if self.status == 'train':
            file_name = 'train.txt'
        elif self.status == 'val':
            file_name = 'val.txt'
        else:
            file_name = 'test.txt'
        self.data_path = os.path.join(args.data_path, file_name)
        self.load_data()

    def __len__(self):
        """Return the number of prompt samples."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the sample dict stored at position ``item``."""
        return self.data[item]

    def obtain_data(self, data):
        """Return the first four entries of the unpickled ``data``.

        ``load_data`` unpacks them as impression ids, users, times, behaviors.
        """
        return data[0], data[1], data[2], data[3]

    def _clean_title(self, title, max_words):
        """Strip non-alphanumeric characters and keep the first ``max_words`` words."""
        title = self._NON_ALNUM.sub('', title)
        return ' '.join(title.split(' ')[:max_words])

    def _user_prompt(self, clicked, news_dict, max_his, max_title_len, max_his_len):
        """Fill the user slot of the template from the click history.

        The last ``max_his`` clicks are used in reverse order, each title is
        cut to ``max_title_len`` words, and the joined history is capped at
        ``max_his_len`` tokenizer tokens before being decoded back to text.
        """
        recent_first = clicked[-max_his:][::-1]
        his_titles = [self._clean_title(news_dict[news]['title'], max_title_len)
                      for news in recent_first]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles)
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        his_sen = self.tokenizer.decode(his_sen_ids)
        return self.TEMPLATE.replace(self.USER_SLOT, his_sen)

    def _append_candidates(self, base_sentence, news_ids, news_dict, max_candi_len, target, impid):
        """Append one sample per candidate in ``news_ids`` with the given label."""
        for news in news_ids:
            title = self._clean_title(news_dict[news]['title'], max_candi_len)
            sentence = base_sentence.replace(self.NEWS_SLOT, title)
            self.data.append({'sentence': sentence, 'target': target, 'imp': impid})

    def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                     max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build training samples: every positive plus ``K_samples`` negatives.

        Each behavior is indexed as ``[history, positives, negatives]``.
        Negatives are sampled without replacement when at least ``K_samples``
        are available and with replacement otherwise.
        """
        for impid, behav in zip(imp_ids, behaviors):
            base_sentence = self._user_prompt(behav[0], news_dict, max_his,
                                              max_title_len, max_his_len)
            positives = behav[1]
            negatives = behav[2]
            self._append_candidates(base_sentence, positives, news_dict,
                                    max_candi_len, 1, impid)

            if len(negatives) >= K_samples:
                sample_negs = random.sample(negatives, k=K_samples)
            else:
                sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
            self._append_candidates(base_sentence, sample_negs, news_dict,
                                    max_candi_len, 0, impid)

    def prepro_dev(self, imp_ids, behaviors, news_dict,
                   max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build evaluation samples: every positive and every negative.

        Impressions whose click history is empty are skipped.
        """
        for impid, behav in zip(imp_ids, behaviors):
            if len(behav[0]) == 0:
                continue
            base_sentence = self._user_prompt(behav[0], news_dict, max_his,
                                              max_title_len, max_his_len)
            self._append_candidates(base_sentence, behav[1], news_dict,
                                    max_candi_len, 1, impid)
            self._append_candidates(base_sentence, behav[2], news_dict,
                                    max_candi_len, 0, impid)

    def load_data(self):
        """Unpickle ``self.data_path`` and fill ``self.data`` for the split."""
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(self.data_path, 'rb') as f:
            data = pickle.load(f)
        imps, users, times, behaviors = self.obtain_data(data)
        if self.status == 'train':
            self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                              max_his_len=self.args.max_his_len)
        else:
            self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                            max_his_len=self.args.max_his_len)

    def collate_fn(self, batch):
        """Tokenize a list of samples into padded tensors.

        Returns:
            ``(input_ids, attention_mask, target, imp_id)`` where ``target``
            is a ``LongTensor`` and ``imp_id`` is a plain list.
        """
        sentences = [x['sentence'] for x in batch]
        target = [x['target'] for x in batch]
        imp_id = [x['imp'] for x in batch]

        # padding='max_length' already pads to max_length; the deprecated
        # pad_to_max_length flag that used to accompany it is not needed.
        encode_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.args.max_tokens,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        batch_enc = encode_dict['input_ids']
        batch_attn = encode_dict['attention_mask']
        target = torch.LongTensor(target)

        return batch_enc, batch_attn, target, imp_id
160 |
161 |
162 |
163 |
164 |
165 |
--------------------------------------------------------------------------------
/Discrete-Utility/prepro_data.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | import random
4 | import numpy as np
5 | from torch.utils.data import Dataset
6 | import pickle
7 | import os
8 | import torch
9 |
10 |
11 | class MyDataset(Dataset):
12 | def __init__(self, args, tokenizer, news_dict, status='train'):
13 | self.tokenizer = tokenizer
14 | self.news_dict = news_dict
15 | self.args = args
16 | self.status = status
17 |
18 | self.data = []
19 | self.imp_lens = []
20 | if self.status == 'train':
21 | self.data_path = os.path.join(args.train_data_path, 'train.txt')
22 | elif self.status == 'val':
23 | self.data_path = os.path.join(args.data_path, 'val.txt')
24 | else:
25 | self.data_path = os.path.join(args.data_path, 'test.txt')
26 | self.load_data()
27 |
28 | def __len__(self):
29 | return len(self.data)
30 |
31 | def __getitem__(self, item):
32 | return self.data[item]
33 |
def obtain_data(self, data):
    """Return the first four entries of ``data`` as a tuple.

    The caller unpacks them as impression ids, users, times and behaviors.
    """
    imps, users, times, behaviors = data[0], data[1], data[2], data[3]
    return imps, users, times, behaviors
40 |
def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                 max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
    """Build training samples: every positive plus ``K_samples`` negatives.

    Each behavior is indexed as ``[history, positives, negatives]``. The last
    ``max_his`` history clicks are used in reverse order, history titles are
    cut to ``max_title_len`` words, candidate titles to ``max_candi_len``
    words, and the joined history is capped at ``max_his_len`` tokenizer
    tokens. Samples are appended to ``self.data`` as
    ``{'sentence', 'target', 'imp'}`` dicts.

    Negatives are sampled without replacement when at least ``K_samples`` are
    available and with replacement otherwise.
    """
    # The slots must be non-empty search strings: ``str.replace("", x)``
    # would insert ``x`` between every character of the template.
    # NOTE(review): slot positions were reconstructed from the sentence
    # structure -- confirm against the published template.
    user_slot = "<user_sentence>"
    news_slot = "<candidate_news>"
    template = "Recommending <candidate_news> to the user is a [MASK] choice according to <user_sentence>"
    non_alnum = re.compile(r'[^A-Za-z0-9 ]+')

    def clean(title, max_words):
        # Keep letters, digits and spaces only, then the first max_words words.
        return ' '.join(non_alnum.sub('', title).split(' ')[:max_words])

    for impid, behav in zip(imp_ids, behaviors):
        his_clicks = behav[0][-max_his:][::-1]
        his_titles = [clean(news_dict[news]['title'], max_title_len) for news in his_clicks]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles)
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        his_sen = self.tokenizer.decode(his_sen_ids)
        base_sentence = template.replace(user_slot, his_sen)

        positives = behav[1]
        negatives = behav[2]

        for news in positives:
            title = clean(news_dict[news]['title'], max_candi_len)
            sentence = base_sentence.replace(news_slot, title)
            self.data.append({'sentence': sentence, 'target': 1, 'imp': impid})

        if len(negatives) >= K_samples:
            sample_negs = random.sample(negatives, k=K_samples)
        else:
            sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()

        for neg in sample_negs:
            neg_title = clean(news_dict[neg]['title'], max_candi_len)
            sentence = base_sentence.replace(news_slot, neg_title)
            self.data.append({'sentence': sentence, 'target': 0, 'imp': impid})
86 |
def prepro_dev(self, imp_ids, behaviors, news_dict,
               max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
    """Build evaluation samples and append them to ``self.data``.

    Impressions whose click history (``behav[0]``) is empty are skipped.
    Every entry of ``behav[1]`` yields a sample with target 1 and every
    entry of ``behav[2]`` a sample with target 0; nothing is sampled.

    Args:
        imp_ids: Impression ids, iterated in lockstep with ``behaviors``.
        behaviors: Per-impression records indexed as ``[0]`` click history,
            ``[1]`` positive news ids, ``[2]`` negative news ids.
        news_dict: Mapping from news id to a dict with a ``'title'`` entry.
        max_his: Number of most recent history clicks that are kept.
        max_title_len: Maximum number of space-separated words per history
            title.
        max_candi_len: Maximum number of space-separated words per candidate
            title.
        max_his_len: Maximum number of tokenizer ids kept for the history
            sentence.
    """
    # NOTE(review): the slot markers were missing, so ``str.replace("", ...)``
    # matched the empty string and spliced the text between every character
    # of the template. Marker names/positions are reconstructed from the
    # template wording -- confirm against the intended prompt.
    user_slot = "<user_sentence>"
    candi_slot = "<candidate_news>"
    template = ("Recommending " + candi_slot +
                " to the user is a [MASK] choice according to " + user_slot)
    strip_symbols = re.compile(r'[^A-Za-z0-9 ]+')

    def clean_title(news_id, max_words):
        # Keep ASCII letters, digits and spaces, then cap the word count.
        title = strip_symbols.sub('', news_dict[news_id]['title'])
        return ' '.join(title.split(' ')[:max_words])

    for impid, behav in zip(imp_ids, behaviors):
        if len(behav[0]) == 0:
            continue
        # The slice copies, so reversing does not touch the caller's list.
        recent_clicks = behav[0][-max_his:]
        recent_clicks.reverse()
        his_titles = [clean_title(news, max_title_len) for news in recent_clicks]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles)
        # Truncate on tokenizer ids, then turn the kept ids back into text.
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        his_sen = self.tokenizer.decode(his_sen_ids)
        base_sentence = template.replace(user_slot, his_sen)

        for target, candidates in ((1, behav[1]), (0, behav[2])):
            for news in candidates:
                sentence = base_sentence.replace(candi_slot, clean_title(news, max_candi_len))
                self.data.append({'sentence': sentence, 'target': target, 'imp': impid})
128 |
def load_data(self):
    """Read the pickled split at ``self.data_path`` and build the samples.

    The unpickled object is passed through ``obtain_data`` and unpacked as
    ``(imps, users, times, behaviors)``; only ``imps`` and ``behaviors`` are
    used. ``prepro_train`` is called when ``self.status == 'train'``,
    ``prepro_dev`` otherwise.
    """
    # A context manager closes the file handle even if unpickling fails.
    # NOTE: pickle can execute arbitrary code; only load trusted files.
    with open(self.data_path, 'rb') as handle:
        data = pickle.load(handle)
    imps, _users, _times, behaviors = self.obtain_data(data)
    if self.status == 'train':
        self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                          max_his_len=self.args.max_his_len)
    else:
        self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                        max_his_len=self.args.max_his_len)
138 |
def collate_fn(self, batch):
    """Turn a list of samples into model inputs.

    Args:
        batch: List of sample dicts with keys ``'sentence'``, ``'target'``
            and ``'imp'``.

    Returns:
        A tuple ``(input_ids, attention_mask, target, imp_id)``: the first
        two come from the tokenizer output, ``target`` is a
        ``torch.LongTensor`` of the sample targets and ``imp_id`` is the
        plain list of impression ids.
    """
    sentences = [sample['sentence'] for sample in batch]
    labels = [sample['target'] for sample in batch]
    imp_id = [sample['imp'] for sample in batch]

    # ``padding='max_length'`` already requests fixed-length padding; the
    # deprecated ``pad_to_max_length`` flag was redundant and is dropped.
    encode_dict = self.tokenizer.batch_encode_plus(
        sentences,
        add_special_tokens=True,
        padding='max_length',
        max_length=self.args.max_tokens,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    batch_enc = encode_dict['input_ids']
    batch_attn = encode_dict['attention_mask']
    target = torch.LongTensor(labels)

    return batch_enc, batch_attn, target, imp_id
160 |
161 |
162 |
163 |
164 |
165 |
--------------------------------------------------------------------------------
/Discrete-Emotion/prepro_data.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | import random
4 | import numpy as np
5 | from torch.utils.data import Dataset
6 | import pickle
7 | import os
8 | import torch
9 |
10 |
class MyDataset(Dataset):
    """Prompt dataset for the discrete "emotion" template.

    Each impression is expanded into one sample per candidate news. A sample
    is a dict with keys ``'sentence'`` (the filled-in prompt), ``'target'``
    (1 for entries of ``behav[1]``, 0 for entries of ``behav[2]``) and
    ``'imp'`` (the impression id).
    """

    # NOTE(review): the slot markers were missing from the template, so
    # ``str.replace("", ...)`` matched the empty string and spliced the text
    # between every character. Marker names/positions are reconstructed from
    # the template wording -- confirm against the intended prompt.
    _USER_SLOT = "<user_sentence>"
    _CANDI_SLOT = "<candidate_news>"
    _TEMPLATE = ("The user feels [MASK] about " + _CANDI_SLOT +
                 " according to his area of interest " + _USER_SLOT)
    # Titles keep only ASCII letters, digits and spaces.
    _NON_ALNUM = re.compile(r'[^A-Za-z0-9 ]+')

    def __init__(self, args, tokenizer, news_dict, status='train'):
        """Store the configuration and load the split selected by ``status``.

        Args:
            args: Namespace read for ``data_path``, ``num_negs``, ``max_his``,
                ``max_his_len`` and ``max_tokens``.
            tokenizer: Object providing ``encode``, ``decode`` and
                ``batch_encode_plus``.
            news_dict: Mapping from news id to a dict with a ``'title'`` entry.
            status: ``'train'`` or ``'val'``; any other value selects
                ``test.txt``.
        """
        self.tokenizer = tokenizer
        self.news_dict = news_dict
        self.args = args
        self.status = status

        self.data = []
        self.imp_lens = []
        if self.status == 'train':
            file_name = 'train.txt'
        elif self.status == 'val':
            file_name = 'val.txt'
        else:
            file_name = 'test.txt'
        self.data_path = os.path.join(args.data_path, file_name)
        self.load_data()

    def __len__(self):
        """Return the number of samples in ``self.data``."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the sample stored at position ``item``."""
        return self.data[item]

    def obtain_data(self, data):
        """Return the first four entries of ``data`` as a 4-tuple."""
        return data[0], data[1], data[2], data[3]

    def _build_template(self):
        """Return the prompt template containing both slot markers."""
        return self._TEMPLATE

    def _clean_title(self, news_dict, news_id, max_words):
        """Return the sanitized title of ``news_id`` capped at ``max_words`` words."""
        title = self._NON_ALNUM.sub('', news_dict[news_id]['title'])
        return ' '.join(title.split(' ')[:max_words])

    def _fill_history(self, template, clicks, news_dict, max_his, max_title_len, max_his_len):
        """Return ``template`` with the user slot replaced by the history text."""
        # Most recent click first; slicing copies, so ``clicks`` is untouched.
        recent = clicks[-max_his:][::-1]
        titles = [self._clean_title(news_dict, news, max_title_len) for news in recent]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(titles)
        # Truncate on tokenizer ids, then turn the kept ids back into text.
        his_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        return template.replace(self._USER_SLOT, self.tokenizer.decode(his_ids))

    def _add_samples(self, base_sentence, candidates, news_dict, max_candi_len, target, impid):
        """Append one sample per candidate news id to ``self.data``."""
        for news in candidates:
            title = self._clean_title(news_dict, news, max_candi_len)
            self.data.append({'sentence': base_sentence.replace(self._CANDI_SLOT, title),
                              'target': target, 'imp': impid})

    def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                     max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build training samples: all positives plus ``K_samples`` negatives.

        ``behaviors`` entries are indexed as ``[0]`` click history, ``[1]``
        positives and ``[2]`` negatives. Negatives are drawn without
        replacement when enough exist, otherwise with replacement.
        """
        template = self._build_template()
        for impid, behav in zip(imp_ids, behaviors):
            base_sentence = self._fill_history(template, behav[0], news_dict,
                                               max_his, max_title_len, max_his_len)
            negatives = behav[2]
            self._add_samples(base_sentence, behav[1], news_dict, max_candi_len, 1, impid)

            if len(negatives) >= K_samples:
                sample_negs = random.sample(negatives, k=K_samples)
            else:
                sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
            self._add_samples(base_sentence, sample_negs, news_dict, max_candi_len, 0, impid)

    def prepro_dev(self, imp_ids, behaviors, news_dict,
                   max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build evaluation samples for every positive and every negative.

        Impressions with an empty click history (``behav[0]``) are skipped.
        """
        template = self._build_template()
        for impid, behav in zip(imp_ids, behaviors):
            if len(behav[0]) == 0:
                continue
            base_sentence = self._fill_history(template, behav[0], news_dict,
                                               max_his, max_title_len, max_his_len)
            self._add_samples(base_sentence, behav[1], news_dict, max_candi_len, 1, impid)
            self._add_samples(base_sentence, behav[2], news_dict, max_candi_len, 0, impid)

    def load_data(self):
        """Read the pickled split at ``self.data_path`` and build the samples."""
        # A context manager closes the file handle even if unpickling fails.
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(self.data_path, 'rb') as handle:
            data = pickle.load(handle)
        imps, _users, _times, behaviors = self.obtain_data(data)
        if self.status == 'train':
            self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                              max_his_len=self.args.max_his_len)
        else:
            self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                            max_his_len=self.args.max_his_len)

    def collate_fn(self, batch):
        """Return ``(input_ids, attention_mask, target, imp_id)`` for ``batch``.

        ``target`` is a ``torch.LongTensor``; ``imp_id`` stays a plain list.
        """
        sentences = [sample['sentence'] for sample in batch]
        labels = [sample['target'] for sample in batch]
        imp_id = [sample['imp'] for sample in batch]

        # ``padding='max_length'`` already requests fixed-length padding; the
        # deprecated ``pad_to_max_length`` flag was redundant and is dropped.
        encode_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.args.max_tokens,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return (encode_dict['input_ids'], encode_dict['attention_mask'],
                torch.LongTensor(labels), imp_id)
160 |
161 |
162 |
163 |
164 |
165 |
--------------------------------------------------------------------------------
/Continuous-Emotion/prepro_data.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | import random
4 | import numpy as np
5 | from torch.utils.data import Dataset
6 | import pickle
7 | import os
8 | import torch
9 |
10 |
class MyDataset(Dataset):
    """Prompt dataset for the continuous "emotion" template.

    The template is assembled from three groups of continuous prompt tokens
    (``conti_tokens[0..2]``). Each impression is expanded into one sample per
    candidate news; a sample is a dict with keys ``'sentence'``,
    ``'target'`` (1 for ``behav[1]`` entries, 0 for ``behav[2]`` entries)
    and ``'imp'``.
    """

    # NOTE(review): the slot markers were missing from the template parts, so
    # ``str.replace("", ...)`` matched the empty string and spliced the text
    # between every character. Which token group carries which slot is
    # reconstructed (group 0: user history, group 1: candidate news) --
    # confirm against the intended prompt.
    _USER_SLOT = "<user_sentence>"
    _CANDI_SLOT = "<candidate_news>"
    # Titles keep only ASCII letters, digits and spaces.
    _NON_ALNUM = re.compile(r'[^A-Za-z0-9 ]+')

    def __init__(self, args, tokenizer, news_dict, conti_tokens, status='train'):
        """Store the configuration and load the split selected by ``status``.

        Args:
            args: Namespace read for ``data_path``, ``num_negs``, ``max_his``,
                ``max_his_len`` and ``max_tokens``.
            tokenizer: Object providing ``encode``, ``decode`` and
                ``batch_encode_plus``.
            news_dict: Mapping from news id to a dict with a ``'title'`` entry.
            conti_tokens: Three iterables of token strings, each joined
                without a separator when the template is built.
            status: ``'train'`` or ``'val'``; any other value selects
                ``test.txt``.
        """
        self.tokenizer = tokenizer
        self.news_dict = news_dict
        self.args = args
        self.status = status
        self.conti_tokens = conti_tokens

        self.data = []
        self.imp_lens = []
        if self.status == 'train':
            file_name = 'train.txt'
        elif self.status == 'val':
            file_name = 'val.txt'
        else:
            file_name = 'test.txt'
        self.data_path = os.path.join(args.data_path, file_name)
        self.load_data()

    def __len__(self):
        """Return the number of samples in ``self.data``."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the sample stored at position ``item``."""
        return self.data[item]

    def obtain_data(self, data):
        """Return the first four entries of ``data`` as a 4-tuple."""
        return data[0], data[1], data[2], data[3]

    def _build_template(self):
        """Return the template: mask part, candidate part, then user part."""
        user_part = ''.join(self.conti_tokens[0]) + self._USER_SLOT
        candi_part = ''.join(self.conti_tokens[1]) + self._CANDI_SLOT
        mask_part = ''.join(self.conti_tokens[2]) + "[MASK]"
        return mask_part + candi_part + user_part

    def _clean_title(self, news_dict, news_id, max_words):
        """Return the sanitized title of ``news_id`` capped at ``max_words`` words."""
        title = self._NON_ALNUM.sub('', news_dict[news_id]['title'])
        return ' '.join(title.split(' ')[:max_words])

    def _fill_history(self, template, clicks, news_dict, max_his, max_title_len, max_his_len):
        """Return ``template`` with the user slot replaced by the history text."""
        # Most recent click first; slicing copies, so ``clicks`` is untouched.
        recent = clicks[-max_his:][::-1]
        titles = [self._clean_title(news_dict, news, max_title_len) for news in recent]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(titles)
        # Truncate on tokenizer ids, then turn the kept ids back into text.
        his_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        return template.replace(self._USER_SLOT, self.tokenizer.decode(his_ids))

    def _add_samples(self, base_sentence, candidates, news_dict, max_candi_len, target, impid):
        """Append one sample per candidate news id to ``self.data``."""
        for news in candidates:
            title = self._clean_title(news_dict, news, max_candi_len)
            self.data.append({'sentence': base_sentence.replace(self._CANDI_SLOT, title),
                              'target': target, 'imp': impid})

    def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                     max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build training samples: all positives plus ``K_samples`` negatives.

        ``behaviors`` entries are indexed as ``[0]`` click history, ``[1]``
        positives and ``[2]`` negatives. Negatives are drawn without
        replacement when enough exist, otherwise with replacement.
        """
        template = self._build_template()
        for impid, behav in zip(imp_ids, behaviors):
            base_sentence = self._fill_history(template, behav[0], news_dict,
                                               max_his, max_title_len, max_his_len)
            negatives = behav[2]
            self._add_samples(base_sentence, behav[1], news_dict, max_candi_len, 1, impid)

            if len(negatives) >= K_samples:
                sample_negs = random.sample(negatives, k=K_samples)
            else:
                sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
            self._add_samples(base_sentence, sample_negs, news_dict, max_candi_len, 0, impid)

    def prepro_dev(self, imp_ids, behaviors, news_dict,
                   max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build evaluation samples for every positive and every negative.

        Impressions with an empty click history (``behav[0]``) are skipped.
        """
        template = self._build_template()
        for impid, behav in zip(imp_ids, behaviors):
            if len(behav[0]) == 0:
                continue
            base_sentence = self._fill_history(template, behav[0], news_dict,
                                               max_his, max_title_len, max_his_len)
            self._add_samples(base_sentence, behav[1], news_dict, max_candi_len, 1, impid)
            self._add_samples(base_sentence, behav[2], news_dict, max_candi_len, 0, impid)

    def load_data(self):
        """Read the pickled split at ``self.data_path`` and build the samples."""
        # A context manager closes the file handle even if unpickling fails.
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(self.data_path, 'rb') as handle:
            data = pickle.load(handle)
        imps, _users, _times, behaviors = self.obtain_data(data)
        if self.status == 'train':
            self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                              max_his_len=self.args.max_his_len)
        else:
            self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                            max_his_len=self.args.max_his_len)

    def collate_fn(self, batch):
        """Return ``(input_ids, attention_mask, target, imp_id)`` for ``batch``.

        ``target`` is a ``torch.LongTensor``; ``imp_id`` stays a plain list.
        """
        sentences = [sample['sentence'] for sample in batch]
        labels = [sample['target'] for sample in batch]
        imp_id = [sample['imp'] for sample in batch]

        # ``padding='max_length'`` already requests fixed-length padding; the
        # deprecated ``pad_to_max_length`` flag was redundant and is dropped.
        encode_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.args.max_tokens,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return (encode_dict['input_ids'], encode_dict['attention_mask'],
                torch.LongTensor(labels), imp_id)
167 |
168 |
169 |
170 |
171 |
172 |
--------------------------------------------------------------------------------
/Continuous-Relevance/prepro_data.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | import random
4 | import numpy as np
5 | from torch.utils.data import Dataset
6 | import pickle
7 | import os
8 | import torch
9 |
10 |
class MyDataset(Dataset):
    """Prompt dataset for the continuous "relevance" template.

    The template is assembled from three groups of continuous prompt tokens
    (``conti_tokens[0..2]``). Each impression is expanded into one sample per
    candidate news; a sample is a dict with keys ``'sentence'``,
    ``'target'`` (1 for ``behav[1]`` entries, 0 for ``behav[2]`` entries)
    and ``'imp'``.
    """

    # NOTE(review): the slot markers were missing from the template parts, so
    # ``str.replace("", ...)`` matched the empty string and spliced the text
    # between every character. Which token group carries which slot is
    # reconstructed (group 0: user history, group 1: candidate news) --
    # confirm against the intended prompt.
    _USER_SLOT = "<user_sentence>"
    _CANDI_SLOT = "<candidate_news>"
    # Titles keep only ASCII letters, digits and spaces.
    _NON_ALNUM = re.compile(r'[^A-Za-z0-9 ]+')

    def __init__(self, args, tokenizer, news_dict, conti_tokens, status='train'):
        """Store the configuration and load the split selected by ``status``.

        Args:
            args: Namespace read for ``data_path``, ``num_negs``, ``max_his``,
                ``max_his_len`` and ``max_tokens``.
            tokenizer: Object providing ``encode``, ``decode`` and
                ``batch_encode_plus``.
            news_dict: Mapping from news id to a dict with a ``'title'`` entry.
            conti_tokens: Three iterables of token strings, each joined
                without a separator when the template is built.
            status: ``'train'`` or ``'val'``; any other value selects
                ``test.txt``.
        """
        self.tokenizer = tokenizer
        self.news_dict = news_dict
        self.args = args
        self.status = status
        self.conti_tokens = conti_tokens

        self.data = []
        self.imp_lens = []
        if self.status == 'train':
            file_name = 'train.txt'
        elif self.status == 'val':
            file_name = 'val.txt'
        else:
            file_name = 'test.txt'
        self.data_path = os.path.join(args.data_path, file_name)
        self.load_data()

    def __len__(self):
        """Return the number of samples in ``self.data``."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the sample stored at position ``item``."""
        return self.data[item]

    def obtain_data(self, data):
        """Return the first four entries of ``data`` as a 4-tuple."""
        return data[0], data[1], data[2], data[3]

    def _build_template(self):
        """Return the template: candidate part, mask part, then user part."""
        user_part = ''.join(self.conti_tokens[0]) + self._USER_SLOT
        candi_part = ''.join(self.conti_tokens[1]) + self._CANDI_SLOT
        mask_part = ''.join(self.conti_tokens[2]) + "[MASK]"
        return candi_part + mask_part + user_part

    def _clean_title(self, news_dict, news_id, max_words):
        """Return the sanitized title of ``news_id`` capped at ``max_words`` words."""
        title = self._NON_ALNUM.sub('', news_dict[news_id]['title'])
        return ' '.join(title.split(' ')[:max_words])

    def _fill_history(self, template, clicks, news_dict, max_his, max_title_len, max_his_len):
        """Return ``template`` with the user slot replaced by the history text."""
        # Most recent click first; slicing copies, so ``clicks`` is untouched.
        recent = clicks[-max_his:][::-1]
        titles = [self._clean_title(news_dict, news, max_title_len) for news in recent]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(titles)
        # Truncate on tokenizer ids, then turn the kept ids back into text.
        his_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        return template.replace(self._USER_SLOT, self.tokenizer.decode(his_ids))

    def _add_samples(self, base_sentence, candidates, news_dict, max_candi_len, target, impid):
        """Append one sample per candidate news id to ``self.data``."""
        for news in candidates:
            title = self._clean_title(news_dict, news, max_candi_len)
            self.data.append({'sentence': base_sentence.replace(self._CANDI_SLOT, title),
                              'target': target, 'imp': impid})

    def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                     max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build training samples: all positives plus ``K_samples`` negatives.

        ``behaviors`` entries are indexed as ``[0]`` click history, ``[1]``
        positives and ``[2]`` negatives. Negatives are drawn without
        replacement when enough exist, otherwise with replacement.
        """
        template = self._build_template()
        for impid, behav in zip(imp_ids, behaviors):
            base_sentence = self._fill_history(template, behav[0], news_dict,
                                               max_his, max_title_len, max_his_len)
            negatives = behav[2]
            self._add_samples(base_sentence, behav[1], news_dict, max_candi_len, 1, impid)

            if len(negatives) >= K_samples:
                sample_negs = random.sample(negatives, k=K_samples)
            else:
                sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
            self._add_samples(base_sentence, sample_negs, news_dict, max_candi_len, 0, impid)

    def prepro_dev(self, imp_ids, behaviors, news_dict,
                   max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build evaluation samples for every positive and every negative.

        Impressions with an empty click history (``behav[0]``) are skipped.
        """
        template = self._build_template()
        for impid, behav in zip(imp_ids, behaviors):
            if len(behav[0]) == 0:
                continue
            base_sentence = self._fill_history(template, behav[0], news_dict,
                                               max_his, max_title_len, max_his_len)
            self._add_samples(base_sentence, behav[1], news_dict, max_candi_len, 1, impid)
            self._add_samples(base_sentence, behav[2], news_dict, max_candi_len, 0, impid)

    def load_data(self):
        """Read the pickled split at ``self.data_path`` and build the samples."""
        # A context manager closes the file handle even if unpickling fails.
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(self.data_path, 'rb') as handle:
            data = pickle.load(handle)
        imps, _users, _times, behaviors = self.obtain_data(data)
        if self.status == 'train':
            self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                              max_his_len=self.args.max_his_len)
        else:
            self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                            max_his_len=self.args.max_his_len)

    def collate_fn(self, batch):
        """Return ``(input_ids, attention_mask, target, imp_id)`` for ``batch``.

        ``target`` is a ``torch.LongTensor``; ``imp_id`` stays a plain list.
        """
        sentences = [sample['sentence'] for sample in batch]
        labels = [sample['target'] for sample in batch]
        imp_id = [sample['imp'] for sample in batch]

        # ``padding='max_length'`` already requests fixed-length padding; the
        # deprecated ``pad_to_max_length`` flag was redundant and is dropped.
        encode_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.args.max_tokens,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return (encode_dict['input_ids'], encode_dict['attention_mask'],
                torch.LongTensor(labels), imp_id)
167 |
168 |
169 |
170 |
171 |
172 |
--------------------------------------------------------------------------------
/Continuous-Utility/prepro_data.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | import random
4 | import numpy as np
5 | from torch.utils.data import Dataset
6 | import pickle
7 | import os
8 | import torch
9 |
10 |
class MyDataset(Dataset):
    """Prompt dataset for the continuous "utility" template.

    The template is assembled from three groups of continuous prompt tokens
    (``conti_tokens[0..2]``). Each impression is expanded into one sample per
    candidate news; a sample is a dict with keys ``'sentence'``,
    ``'target'`` (1 for ``behav[1]`` entries, 0 for ``behav[2]`` entries)
    and ``'imp'``.
    """

    # NOTE(review): the slot markers were missing from the template parts, so
    # ``str.replace("", ...)`` matched the empty string and spliced the text
    # between every character. Which token group carries which slot is
    # reconstructed (group 0: user history, group 1: candidate news) --
    # confirm against the intended prompt.
    _USER_SLOT = "<user_sentence>"
    _CANDI_SLOT = "<candidate_news>"
    # Titles keep only ASCII letters, digits and spaces.
    _NON_ALNUM = re.compile(r'[^A-Za-z0-9 ]+')

    def __init__(self, args, tokenizer, news_dict, conti_tokens, status='train'):
        """Store the configuration and load the split selected by ``status``.

        Args:
            args: Namespace read for ``train_data_path`` (train split only),
                ``data_path`` (other splits), ``num_negs``, ``max_his``,
                ``max_his_len`` and ``max_tokens``.
            tokenizer: Object providing ``encode``, ``decode`` and
                ``batch_encode_plus``.
            news_dict: Mapping from news id to a dict with a ``'title'`` entry.
            conti_tokens: Three iterables of token strings, each joined
                without a separator when the template is built.
            status: ``'train'`` or ``'val'``; any other value selects
                ``test.txt``.
        """
        self.tokenizer = tokenizer
        self.news_dict = news_dict
        self.args = args
        self.status = status
        self.conti_tokens = conti_tokens

        self.data = []
        self.imp_lens = []
        # The training split lives under its own root directory.
        if self.status == 'train':
            self.data_path = os.path.join(args.train_data_path, 'train.txt')
        elif self.status == 'val':
            self.data_path = os.path.join(args.data_path, 'val.txt')
        else:
            self.data_path = os.path.join(args.data_path, 'test.txt')
        self.load_data()

    def __len__(self):
        """Return the number of samples in ``self.data``."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the sample stored at position ``item``."""
        return self.data[item]

    def obtain_data(self, data):
        """Return the first four entries of ``data`` as a 4-tuple."""
        return data[0], data[1], data[2], data[3]

    def _build_template(self):
        """Return the template: candidate part, mask part, then user part."""
        user_part = ''.join(self.conti_tokens[0]) + self._USER_SLOT
        candi_part = ''.join(self.conti_tokens[1]) + self._CANDI_SLOT
        mask_part = ''.join(self.conti_tokens[2]) + "[MASK]"
        return candi_part + mask_part + user_part

    def _clean_title(self, news_dict, news_id, max_words):
        """Return the sanitized title of ``news_id`` capped at ``max_words`` words."""
        title = self._NON_ALNUM.sub('', news_dict[news_id]['title'])
        return ' '.join(title.split(' ')[:max_words])

    def _fill_history(self, template, clicks, news_dict, max_his, max_title_len, max_his_len):
        """Return ``template`` with the user slot replaced by the history text."""
        # Most recent click first; slicing copies, so ``clicks`` is untouched.
        recent = clicks[-max_his:][::-1]
        titles = [self._clean_title(news_dict, news, max_title_len) for news in recent]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(titles)
        # Truncate on tokenizer ids, then turn the kept ids back into text.
        his_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        return template.replace(self._USER_SLOT, self.tokenizer.decode(his_ids))

    def _add_samples(self, base_sentence, candidates, news_dict, max_candi_len, target, impid):
        """Append one sample per candidate news id to ``self.data``."""
        for news in candidates:
            title = self._clean_title(news_dict, news, max_candi_len)
            self.data.append({'sentence': base_sentence.replace(self._CANDI_SLOT, title),
                              'target': target, 'imp': impid})

    def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                     max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build training samples: all positives plus ``K_samples`` negatives.

        ``behaviors`` entries are indexed as ``[0]`` click history, ``[1]``
        positives and ``[2]`` negatives. Negatives are drawn without
        replacement when enough exist, otherwise with replacement.
        """
        template = self._build_template()
        for impid, behav in zip(imp_ids, behaviors):
            base_sentence = self._fill_history(template, behav[0], news_dict,
                                               max_his, max_title_len, max_his_len)
            negatives = behav[2]
            self._add_samples(base_sentence, behav[1], news_dict, max_candi_len, 1, impid)

            if len(negatives) >= K_samples:
                sample_negs = random.sample(negatives, k=K_samples)
            else:
                sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
            self._add_samples(base_sentence, sample_negs, news_dict, max_candi_len, 0, impid)

    def prepro_dev(self, imp_ids, behaviors, news_dict,
                   max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build evaluation samples for every positive and every negative.

        Impressions with an empty click history (``behav[0]``) are skipped.
        """
        template = self._build_template()
        for impid, behav in zip(imp_ids, behaviors):
            if len(behav[0]) == 0:
                continue
            base_sentence = self._fill_history(template, behav[0], news_dict,
                                               max_his, max_title_len, max_his_len)
            self._add_samples(base_sentence, behav[1], news_dict, max_candi_len, 1, impid)
            self._add_samples(base_sentence, behav[2], news_dict, max_candi_len, 0, impid)

    def load_data(self):
        """Read the pickled split at ``self.data_path`` and build the samples."""
        # A context manager closes the file handle even if unpickling fails.
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(self.data_path, 'rb') as handle:
            data = pickle.load(handle)
        imps, _users, _times, behaviors = self.obtain_data(data)
        if self.status == 'train':
            self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                              max_his_len=self.args.max_his_len)
        else:
            self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                            max_his_len=self.args.max_his_len)

    def collate_fn(self, batch):
        """Return ``(input_ids, attention_mask, target, imp_id)`` for ``batch``.

        ``target`` is a ``torch.LongTensor``; ``imp_id`` stays a plain list.
        """
        sentences = [sample['sentence'] for sample in batch]
        labels = [sample['target'] for sample in batch]
        imp_id = [sample['imp'] for sample in batch]

        # ``padding='max_length'`` already requests fixed-length padding; the
        # deprecated ``pad_to_max_length`` flag was redundant and is dropped.
        encode_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.args.max_tokens,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return (encode_dict['input_ids'], encode_dict['attention_mask'],
                torch.LongTensor(labels), imp_id)
167 |
168 |
169 |
170 |
171 |
172 |
--------------------------------------------------------------------------------
/Hybrid-Emotion/prepro_data.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | import random
4 | import numpy as np
5 | from torch.utils.data import Dataset
6 | import pickle
7 | import os
8 | import torch
9 |
10 |
11 | class MyDataset(Dataset):
12 | def __init__(self, args, tokenizer, news_dict, conti_tokens, status='train'):
13 | self.tokenizer = tokenizer
14 | self.news_dict = news_dict
15 | self.args = args
16 | self.status = status
17 | self.conti_tokens = conti_tokens
18 |
19 | self.data = []
20 | self.imp_lens = []
21 | if self.status == 'train':
22 | self.data_path = os.path.join(args.data_path, 'train.txt')
23 | elif self.status == 'val':
24 | self.data_path = os.path.join(args.data_path, 'val.txt')
25 | else:
26 | self.data_path = os.path.join(args.data_path, 'test.txt')
27 | self.load_data()
28 |
29 | def __len__(self):
30 | return len(self.data)
31 |
32 | def __getitem__(self, item):
33 | return self.data[item]
34 |
35 | def obtain_data(self, data):
36 | # if self.status == 'train':
37 | # return data[0][:20], data[1][:20], data[2][:20], data[3][:20]
38 | # else:
39 | # return data[0], data[1], data[2], data[3]
40 | return data[0], data[1], data[2], data[3]
41 |
42 | def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
43 | max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
44 | template1 = ''.join(self.conti_tokens[0]) + ""
45 | template2 = ''.join(self.conti_tokens[1]) + ""
46 | template3 = "The user feels [MASK] about the news"
47 | template = template1 + "[SEP]" + template2 + "[SEP]" + template3
48 |
49 | for impid, behav in zip(imp_ids, behaviors):
50 | his_clicks = behav[0][-max_his:]
51 | his_clicks.reverse()
52 | his_titles = []
53 | for news in his_clicks:
54 | title = news_dict[news]['title']
55 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title)
56 |
57 | title = ' '.join(title.split(' ')[:max_title_len])
58 |
59 | his_titles.append(title)
60 | his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles)
61 | his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
62 | his_sen = self.tokenizer.decode(his_sen_ids)
63 | base_sentence = template.replace("", his_sen)
64 |
65 | positives = behav[1]
66 | negatives = behav[2]
67 |
68 | for news in positives:
69 | title = news_dict[news]['title']
70 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title)
71 |
72 | title = ' '.join(title.split(' ')[:max_candi_len])
73 |
74 | sentence = base_sentence.replace("", title)
75 | self.data.append({'sentence': sentence, 'target': 1, 'imp': impid})
76 |
77 | if len(negatives) >= K_samples:
78 | sample_negs = random.sample(negatives, k=K_samples)
79 | else:
80 | sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
81 |
82 | for neg in sample_negs:
83 | neg_title = news_dict[neg]['title']
84 | neg_title = re.sub(r'[^A-Za-z0-9 ]+', '', neg_title)
85 |
86 | neg_title = ' '.join(neg_title.split(' ')[:max_candi_len])
87 |
88 | sentence = base_sentence.replace("", neg_title)
89 | self.data.append({'sentence': sentence, 'target': 0, 'imp': impid})
90 |
91 | def prepro_dev(self, imp_ids, behaviors, news_dict,
92 | max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
93 | template1 = ''.join(self.conti_tokens[0]) + ""
94 | template2 = ''.join(self.conti_tokens[1]) + ""
95 | template3 = "The user feels [MASK] about the news"
96 | template = template1 + "[SEP]" + template2 + "[SEP]" + template3
97 |
98 | for impid, behav in zip(imp_ids, behaviors):
99 | if len(behav[0]) == 0:
100 | continue
101 | his_clicks = behav[0][-max_his:]
102 | his_clicks.reverse()
103 | his_titles = []
104 | for news in his_clicks:
105 | title = news_dict[news]['title']
106 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title)
107 |
108 | title = ' '.join(title.split(' ')[:max_title_len])
109 |
110 | his_titles.append(title)
111 | his_sen = '[NSEP] ' + ' [NSEP] '.join(his_titles)
112 | his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
113 | his_sen = self.tokenizer.decode(his_sen_ids)
114 | base_sentence = template.replace("", his_sen)
115 |
116 | positives = behav[1]
117 | negatives = behav[2]
118 | for news in positives:
119 | title = news_dict[news]['title']
120 | title = re.sub(r'[^A-Za-z0-9 ]+', '', title)
121 |
122 | title = ' '.join(title.split(' ')[:max_candi_len])
123 |
124 | sentence = base_sentence.replace("", title)
125 | self.data.append({'sentence': sentence, 'target': 1, 'imp': impid})
126 |
127 | for neg in negatives:
128 | neg_title = news_dict[neg]['title']
129 | neg_title = re.sub(r'[^A-Za-z0-9 ]+', '', neg_title)
130 |
131 | neg_title = ' '.join(neg_title.split(' ')[:max_candi_len])
132 |
133 | sentence = base_sentence.replace("", neg_title)
134 | self.data.append({'sentence': sentence, 'target': 0, 'imp': impid})
135 |
def load_data(self):
    """Load the pickled split at ``self.data_path`` and build ``self.data``.

    Training splits go through ``prepro_train`` (with negative sampling
    controlled by ``self.args.num_negs``); every other split goes through
    ``prepro_dev``.
    """
    # NOTE(review): pickle executes arbitrary code on load; only use with
    # data files produced by this project's own preprocessing.
    with open(self.data_path, 'rb') as f:  # close the handle deterministically
        data = pickle.load(f)
    imps, users, times, behaviors = self.obtain_data(data)
    if self.status == 'train':
        self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                          max_his_len=self.args.max_his_len)
    else:
        self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                        max_his_len=self.args.max_his_len)
145 |
def collate_fn(self, batch):
    """Turn a list of sample dicts into model-ready batch tensors.

    Returns:
        ``(input_ids, attention_mask, target, imp_id)`` where the first two
        come from the tokenizer, ``target`` is a ``torch.LongTensor`` of the
        samples' labels and ``imp_id`` is the plain list of impression ids.
    """
    texts, labels, imp_id = [], [], []
    for sample in batch:
        texts.append(sample['sentence'])
        labels.append(sample['target'])
        imp_id.append(sample['imp'])

    # Every sentence is padded/truncated to exactly args.max_tokens ids.
    tokenizer_options = dict(
        add_special_tokens=True,
        padding='max_length',
        max_length=self.args.max_tokens,
        truncation=True,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoded = self.tokenizer.batch_encode_plus(texts, **tokenizer_options)

    return encoded['input_ids'], encoded['attention_mask'], torch.LongTensor(labels), imp_id
167 |
168 |
169 |
170 |
171 |
172 |
--------------------------------------------------------------------------------
/Continuous-Action/prepro_data.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | import random
4 | import numpy as np
5 | from torch.utils.data import Dataset
6 | import pickle
7 | import os
8 | import torch
9 |
10 |
class MyDataset(Dataset):
    """Prompt-style click-prediction dataset built from pickled impression logs.

    Every element of ``self.data`` is a dict with keys ``'sentence'`` (the
    filled prompt), ``'target'`` (1 for a positive candidate, 0 for a negative
    one) and ``'imp'`` (the impression id the candidate belongs to).

    The prompt has the shape
    ``<conti_tokens[0]><history>[SEP]<conti_tokens[1]><candidate>[SEP]<conti_tokens[2]>[MASK]``.
    """

    def __init__(self, args, tokenizer, news_dict, conti_tokens, status='train'):
        """Store the configuration and immediately load the requested split.

        Args:
            args: namespace providing ``data_path``, ``max_his``,
                ``max_his_len``, ``max_tokens`` and (for training) ``num_negs``.
            tokenizer: tokenizer offering ``encode``, ``decode`` and
                ``batch_encode_plus``.
            news_dict: maps a news id to a dict holding at least ``'title'``.
            conti_tokens: three sequences of token strings that are joined
                into the prompt prefixes.
            status: ``'train'``, ``'val'`` or anything else for the test split.
        """
        self.tokenizer = tokenizer
        self.news_dict = news_dict
        self.args = args
        self.status = status
        self.conti_tokens = conti_tokens

        self.data = []
        self.imp_lens = []
        # The split files are pickles despite their .txt suffix (see load_data).
        if self.status == 'train':
            self.data_path = os.path.join(args.data_path, 'train.txt')
        elif self.status == 'val':
            self.data_path = os.path.join(args.data_path, 'val.txt')
        else:
            self.data_path = os.path.join(args.data_path, 'test.txt')
        self.load_data()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return self.data[item]

    def obtain_data(self, data):
        """Return the first four fields of the unpickled split.

        ``load_data`` unpacks them as ``(imps, users, times, behaviors)``.
        """
        # NOTE: to debug on a small subset, slice each field here (e.g. [:20]).
        return data[0], data[1], data[2], data[3]

    @staticmethod
    def _clean_title(title, max_words):
        """Drop everything but letters, digits and spaces; keep ``max_words`` words."""
        title = re.sub(r'[^A-Za-z0-9 ]+', '', title)
        return ' '.join(title.split(' ')[:max_words])

    def _prompt_head(self, clicks, news_dict, max_his, max_title_len, max_his_len):
        """Build the prompt up to (excluding) the candidate title."""
        # Most recent clicks first; the slice copies, so ``clicks`` is untouched.
        recent = clicks[-max_his:][::-1]
        titles = [self._clean_title(news_dict[news]['title'], max_title_len) for news in recent]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(titles)
        # Round-trip through the tokenizer to cap the history at max_his_len ids.
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        his_sen = self.tokenizer.decode(his_sen_ids)
        return ''.join(self.conti_tokens[0]) + his_sen + "[SEP]" + ''.join(self.conti_tokens[1])

    def _prompt_tail(self):
        """Build the prompt part that follows the candidate title."""
        return "[SEP]" + ''.join(self.conti_tokens[2]) + "[MASK]"

    def _append_samples(self, head, tail, news_ids, news_dict, max_candi_len, target, impid):
        """Append one sample per id in ``news_ids`` to ``self.data``."""
        # Direct concatenation on purpose: the previous template used
        # empty-string placeholders, and str.replace("", x) inserts x
        # between every character of the template.
        for news in news_ids:
            title = self._clean_title(news_dict[news]['title'], max_candi_len)
            self.data.append({'sentence': head + title + tail, 'target': target, 'imp': impid})

    def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                     max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build training samples: all positives plus ``K_samples`` sampled negatives.

        Negatives are drawn without replacement when enough exist, otherwise
        with replacement.  An impression without negatives contributes only
        its positives.
        """
        tail = self._prompt_tail()
        for impid, behav in zip(imp_ids, behaviors):
            head = self._prompt_head(behav[0], news_dict, max_his, max_title_len, max_his_len)
            positives, negatives = behav[1], behav[2]
            self._append_samples(head, tail, positives, news_dict, max_candi_len, 1, impid)

            if len(negatives) == 0:
                # np.random.choice raises on an empty population.
                continue
            if len(negatives) >= K_samples:
                sample_negs = random.sample(negatives, k=K_samples)
            else:
                sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
            self._append_samples(head, tail, sample_negs, news_dict, max_candi_len, 0, impid)

    def prepro_dev(self, imp_ids, behaviors, news_dict,
                   max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build evaluation samples: every positive and every negative candidate.

        Impressions whose history is empty are skipped.
        """
        tail = self._prompt_tail()
        for impid, behav in zip(imp_ids, behaviors):
            if len(behav[0]) == 0:
                continue
            head = self._prompt_head(behav[0], news_dict, max_his, max_title_len, max_his_len)
            self._append_samples(head, tail, behav[1], news_dict, max_candi_len, 1, impid)
            self._append_samples(head, tail, behav[2], news_dict, max_candi_len, 0, impid)

    def load_data(self):
        """Unpickle ``self.data_path`` and fill ``self.data`` for the current split."""
        # NOTE(review): pickle executes arbitrary code on load; only use with
        # data files produced by this project's own preprocessing.
        with open(self.data_path, 'rb') as f:
            data = pickle.load(f)
        imps, users, times, behaviors = self.obtain_data(data)
        if self.status == 'train':
            self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                              max_his_len=self.args.max_his_len)
        else:
            self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                            max_his_len=self.args.max_his_len)

    def collate_fn(self, batch):
        """Tokenize a list of samples into ``(input_ids, attention_mask, target, imp_id)``."""
        sentences = [x['sentence'] for x in batch]
        target = [x['target'] for x in batch]
        imp_id = [x['imp'] for x in batch]

        # Every sentence is padded/truncated to exactly args.max_tokens ids.
        encode_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.args.max_tokens,
            truncation=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        batch_enc = encode_dict['input_ids']
        batch_attn = encode_dict['attention_mask']
        target = torch.LongTensor(target)

        return batch_enc, batch_attn, target, imp_id
167 |
168 |
169 |
170 |
171 |
172 |
--------------------------------------------------------------------------------
/Hybrid-Action/prepro_data.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | import random
4 | import numpy as np
5 | from torch.utils.data import Dataset
6 | import pickle
7 | import os
8 | import torch
9 |
10 |
class MyDataset(Dataset):
    """Prompt-style click-prediction dataset built from pickled impression logs.

    Every element of ``self.data`` is a dict with keys ``'sentence'`` (the
    filled prompt), ``'target'`` (1 for a positive candidate, 0 for a negative
    one) and ``'imp'`` (the impression id the candidate belongs to).

    The prompt has the shape
    ``<conti_tokens[0]><history>[SEP]<conti_tokens[1]><candidate>[SEP]Does the user click the news? [MASK]``.
    """

    def __init__(self, args, tokenizer, news_dict, conti_tokens, status='train'):
        """Store the configuration and immediately load the requested split.

        Args:
            args: namespace providing ``train_data_path`` (training split),
                ``data_path`` (other splits), ``max_his``, ``max_his_len``,
                ``max_tokens`` and (for training) ``num_negs``.
            tokenizer: tokenizer offering ``encode``, ``decode`` and
                ``batch_encode_plus``.
            news_dict: maps a news id to a dict holding at least ``'title'``.
            conti_tokens: two sequences of token strings that are joined into
                the prompt prefixes.
            status: ``'train'``, ``'val'`` or anything else for the test split.
        """
        self.tokenizer = tokenizer
        self.news_dict = news_dict
        self.args = args
        self.status = status
        self.conti_tokens = conti_tokens

        self.data = []
        self.imp_lens = []
        # The split files are pickles despite their .txt suffix (see load_data).
        if self.status == 'train':
            # The training split is read from its own directory.
            self.data_path = os.path.join(args.train_data_path, 'train.txt')
        elif self.status == 'val':
            self.data_path = os.path.join(args.data_path, 'val.txt')
        else:
            self.data_path = os.path.join(args.data_path, 'test.txt')
        self.load_data()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return self.data[item]

    def obtain_data(self, data):
        """Return the first four fields of the unpickled split.

        ``load_data`` unpacks them as ``(imps, users, times, behaviors)``.
        """
        # NOTE: to debug on a small subset, slice each field here (e.g. [:20]).
        return data[0], data[1], data[2], data[3]

    @staticmethod
    def _clean_title(title, max_words):
        """Drop everything but letters, digits and spaces; keep ``max_words`` words."""
        title = re.sub(r'[^A-Za-z0-9 ]+', '', title)
        return ' '.join(title.split(' ')[:max_words])

    def _prompt_head(self, clicks, news_dict, max_his, max_title_len, max_his_len):
        """Build the prompt up to (excluding) the candidate title."""
        # Most recent clicks first; the slice copies, so ``clicks`` is untouched.
        recent = clicks[-max_his:][::-1]
        titles = [self._clean_title(news_dict[news]['title'], max_title_len) for news in recent]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(titles)
        # Round-trip through the tokenizer to cap the history at max_his_len ids.
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        his_sen = self.tokenizer.decode(his_sen_ids)
        return ''.join(self.conti_tokens[0]) + his_sen + "[SEP]" + ''.join(self.conti_tokens[1])

    def _prompt_tail(self):
        """Build the prompt part that follows the candidate title."""
        return "[SEP]" + "Does the user click the news? [MASK]"

    def _append_samples(self, head, tail, news_ids, news_dict, max_candi_len, target, impid):
        """Append one sample per id in ``news_ids`` to ``self.data``."""
        # Direct concatenation on purpose: the previous template used
        # empty-string placeholders, and str.replace("", x) inserts x
        # between every character of the template.
        for news in news_ids:
            title = self._clean_title(news_dict[news]['title'], max_candi_len)
            self.data.append({'sentence': head + title + tail, 'target': target, 'imp': impid})

    def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                     max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build training samples: all positives plus ``K_samples`` sampled negatives.

        Negatives are drawn without replacement when enough exist, otherwise
        with replacement.  An impression without negatives contributes only
        its positives.
        """
        tail = self._prompt_tail()
        for impid, behav in zip(imp_ids, behaviors):
            head = self._prompt_head(behav[0], news_dict, max_his, max_title_len, max_his_len)
            positives, negatives = behav[1], behav[2]
            self._append_samples(head, tail, positives, news_dict, max_candi_len, 1, impid)

            if len(negatives) == 0:
                # np.random.choice raises on an empty population.
                continue
            if len(negatives) >= K_samples:
                sample_negs = random.sample(negatives, k=K_samples)
            else:
                sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
            self._append_samples(head, tail, sample_negs, news_dict, max_candi_len, 0, impid)

    def prepro_dev(self, imp_ids, behaviors, news_dict,
                   max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build evaluation samples: every positive and every negative candidate.

        Impressions whose history is empty are skipped.
        """
        tail = self._prompt_tail()
        for impid, behav in zip(imp_ids, behaviors):
            if len(behav[0]) == 0:
                continue
            head = self._prompt_head(behav[0], news_dict, max_his, max_title_len, max_his_len)
            self._append_samples(head, tail, behav[1], news_dict, max_candi_len, 1, impid)
            self._append_samples(head, tail, behav[2], news_dict, max_candi_len, 0, impid)

    def load_data(self):
        """Unpickle ``self.data_path`` and fill ``self.data`` for the current split."""
        # NOTE(review): pickle executes arbitrary code on load; only use with
        # data files produced by this project's own preprocessing.
        with open(self.data_path, 'rb') as f:
            data = pickle.load(f)
        imps, users, times, behaviors = self.obtain_data(data)
        if self.status == 'train':
            self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                              max_his_len=self.args.max_his_len)
        else:
            self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                            max_his_len=self.args.max_his_len)

    def collate_fn(self, batch):
        """Tokenize a list of samples into ``(input_ids, attention_mask, target, imp_id)``."""
        sentences = [x['sentence'] for x in batch]
        target = [x['target'] for x in batch]
        imp_id = [x['imp'] for x in batch]

        # Every sentence is padded/truncated to exactly args.max_tokens ids.
        encode_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.args.max_tokens,
            truncation=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        batch_enc = encode_dict['input_ids']
        batch_attn = encode_dict['attention_mask']
        target = torch.LongTensor(target)

        return batch_enc, batch_attn, target, imp_id
167 |
168 |
169 |
170 |
171 |
172 |
--------------------------------------------------------------------------------
/Hybrid-Utility/prepro_data.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | import random
4 | import numpy as np
5 | from torch.utils.data import Dataset
6 | import pickle
7 | import os
8 | import torch
9 |
10 |
class MyDataset(Dataset):
    """Prompt-style click-prediction dataset built from pickled impression logs.

    Every element of ``self.data`` is a dict with keys ``'sentence'`` (the
    filled prompt), ``'target'`` (1 for a positive candidate, 0 for a negative
    one) and ``'imp'`` (the impression id the candidate belongs to).

    The prompt has the shape
    ``<conti_tokens[0]><history>[SEP]<conti_tokens[1]><candidate>[SEP]Recommending the news to the user is a [MASK] choice``.
    """

    def __init__(self, args, tokenizer, news_dict, conti_tokens, status='train'):
        """Store the configuration and immediately load the requested split.

        Args:
            args: namespace providing ``data_path``, ``max_his``,
                ``max_his_len``, ``max_tokens`` and (for training) ``num_negs``.
            tokenizer: tokenizer offering ``encode``, ``decode`` and
                ``batch_encode_plus``.
            news_dict: maps a news id to a dict holding at least ``'title'``.
            conti_tokens: two sequences of token strings that are joined into
                the prompt prefixes.
            status: ``'train'``, ``'val'`` or anything else for the test split.
        """
        self.tokenizer = tokenizer
        self.news_dict = news_dict
        self.args = args
        self.status = status
        self.conti_tokens = conti_tokens

        self.data = []
        self.imp_lens = []
        # The split files are pickles despite their .txt suffix (see load_data).
        if self.status == 'train':
            self.data_path = os.path.join(args.data_path, 'train.txt')
        elif self.status == 'val':
            self.data_path = os.path.join(args.data_path, 'val.txt')
        else:
            self.data_path = os.path.join(args.data_path, 'test.txt')
        self.load_data()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return self.data[item]

    def obtain_data(self, data):
        """Return the first four fields of the unpickled split.

        ``load_data`` unpacks them as ``(imps, users, times, behaviors)``.
        """
        # NOTE: to debug on a small subset, slice each field here (e.g. [:20]).
        return data[0], data[1], data[2], data[3]

    @staticmethod
    def _clean_title(title, max_words):
        """Drop everything but letters, digits and spaces; keep ``max_words`` words."""
        title = re.sub(r'[^A-Za-z0-9 ]+', '', title)
        return ' '.join(title.split(' ')[:max_words])

    def _prompt_head(self, clicks, news_dict, max_his, max_title_len, max_his_len):
        """Build the prompt up to (excluding) the candidate title."""
        # Most recent clicks first; the slice copies, so ``clicks`` is untouched.
        recent = clicks[-max_his:][::-1]
        titles = [self._clean_title(news_dict[news]['title'], max_title_len) for news in recent]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(titles)
        # Round-trip through the tokenizer to cap the history at max_his_len ids.
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        his_sen = self.tokenizer.decode(his_sen_ids)
        return ''.join(self.conti_tokens[0]) + his_sen + "[SEP]" + ''.join(self.conti_tokens[1])

    def _prompt_tail(self):
        """Build the prompt part that follows the candidate title."""
        return "[SEP]" + "Recommending the news to the user is a [MASK] choice"

    def _append_samples(self, head, tail, news_ids, news_dict, max_candi_len, target, impid):
        """Append one sample per id in ``news_ids`` to ``self.data``."""
        # Direct concatenation on purpose: the previous template used
        # empty-string placeholders, and str.replace("", x) inserts x
        # between every character of the template.
        for news in news_ids:
            title = self._clean_title(news_dict[news]['title'], max_candi_len)
            self.data.append({'sentence': head + title + tail, 'target': target, 'imp': impid})

    def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                     max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build training samples: all positives plus ``K_samples`` sampled negatives.

        Negatives are drawn without replacement when enough exist, otherwise
        with replacement.  An impression without negatives contributes only
        its positives.
        """
        tail = self._prompt_tail()
        for impid, behav in zip(imp_ids, behaviors):
            head = self._prompt_head(behav[0], news_dict, max_his, max_title_len, max_his_len)
            positives, negatives = behav[1], behav[2]
            self._append_samples(head, tail, positives, news_dict, max_candi_len, 1, impid)

            if len(negatives) == 0:
                # np.random.choice raises on an empty population.
                continue
            if len(negatives) >= K_samples:
                sample_negs = random.sample(negatives, k=K_samples)
            else:
                sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
            self._append_samples(head, tail, sample_negs, news_dict, max_candi_len, 0, impid)

    def prepro_dev(self, imp_ids, behaviors, news_dict,
                   max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build evaluation samples: every positive and every negative candidate.

        Impressions whose history is empty are skipped.
        """
        tail = self._prompt_tail()
        for impid, behav in zip(imp_ids, behaviors):
            if len(behav[0]) == 0:
                continue
            head = self._prompt_head(behav[0], news_dict, max_his, max_title_len, max_his_len)
            self._append_samples(head, tail, behav[1], news_dict, max_candi_len, 1, impid)
            self._append_samples(head, tail, behav[2], news_dict, max_candi_len, 0, impid)

    def load_data(self):
        """Unpickle ``self.data_path`` and fill ``self.data`` for the current split."""
        # NOTE(review): pickle executes arbitrary code on load; only use with
        # data files produced by this project's own preprocessing.
        with open(self.data_path, 'rb') as f:
            data = pickle.load(f)
        imps, users, times, behaviors = self.obtain_data(data)
        if self.status == 'train':
            self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                              max_his_len=self.args.max_his_len)
        else:
            self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                            max_his_len=self.args.max_his_len)

    def collate_fn(self, batch):
        """Tokenize a list of samples into ``(input_ids, attention_mask, target, imp_id)``."""
        sentences = [x['sentence'] for x in batch]
        target = [x['target'] for x in batch]
        imp_id = [x['imp'] for x in batch]

        # Every sentence is padded/truncated to exactly args.max_tokens ids.
        encode_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.args.max_tokens,
            truncation=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        batch_enc = encode_dict['input_ids']
        batch_attn = encode_dict['attention_mask']
        target = torch.LongTensor(target)

        return batch_enc, batch_attn, target, imp_id
167 |
168 |
169 |
170 |
171 |
172 |
--------------------------------------------------------------------------------
/Hybrid-Relevance/prepro_data.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | import random
4 | import numpy as np
5 | from torch.utils.data import Dataset
6 | import pickle
7 | import os
8 | import torch
9 |
10 |
class MyDataset(Dataset):
    """Prompt-style click-prediction dataset built from pickled impression logs.

    Every element of ``self.data`` is a dict with keys ``'sentence'`` (the
    filled prompt), ``'target'`` (1 for a positive candidate, 0 for a negative
    one) and ``'imp'`` (the impression id the candidate belongs to).

    The prompt has the shape
    ``<conti_tokens[0]><history>[SEP]<conti_tokens[1]><candidate>[SEP]This news is [MASK] to the user's area of interest``.
    """

    def __init__(self, args, tokenizer, news_dict, conti_tokens, status='train'):
        """Store the configuration and immediately load the requested split.

        Args:
            args: namespace providing ``data_path``, ``max_his``,
                ``max_his_len``, ``max_tokens`` and (for training) ``num_negs``.
            tokenizer: tokenizer offering ``encode``, ``decode`` and
                ``batch_encode_plus``.
            news_dict: maps a news id to a dict holding at least ``'title'``.
            conti_tokens: two sequences of token strings that are joined into
                the prompt prefixes.
            status: ``'train'``, ``'val'`` or anything else for the test split.
        """
        self.tokenizer = tokenizer
        self.news_dict = news_dict
        self.args = args
        self.status = status
        self.conti_tokens = conti_tokens

        self.data = []
        self.imp_lens = []
        # The split files are pickles despite their .txt suffix (see load_data).
        if self.status == 'train':
            self.data_path = os.path.join(args.data_path, 'train.txt')
        elif self.status == 'val':
            self.data_path = os.path.join(args.data_path, 'val.txt')
        else:
            self.data_path = os.path.join(args.data_path, 'test.txt')
        self.load_data()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return self.data[item]

    def obtain_data(self, data):
        """Return the first four fields of the unpickled split.

        ``load_data`` unpacks them as ``(imps, users, times, behaviors)``.
        """
        # NOTE: to debug on a small subset, slice each field here (e.g. [:20]).
        return data[0], data[1], data[2], data[3]

    @staticmethod
    def _clean_title(title, max_words):
        """Drop everything but letters, digits and spaces; keep ``max_words`` words."""
        title = re.sub(r'[^A-Za-z0-9 ]+', '', title)
        return ' '.join(title.split(' ')[:max_words])

    def _prompt_head(self, clicks, news_dict, max_his, max_title_len, max_his_len):
        """Build the prompt up to (excluding) the candidate title."""
        # Most recent clicks first; the slice copies, so ``clicks`` is untouched.
        recent = clicks[-max_his:][::-1]
        titles = [self._clean_title(news_dict[news]['title'], max_title_len) for news in recent]
        his_sen = '[NSEP] ' + ' [NSEP] '.join(titles)
        # Round-trip through the tokenizer to cap the history at max_his_len ids.
        his_sen_ids = self.tokenizer.encode(his_sen, add_special_tokens=False)[:max_his_len]
        his_sen = self.tokenizer.decode(his_sen_ids)
        return ''.join(self.conti_tokens[0]) + his_sen + "[SEP]" + ''.join(self.conti_tokens[1])

    def _prompt_tail(self):
        """Build the prompt part that follows the candidate title."""
        return "[SEP]" + "This news is [MASK] to the user's area of interest"

    def _append_samples(self, head, tail, news_ids, news_dict, max_candi_len, target, impid):
        """Append one sample per id in ``news_ids`` to ``self.data``."""
        # Direct concatenation on purpose: the previous template used
        # empty-string placeholders, and str.replace("", x) inserts x
        # between every character of the template.
        for news in news_ids:
            title = self._clean_title(news_dict[news]['title'], max_candi_len)
            self.data.append({'sentence': head + title + tail, 'target': target, 'imp': impid})

    def prepro_train(self, imp_ids, behaviors, news_dict, K_samples,
                     max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build training samples: all positives plus ``K_samples`` sampled negatives.

        Negatives are drawn without replacement when enough exist, otherwise
        with replacement.  An impression without negatives contributes only
        its positives.
        """
        tail = self._prompt_tail()
        for impid, behav in zip(imp_ids, behaviors):
            head = self._prompt_head(behav[0], news_dict, max_his, max_title_len, max_his_len)
            positives, negatives = behav[1], behav[2]
            self._append_samples(head, tail, positives, news_dict, max_candi_len, 1, impid)

            if len(negatives) == 0:
                # np.random.choice raises on an empty population.
                continue
            if len(negatives) >= K_samples:
                sample_negs = random.sample(negatives, k=K_samples)
            else:
                sample_negs = np.random.choice(negatives, K_samples, replace=True).tolist()
            self._append_samples(head, tail, sample_negs, news_dict, max_candi_len, 0, impid)

    def prepro_dev(self, imp_ids, behaviors, news_dict,
                   max_his=50, max_title_len=10, max_candi_len=20, max_his_len=450):
        """Build evaluation samples: every positive and every negative candidate.

        Impressions whose history is empty are skipped.
        """
        tail = self._prompt_tail()
        for impid, behav in zip(imp_ids, behaviors):
            if len(behav[0]) == 0:
                continue
            head = self._prompt_head(behav[0], news_dict, max_his, max_title_len, max_his_len)
            self._append_samples(head, tail, behav[1], news_dict, max_candi_len, 1, impid)
            self._append_samples(head, tail, behav[2], news_dict, max_candi_len, 0, impid)

    def load_data(self):
        """Unpickle ``self.data_path`` and fill ``self.data`` for the current split."""
        # NOTE(review): pickle executes arbitrary code on load; only use with
        # data files produced by this project's own preprocessing.
        with open(self.data_path, 'rb') as f:
            data = pickle.load(f)
        imps, users, times, behaviors = self.obtain_data(data)
        if self.status == 'train':
            self.prepro_train(imps, behaviors, self.news_dict, self.args.num_negs, self.args.max_his,
                              max_his_len=self.args.max_his_len)
        else:
            self.prepro_dev(imps, behaviors, self.news_dict, self.args.max_his,
                            max_his_len=self.args.max_his_len)

    def collate_fn(self, batch):
        """Tokenize a list of samples into ``(input_ids, attention_mask, target, imp_id)``."""
        sentences = [x['sentence'] for x in batch]
        target = [x['target'] for x in batch]
        imp_id = [x['imp'] for x in batch]

        # Every sentence is padded/truncated to exactly args.max_tokens ids.
        encode_dict = self.tokenizer.batch_encode_plus(
            sentences,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.args.max_tokens,
            truncation=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        batch_enc = encode_dict['input_ids']
        batch_attn = encode_dict['attention_mask']
        target = torch.LongTensor(target)

        return batch_enc, batch_attn, target, imp_id
171 |
172 |
173 |
174 |
175 |
176 |
--------------------------------------------------------------------------------
/Discrete-Action/predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import pickle
4 | import time
5 | import sys
6 |
7 | from tqdm import tqdm
8 | from datetime import datetime
9 | import torch.cuda
10 | from torch.utils.data import DataLoader
11 |
12 | from transformers import BertTokenizer
13 | from transformers import AdamW
14 |
15 | import torch.distributed as dist
16 | import torch.multiprocessing as mp
17 | from torch.nn.parallel import DistributedDataParallel as DDP
18 | from torch.utils.data.distributed import DistributedSampler
19 |
20 | from model import BERTPrompt4NR
21 | from prepro_data import *
22 | from utils import evaluate
23 |
24 |
def setup(rank, world_size):
    """Join the NCCL process group as ``rank`` out of ``world_size`` processes.

    The rendezvous address is fixed to ``localhost:23342`` through the
    ``MASTER_ADDR`` / ``MASTER_PORT`` environment variables.
    """
    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': '23342'})
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
29 |
30 |
def cleanup():
    """Tear down the process group created by ``dist.init_process_group``."""
    dist.destroy_process_group()
33 |
34 |
def init_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and CUDA) with ``seed``."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
41 |
42 |
class Logger(object):
    """Tee-style stream: everything written goes to ``stream`` and to a log file.

    Intended as a drop-in replacement for ``sys.stdout``.
    """

    def __init__(self, filename, stream=sys.stdout):
        """Open ``filename`` for writing (truncating it) and remember ``stream``."""
        self.terminal = stream
        self.log = open(filename, 'w')

    def write(self, message):
        """Write ``message`` to both the wrapped stream and the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both sinks.

        Previously a no-op, so explicit flushes never pushed buffered text
        into the log file.
        """
        self.terminal.flush()
        self.log.flush()
54 |
55 |
def load_model(model_name, args):
    """Create the tokenizer and the prompt model for ``model_name``.

    The tokenizer is extended with the ``[NSEP]`` token and the resulting
    vocabulary size is stored on ``args.vocab_size``.  The ids of the answer
    words ``'no'`` and ``'yes'`` (in that order) are handed to the model.

    Returns:
        ``(net, tokenizer)``.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    tokenizer.add_tokens(['[NSEP]'])
    args.vocab_size = len(tokenizer)

    # Index 0 -> 'no', index 1 -> 'yes'.
    answer_ids = tokenizer.encode(['no', 'yes'], add_special_tokens=False)

    return BERTPrompt4NR(model_name, answer_ids, args), tokenizer
69 |
70 |
def eval(model, rank, world_size, data_loader):
    """Score every batch of ``data_loader`` and gather the results from all ranks.

    Args:
        model: callable returning ``(loss, scores)`` for
            ``(input_ids, attention_mask, labels)``.
        rank: device (and process rank) the tensors are moved to.
        world_size: number of processes taking part in the gather.
        data_loader: yields ``(input_ids, attention_mask, labels, imp_ids)``.

    Returns:
        ``(scores_list, acc, acc_pos, pos_ratio, impids_list, labels_list)``.
        The three lists hold one tensor per rank; the three scalars are
        aggregated over all ranks: overall accuracy, accuracy on
        positive-labelled samples, and the share of positive samples.
    """
    model.eval()
    data_loader = tqdm(data_loader)
    val_scores = []
    acc_cnt = torch.zeros(2).to(rank)      # [num correct, num seen]
    acc_cnt_pos = torch.zeros(2).to(rank)  # same, restricted to label == 1
    imp_ids = []
    labels = []
    for batch_enc, batch_attn, batch_labs, batch_imp in data_loader:
        # extend() instead of `a = a + b`, which copied the whole list per batch.
        imp_ids.extend(batch_imp)
        labels.extend(batch_labs.tolist())

        batch_enc = batch_enc.to(rank)
        batch_attn = batch_attn.to(rank)
        batch_labs = batch_labs.to(rank)

        _, scores = model(batch_enc, batch_attn, batch_labs)
        scores = scores.detach()

        # Column 1 is used as the ranking score (presumably the "yes" answer
        # -- confirm against the model's answer ordering).
        val_scores.append(scores[:, 1])

        predict = torch.argmax(scores, dim=1)
        acc_cnt[0] += (predict == batch_labs).sum()
        acc_cnt[1] += predict.size(0)

        positive_idx = torch.where(batch_labs == 1)[0]
        acc_cnt_pos[0] += (predict[positive_idx] == batch_labs[positive_idx]).sum()
        acc_cnt_pos[1] += positive_idx.size(0)

    dist.all_reduce(acc_cnt, op=dist.ReduceOp.SUM)
    dist.all_reduce(acc_cnt_pos, op=dist.ReduceOp.SUM)

    acc = acc_cnt[0] / acc_cnt[1]
    acc_pos = acc_cnt_pos[0] / acc_cnt_pos[1]
    pos_ratio = acc_cnt_pos[1] / acc_cnt[1]

    val_scores = torch.cat(val_scores, dim=0)
    val_impids = torch.IntTensor(imp_ids).to(rank)
    val_labels = torch.IntTensor(labels).to(rank)

    # Receive buffers are shaped like the local tensors.
    # NOTE(review): this assumes every rank holds the same number of samples
    # -- confirm the sampler guarantees that.
    val_scores_list = [torch.zeros_like(val_scores).to(rank) for _ in range(world_size)]
    val_impids_list = [torch.zeros_like(val_impids).to(rank) for _ in range(world_size)]
    val_labels_list = [torch.zeros_like(val_labels).to(rank) for _ in range(world_size)]

    dist.all_gather(val_scores_list, val_scores)
    dist.all_gather(val_impids_list, val_impids)
    dist.all_gather(val_labels_list, val_labels)

    return val_scores_list, acc.item(), acc_pos.item(), pos_ratio.item(), val_impids_list, val_labels_list
123 |
124 |
def ddp_main(rank, world_size, args):
    """Per-process entry point: evaluate a saved model on the test split.

    Each process joins the process group, builds the model and test loader,
    loads the weights from ``args.model_file``, scores its shard and, after
    gathering all shards, computes AUC / MRR / nDCG@5 / nDCG@10 per
    impression.  Only rank 0 prints the configuration and the final metrics.

    Args:
        rank: index of this process; also used as the CUDA device index.
        world_size: total number of processes.
        args: parsed command-line namespace; ``rank``, ``world_size`` and
            ``vocab_size`` are written onto it.
    """
    args.rank = rank
    args.world_size = world_size
    init_seed(rank + 1)
    if rank == 0 and args.log:
        # Mirror everything rank 0 prints into the log file.
        sys.stdout = Logger(args.log_file, sys.stdout)
    setup(rank, world_size)

    print('| distributed init rank {}'.format(rank))
    dist.barrier()

    # load model
    net, tokenizer = load_model(args.model_name, args)

    # load data
    # NOTE(review): pickle executes arbitrary code on load; only use with
    # data files produced by this project's own preprocessing.
    with open(os.path.join(args.data_path, 'news.txt'), 'rb') as f:
        news_dict = pickle.load(f)
    test_dataset = MyDataset(args, tokenizer, news_dict, status='test')

    if rank == 0:
        print(args)
        print('Vocabulary size of tokenizer after adding new tokens : %d' % args.vocab_size)
        print(test_dataset[0]['sentence'])
        print('num test: %d' % len(test_dataset))

    test_sampler = DistributedSampler(test_dataset,
                                      rank=rank,
                                      num_replicas=world_size)
    nw = 2
    test_kwargs = {'batch_size': args.test_batch_size, 'sampler': test_sampler,
                   'shuffle': False, 'pin_memory': False,
                   'num_workers': nw, 'collate_fn': test_dataset.collate_fn}

    test_loader = DataLoader(test_dataset, **test_kwargs)

    net = net.to(rank)
    net = DDP(net, device_ids=[rank])

    dist.barrier()

    # Remap tensors saved on cuda:0 onto this process's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    net.module.load_state_dict(torch.load(args.model_file, map_location=map_location))

    with torch.no_grad():
        st_test = time.time()
        test_scores, acc_test, acc_pos_test, pos_ratio_test, test_impids, test_labels = \
            eval(net, rank, world_size, test_loader)

        # Regroup the gathered samples by impression id:
        # {imp: {'score': [...], 'lab': [...]}}
        impressions = {}
        for i in range(world_size):
            scores, imp_id, labs = test_scores[i], test_impids[i], test_labels[i]
            assert scores.size() == imp_id.size() == labs.size()
            scores = scores.cpu().numpy().tolist()
            imp_id = imp_id.cpu().numpy().tolist()
            labs = labs.cpu().numpy().tolist()
            for sco, imp, lab in zip(scores, imp_id, labs):
                entry = impressions.setdefault(imp, {'score': [], 'lab': []})
                entry['score'].append(sco)
                entry['lab'].append(lab)

        # For each impression: labels ordered by descending score, paired
        # with the rank positions 1..n.
        predicts, truths = [], []
        for imp in impressions:
            sims, labs = impressions[imp]['score'], impressions[imp]['lab']
            sl_zip = sorted(zip(sims, labs), key=lambda x: x[0], reverse=True)
            sort_sims, sort_labs = zip(*sl_zip)
            predicts.append(list(range(1, len(sort_labs) + 1, 1)))
            truths.append(sort_labs)

        auc_test, mrr_test, ndcg5_test, ndcg10_test = evaluate(predicts, truths)
        end_test = time.time()
        test_spend = (end_test - st_test) / 60

        if rank == 0:
            print("Test: AUC: %0.4f\tMRR: %0.4f\tnDCG@5: %0.4f\tnDCG@10: %0.4f\t[Test-Time: %0.2f mim]" %
                  (auc_test, mrr_test, ndcg5_test, ndcg10_test, test_spend))
    cleanup()
204 |
205 |
if __name__ == '__main__':
    def _str2bool(value):
        """Parse a command-line boolean.

        argparse's ``type=bool`` calls ``bool()`` on the raw string, so any
        non-empty value (including 'False') becomes True. This converter
        interprets the usual spellings explicitly.
        """
        if isinstance(value, bool):
            return value
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))

    t0 = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='../DATA/MIND-Demo', type=str, help='Path')
    parser.add_argument('--model_name', default='bert-base-uncased', type=str)

    parser.add_argument('--test_batch_size', default=15, type=int, help='test batch_size')
    parser.add_argument('--max_his', default=50, type=int, help='max number of history')
    parser.add_argument('--max_tokens', default=500, type=int, help='max number of tokens')

    parser.add_argument('--max_his_len', default=450, type=int, help='max number of history')

    parser.add_argument('--device', default='cuda', help='device id')
    parser.add_argument('--world_size', default=2, type=int, help='number of distributed processes')

    parser.add_argument('--model_file', default='', type=str, help='model file')
    parser.add_argument('--log', default=True, type=_str2bool, help='whether write log file')

    args = parser.parse_args()

    if args.log:
        # The demo dataset logs to ./log-Test, every other data path to ./log-Test-Small.
        log_dir = './log-Test' if args.data_path == '../DATA/MIND-Demo' else './log-Test-Small'
        os.makedirs(log_dir, exist_ok=True)
        # The file name suffix is the last five characters of the current timestamp.
        args.log_file = log_dir + '/' + 'Tbs' + str(args.test_batch_size) + '-' + str(datetime.now())[-5:] + '.txt'

    # One process per visible CUDA device; the --world_size argument is not consulted here.
    WORLD_SIZE = torch.cuda.device_count()
    mp.spawn(ddp_main,
             args=(WORLD_SIZE, args),
             nprocs=WORLD_SIZE,
             join=True)
    t1 = time.time()
    run_time = (t1 - t0) / 3600  # elapsed wall-clock time in hours
    print('Running time: %0.4f' % run_time)
--------------------------------------------------------------------------------
/Discrete-Emotion/predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import pickle
4 | import time
5 | import sys
6 |
7 | from tqdm import tqdm
8 | from datetime import datetime
9 | import torch.cuda
10 | from torch.utils.data import DataLoader
11 |
12 | from transformers import BertTokenizer
13 | from transformers import AdamW
14 |
15 | import torch.distributed as dist
16 | import torch.multiprocessing as mp
17 | from torch.nn.parallel import DistributedDataParallel as DDP
18 | from torch.utils.data.distributed import DistributedSampler
19 |
20 | from model import BERTPrompt4NR
21 | from prepro_data import *
22 | from utils import evaluate
23 |
24 |
def setup(rank, world_size):
    """Set the rendezvous address/port and join the NCCL process group.

    Args:
        rank: Rank of the calling process.
        world_size: Total number of processes in the group.
    """
    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': '23342'})
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
29 |
30 |
def cleanup():
    """Tear down the distributed process group via ``dist.destroy_process_group()``."""
    dist.destroy_process_group()
33 |
34 |
def init_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and CUDA) with ``seed``."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    # Applied in the same order as the individual calls they replace.
    for apply_seed in seeders:
        apply_seed(seed)
41 |
42 |
class Logger(object):
    """Tee-style writer that mirrors every message to a stream and a log file.

    Intended as a drop-in replacement for ``sys.stdout``: it provides the
    ``write``/``flush`` pair that ``print`` relies on.
    """

    def __init__(self, filename, stream=sys.stdout):
        """Open ``filename`` for writing (truncating it) and remember ``stream``."""
        self.terminal = stream
        self.log = open(filename, 'w')

    def write(self, message):
        """Write ``message`` to both the wrapped stream and the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both sinks so buffered output reaches the terminal and the file."""
        self.terminal.flush()
        self.log.flush()
54 |
55 |
def load_model(model_name, args):
    """Build the tokenizer and the prompt model for ``model_name``.

    Adds the '[NSEP]' token to the tokenizer and stores the resulting
    vocabulary size on ``args.vocab_size`` (side effect on ``args``).

    Returns:
        Tuple ``(net, tokenizer)``.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    tokenizer.add_tokens(['[NSEP]'])
    args.vocab_size = len(tokenizer)

    # Ids of the two answer words, in the order [index 0, index 1].
    answer_words = ['boring', 'interesting']
    answer_ids = tokenizer.encode(answer_words, add_special_tokens=False)

    return BERTPrompt4NR(model_name, answer_ids, args), tokenizer
69 |
70 |
def eval(model, rank, world_size, data_loader):
    """Score every batch of ``data_loader`` and gather the results from all ranks.

    NOTE: the name shadows the builtin ``eval``; it is kept because callers
    use it.

    Args:
        model: Module called as ``model(batch_enc, batch_attn, batch_labs)``
            and returning ``(loss, scores)``.
        rank: Rank of this process; tensors are moved with ``.to(rank)``.
        world_size: Number of processes taking part in the collectives.
        data_loader: Iterable yielding ``(batch_enc, batch_attn, batch_labs,
            batch_imp)`` tuples.

    Returns:
        Tuple ``(scores_list, acc, acc_pos, pos_ratio, impids_list,
        labels_list)``. Each list holds one gathered tensor per rank; the
        three ratios are Python floats computed from counts summed over ranks.
    """
    model.eval()
    data_loader = tqdm(data_loader)
    val_scores = []
    # [number correct, number seen] over all samples / over positive samples.
    acc_cnt = torch.zeros(2).to(rank)
    acc_cnt_pos = torch.zeros(2).to(rank)
    imp_ids = []
    labels = []
    for batch_enc, batch_attn, batch_labs, batch_imp in data_loader:
        # Extend in place: rebuilding the lists with ``+`` on every batch
        # copies all previous elements and is quadratic in the number of samples.
        imp_ids.extend(batch_imp)
        labels.extend(batch_labs.cpu().numpy().tolist())

        batch_enc = batch_enc.to(rank)
        batch_attn = batch_attn.to(rank)
        batch_labs = batch_labs.to(rank)

        _, scores = model(batch_enc, batch_attn, batch_labs)

        # Column 1 is used as the ranking score.
        # presumably the score of the positive answer word; verify against the model
        ranking_scores = scores[:, 1].detach()
        val_scores.append(ranking_scores)

        predict = torch.argmax(scores.detach(), dim=1)
        num_correct = (predict == batch_labs).sum()
        acc_cnt[0] += num_correct
        acc_cnt[1] += predict.size(0)

        positive_idx = torch.where(batch_labs == 1)[0]
        num_correct_pos = (predict[positive_idx] == batch_labs[positive_idx]).sum()
        acc_cnt_pos[0] += num_correct_pos
        acc_cnt_pos[1] += positive_idx.size(0)

    # Sum the counters over every rank before forming the ratios.
    dist.all_reduce(acc_cnt, op=dist.ReduceOp.SUM)
    dist.all_reduce(acc_cnt_pos, op=dist.ReduceOp.SUM)

    acc = acc_cnt[0] / acc_cnt[1]
    acc_pos = acc_cnt_pos[0] / acc_cnt_pos[1]
    pos_ratio = acc_cnt_pos[1] / acc_cnt[1]

    val_scores = torch.cat(val_scores, dim=0)
    val_impids = torch.IntTensor(imp_ids).to(rank)
    val_labels = torch.IntTensor(labels).to(rank)

    # Receive buffers are shaped like the local tensors.
    # NOTE(review): assumes every rank holds the same number of samples - confirm.
    val_scores_list = [torch.zeros_like(val_scores).to(rank) for _ in range(world_size)]
    val_impids_list = [torch.zeros_like(val_impids).to(rank) for _ in range(world_size)]
    val_labels_list = [torch.zeros_like(val_labels).to(rank) for _ in range(world_size)]

    dist.all_gather(val_scores_list, val_scores)
    dist.all_gather(val_impids_list, val_impids)
    dist.all_gather(val_labels_list, val_labels)

    return val_scores_list, acc.item(), acc_pos.item(), pos_ratio.item(), val_impids_list, val_labels_list
123 |
124 |
def ddp_main(rank, world_size, args):
    """Per-process entry point: load the model and test data, evaluate, report.

    Args:
        rank: Rank of this process (also used as the device for ``.to(rank)``).
        world_size: Total number of spawned processes.
        args: Parsed command-line namespace; ``rank``, ``world_size`` and
            (through ``load_model``) ``vocab_size`` are written onto it.
    """
    args.rank = rank
    args.world_size = world_size
    init_seed(rank + 1)
    if rank == 0 and args.log:
        # Only rank 0 mirrors its stdout into the log file.
        sys.stdout = Logger(args.log_file, sys.stdout)
    setup(rank, world_size)

    print('| distributed init rank {}'.format(rank))
    dist.barrier()

    # load model
    net, tokenizer = load_model(args.model_name, args)

    # load data
    # NOTE: pickle.load can execute arbitrary code; news.txt must come from a trusted source.
    with open(os.path.join(args.data_path, 'news.txt'), 'rb') as news_file:
        news_dict = pickle.load(news_file)
    test_dataset = MyDataset(args, tokenizer, news_dict, status='test')

    if rank == 0:
        print(args)
        print('Vocabulary size of tokenizer after adding new tokens : %d' % args.vocab_size)
        print(test_dataset[0]['sentence'])
        print('num test: %d' % len(test_dataset))

    # NOTE(review): DistributedSampler is built with its defaults here; check whether
    # its default shuffling/padding is intended for evaluation.
    test_sampler = DistributedSampler(test_dataset,
                                      rank=rank,
                                      num_replicas=world_size)
    nw = 2
    test_kwargs = {'batch_size': args.test_batch_size, 'sampler': test_sampler,
                   'shuffle': False, 'pin_memory': False,
                   'num_workers': nw, 'collate_fn': test_dataset.collate_fn}

    test_loader = DataLoader(test_dataset, **test_kwargs)

    net = net.to(rank)
    net = DDP(net, device_ids=[rank])

    dist.barrier()

    # Remap tensors saved on cuda:0 onto this process's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    net.module.load_state_dict(torch.load(args.model_file, map_location=map_location))

    with torch.no_grad():
        st_test = time.time()
        test_scores, _, _, _, test_impids, test_labels = \
            eval(net, rank, world_size, test_loader)

        # Group the gathered scores/labels by impression id:
        # {imp_id: {'score': [...], 'lab': [...]}}
        impressions = {}
        for i in range(world_size):
            scores, imp_id, labs = test_scores[i], test_impids[i], test_labels[i]
            assert scores.size() == imp_id.size() == labs.size()
            for sco, imp, lab in zip(scores.cpu().numpy().tolist(),
                                     imp_id.cpu().numpy().tolist(),
                                     labs.cpu().numpy().tolist()):
                entry = impressions.get(imp)
                if entry is None:
                    entry = impressions[imp] = {'score': [], 'lab': []}
                entry['score'].append(sco)
                entry['lab'].append(lab)

        # Within each impression, order labels by descending score; the
        # prediction is then simply the rank positions 1..n.
        predicts, truths = [], []
        for entry in impressions.values():
            ranked = sorted(zip(entry['score'], entry['lab']), key=lambda pair: pair[0], reverse=True)
            sort_labs = tuple(lab for _, lab in ranked)
            predicts.append(list(range(1, len(sort_labs) + 1)))
            truths.append(sort_labs)

        auc_test, mrr_test, ndcg5_test, ndcg10_test = evaluate(predicts, truths)
        end_test = time.time()
        test_spend = (end_test - st_test) / 60

        if rank == 0:
            print("Test: AUC: %0.4f\tMRR: %0.4f\tnDCG@5: %0.4f\tnDCG@10: %0.4f\t[Test-Time: %0.2f mim]" %
                  (auc_test, mrr_test, ndcg5_test, ndcg10_test, test_spend))
    cleanup()
204 |
205 |
if __name__ == '__main__':
    def _str2bool(value):
        """Parse a command-line boolean.

        argparse's ``type=bool`` calls ``bool()`` on the raw string, so any
        non-empty value (including 'False') becomes True. This converter
        interprets the usual spellings explicitly.
        """
        if isinstance(value, bool):
            return value
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))

    t0 = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='../DATA/MIND-Small', type=str, help='Path')
    parser.add_argument('--model_name', default='bert-base-uncased', type=str)

    parser.add_argument('--test_batch_size', default=15, type=int, help='test batch_size')
    parser.add_argument('--max_his', default=50, type=int, help='max number of history')
    parser.add_argument('--max_tokens', default=500, type=int, help='max number of tokens')

    parser.add_argument('--max_his_len', default=450, type=int, help='max number of history')

    parser.add_argument('--device', default='cuda', help='device id')
    parser.add_argument('--world_size', default=2, type=int, help='number of distributed processes')

    parser.add_argument('--model_file', default='', type=str, help='model file')
    parser.add_argument('--log', default=True, type=_str2bool, help='whether write log file')

    args = parser.parse_args()

    if args.log:
        # The demo dataset logs to ./log-Test, every other data path to ./log-Test-Small.
        log_dir = './log-Test' if args.data_path == '../DATA/MIND-Demo' else './log-Test-Small'
        os.makedirs(log_dir, exist_ok=True)
        # The file name suffix is the last five characters of the current timestamp.
        args.log_file = log_dir + '/' + 'Tbs' + str(args.test_batch_size) + '-' + str(datetime.now())[-5:] + '.txt'

    # One process per visible CUDA device; the --world_size argument is not consulted here.
    WORLD_SIZE = torch.cuda.device_count()
    mp.spawn(ddp_main,
             args=(WORLD_SIZE, args),
             nprocs=WORLD_SIZE,
             join=True)
    t1 = time.time()
    run_time = (t1 - t0) / 3600  # elapsed wall-clock time in hours
    print('Running time: %0.4f' % run_time)
--------------------------------------------------------------------------------
/Discrete-Relevance/predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import pickle
4 | import time
5 | import sys
6 |
7 | from tqdm import tqdm
8 | from datetime import datetime
9 | import torch.cuda
10 | from torch.utils.data import DataLoader
11 |
12 | from transformers import BertTokenizer
13 | from transformers import AdamW
14 |
15 | import torch.distributed as dist
16 | import torch.multiprocessing as mp
17 | from torch.nn.parallel import DistributedDataParallel as DDP
18 | from torch.utils.data.distributed import DistributedSampler
19 |
20 | from model import BERTPrompt4NR
21 | from prepro_data import *
22 | from utils import evaluate
23 |
24 |
def setup(rank, world_size):
    """Set the rendezvous address/port and join the NCCL process group.

    Args:
        rank: Rank of the calling process.
        world_size: Total number of processes in the group.
    """
    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': '23342'})
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
29 |
30 |
def cleanup():
    """Tear down the distributed process group via ``dist.destroy_process_group()``."""
    dist.destroy_process_group()
33 |
34 |
def init_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and CUDA) with ``seed``."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    # Applied in the same order as the individual calls they replace.
    for apply_seed in seeders:
        apply_seed(seed)
41 |
42 |
class Logger(object):
    """Tee-style writer that mirrors every message to a stream and a log file.

    Intended as a drop-in replacement for ``sys.stdout``: it provides the
    ``write``/``flush`` pair that ``print`` relies on.
    """

    def __init__(self, filename, stream=sys.stdout):
        """Open ``filename`` for writing (truncating it) and remember ``stream``."""
        self.terminal = stream
        self.log = open(filename, 'w')

    def write(self, message):
        """Write ``message`` to both the wrapped stream and the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both sinks so buffered output reaches the terminal and the file."""
        self.terminal.flush()
        self.log.flush()
54 |
55 |
def load_model(model_name, args):
    """Build the tokenizer and the prompt model for ``model_name``.

    Adds the '[NSEP]' token to the tokenizer and stores the resulting
    vocabulary size on ``args.vocab_size`` (side effect on ``args``).

    Returns:
        Tuple ``(net, tokenizer)``.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    tokenizer.add_tokens(['[NSEP]'])
    args.vocab_size = len(tokenizer)

    # Ids of the two answer words, in the order [index 0, index 1].
    answer_words = ['unrelated', 'related']
    answer_ids = tokenizer.encode(answer_words, add_special_tokens=False)

    return BERTPrompt4NR(model_name, answer_ids, args), tokenizer
69 |
70 |
def eval(model, rank, world_size, data_loader):
    """Score every batch of ``data_loader`` and gather the results from all ranks.

    NOTE: the name shadows the builtin ``eval``; it is kept because callers
    use it.

    Args:
        model: Module called as ``model(batch_enc, batch_attn, batch_labs)``
            and returning ``(loss, scores)``.
        rank: Rank of this process; tensors are moved with ``.to(rank)``.
        world_size: Number of processes taking part in the collectives.
        data_loader: Iterable yielding ``(batch_enc, batch_attn, batch_labs,
            batch_imp)`` tuples.

    Returns:
        Tuple ``(scores_list, acc, acc_pos, pos_ratio, impids_list,
        labels_list)``. Each list holds one gathered tensor per rank; the
        three ratios are Python floats computed from counts summed over ranks.
    """
    model.eval()
    data_loader = tqdm(data_loader)
    val_scores = []
    # [number correct, number seen] over all samples / over positive samples.
    acc_cnt = torch.zeros(2).to(rank)
    acc_cnt_pos = torch.zeros(2).to(rank)
    imp_ids = []
    labels = []
    for batch_enc, batch_attn, batch_labs, batch_imp in data_loader:
        # Extend in place: rebuilding the lists with ``+`` on every batch
        # copies all previous elements and is quadratic in the number of samples.
        imp_ids.extend(batch_imp)
        labels.extend(batch_labs.cpu().numpy().tolist())

        batch_enc = batch_enc.to(rank)
        batch_attn = batch_attn.to(rank)
        batch_labs = batch_labs.to(rank)

        _, scores = model(batch_enc, batch_attn, batch_labs)

        # Column 1 is used as the ranking score.
        # presumably the score of the positive answer word; verify against the model
        ranking_scores = scores[:, 1].detach()
        val_scores.append(ranking_scores)

        predict = torch.argmax(scores.detach(), dim=1)
        num_correct = (predict == batch_labs).sum()
        acc_cnt[0] += num_correct
        acc_cnt[1] += predict.size(0)

        positive_idx = torch.where(batch_labs == 1)[0]
        num_correct_pos = (predict[positive_idx] == batch_labs[positive_idx]).sum()
        acc_cnt_pos[0] += num_correct_pos
        acc_cnt_pos[1] += positive_idx.size(0)

    # Sum the counters over every rank before forming the ratios.
    dist.all_reduce(acc_cnt, op=dist.ReduceOp.SUM)
    dist.all_reduce(acc_cnt_pos, op=dist.ReduceOp.SUM)

    acc = acc_cnt[0] / acc_cnt[1]
    acc_pos = acc_cnt_pos[0] / acc_cnt_pos[1]
    pos_ratio = acc_cnt_pos[1] / acc_cnt[1]

    val_scores = torch.cat(val_scores, dim=0)
    val_impids = torch.IntTensor(imp_ids).to(rank)
    val_labels = torch.IntTensor(labels).to(rank)

    # Receive buffers are shaped like the local tensors.
    # NOTE(review): assumes every rank holds the same number of samples - confirm.
    val_scores_list = [torch.zeros_like(val_scores).to(rank) for _ in range(world_size)]
    val_impids_list = [torch.zeros_like(val_impids).to(rank) for _ in range(world_size)]
    val_labels_list = [torch.zeros_like(val_labels).to(rank) for _ in range(world_size)]

    dist.all_gather(val_scores_list, val_scores)
    dist.all_gather(val_impids_list, val_impids)
    dist.all_gather(val_labels_list, val_labels)

    return val_scores_list, acc.item(), acc_pos.item(), pos_ratio.item(), val_impids_list, val_labels_list
123 |
124 |
def ddp_main(rank, world_size, args):
    """Per-process entry point: load the model and test data, evaluate, report.

    Args:
        rank: Rank of this process (also used as the device for ``.to(rank)``).
        world_size: Total number of spawned processes.
        args: Parsed command-line namespace; ``rank``, ``world_size`` and
            (through ``load_model``) ``vocab_size`` are written onto it.
    """
    args.rank = rank
    args.world_size = world_size
    init_seed(rank + 1)
    if rank == 0 and args.log:
        # Only rank 0 mirrors its stdout into the log file.
        sys.stdout = Logger(args.log_file, sys.stdout)
    setup(rank, world_size)

    print('| distributed init rank {}'.format(rank))
    dist.barrier()

    # load model
    net, tokenizer = load_model(args.model_name, args)

    # load data
    # NOTE: pickle.load can execute arbitrary code; news.txt must come from a trusted source.
    with open(os.path.join(args.data_path, 'news.txt'), 'rb') as news_file:
        news_dict = pickle.load(news_file)
    test_dataset = MyDataset(args, tokenizer, news_dict, status='test')

    if rank == 0:
        print(args)
        print('Vocabulary size of tokenizer after adding new tokens : %d' % args.vocab_size)
        print(test_dataset[0]['sentence'])
        print('num test: %d' % len(test_dataset))

    # NOTE(review): DistributedSampler is built with its defaults here; check whether
    # its default shuffling/padding is intended for evaluation.
    test_sampler = DistributedSampler(test_dataset,
                                      rank=rank,
                                      num_replicas=world_size)
    nw = 2
    test_kwargs = {'batch_size': args.test_batch_size, 'sampler': test_sampler,
                   'shuffle': False, 'pin_memory': False,
                   'num_workers': nw, 'collate_fn': test_dataset.collate_fn}

    test_loader = DataLoader(test_dataset, **test_kwargs)

    net = net.to(rank)
    net = DDP(net, device_ids=[rank])

    dist.barrier()

    # Remap tensors saved on cuda:0 onto this process's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    net.module.load_state_dict(torch.load(args.model_file, map_location=map_location))

    with torch.no_grad():
        st_test = time.time()
        test_scores, _, _, _, test_impids, test_labels = \
            eval(net, rank, world_size, test_loader)

        # Group the gathered scores/labels by impression id:
        # {imp_id: {'score': [...], 'lab': [...]}}
        impressions = {}
        for i in range(world_size):
            scores, imp_id, labs = test_scores[i], test_impids[i], test_labels[i]
            assert scores.size() == imp_id.size() == labs.size()
            for sco, imp, lab in zip(scores.cpu().numpy().tolist(),
                                     imp_id.cpu().numpy().tolist(),
                                     labs.cpu().numpy().tolist()):
                entry = impressions.get(imp)
                if entry is None:
                    entry = impressions[imp] = {'score': [], 'lab': []}
                entry['score'].append(sco)
                entry['lab'].append(lab)

        # Within each impression, order labels by descending score; the
        # prediction is then simply the rank positions 1..n.
        predicts, truths = [], []
        for entry in impressions.values():
            ranked = sorted(zip(entry['score'], entry['lab']), key=lambda pair: pair[0], reverse=True)
            sort_labs = tuple(lab for _, lab in ranked)
            predicts.append(list(range(1, len(sort_labs) + 1)))
            truths.append(sort_labs)

        auc_test, mrr_test, ndcg5_test, ndcg10_test = evaluate(predicts, truths)
        end_test = time.time()
        test_spend = (end_test - st_test) / 60

        if rank == 0:
            print("Test: AUC: %0.4f\tMRR: %0.4f\tnDCG@5: %0.4f\tnDCG@10: %0.4f\t[Test-Time: %0.2f mim]" %
                  (auc_test, mrr_test, ndcg5_test, ndcg10_test, test_spend))
    cleanup()
204 |
205 |
if __name__ == '__main__':
    def _str2bool(value):
        """Parse a command-line boolean.

        argparse's ``type=bool`` calls ``bool()`` on the raw string, so any
        non-empty value (including 'False') becomes True. This converter
        interprets the usual spellings explicitly.
        """
        if isinstance(value, bool):
            return value
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))

    t0 = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='../DATA/MIND-Small', type=str, help='Path')
    parser.add_argument('--model_name', default='bert-base-uncased', type=str)

    parser.add_argument('--test_batch_size', default=15, type=int, help='test batch_size')
    parser.add_argument('--max_his', default=50, type=int, help='max number of history')
    parser.add_argument('--max_tokens', default=500, type=int, help='max number of tokens')

    parser.add_argument('--max_his_len', default=450, type=int, help='max number of history')

    parser.add_argument('--device', default='cuda', help='device id')
    parser.add_argument('--world_size', default=2, type=int, help='number of distributed processes')

    parser.add_argument('--model_file', default='', type=str, help='model file')
    parser.add_argument('--log', default=False, type=_str2bool, help='whether write log file')

    args = parser.parse_args()

    if args.log:
        # The demo dataset logs to ./log-Test, every other data path to ./log-Test-Small.
        log_dir = './log-Test' if args.data_path == '../DATA/MIND-Demo' else './log-Test-Small'
        os.makedirs(log_dir, exist_ok=True)
        # The file name suffix is the last five characters of the current timestamp.
        args.log_file = log_dir + '/' + 'Tbs' + str(args.test_batch_size) + '-' + str(datetime.now())[-5:] + '.txt'

    # One process per visible CUDA device; the --world_size argument is not consulted here.
    WORLD_SIZE = torch.cuda.device_count()
    mp.spawn(ddp_main,
             args=(WORLD_SIZE, args),
             nprocs=WORLD_SIZE,
             join=True)
    t1 = time.time()
    run_time = (t1 - t0) / 3600  # elapsed wall-clock time in hours
    print('Running time: %0.4f' % run_time)
--------------------------------------------------------------------------------
/Hybrid-Utility/predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import pickle
4 | import time
5 | import sys
6 |
7 | from tqdm import tqdm
8 | from datetime import datetime
9 | import torch.cuda
10 | from torch.utils.data import DataLoader
11 |
12 | from transformers import BertTokenizer
13 | from transformers import AdamW
14 |
15 | import torch.distributed as dist
16 | import torch.multiprocessing as mp
17 | from torch.nn.parallel import DistributedDataParallel as DDP
18 | from torch.utils.data.distributed import DistributedSampler
19 |
20 | from model import BERTPrompt4NR
21 | from prepro_data import *
22 | from utils import evaluate
23 |
24 |
def setup(rank, world_size):
    """Set the rendezvous address/port and join the NCCL process group.

    Args:
        rank: Rank of the calling process.
        world_size: Total number of processes in the group.
    """
    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': '23342'})
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
29 |
30 |
def cleanup():
    """Tear down the distributed process group via ``dist.destroy_process_group()``."""
    dist.destroy_process_group()
33 |
34 |
def init_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and CUDA) with ``seed``."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    # Applied in the same order as the individual calls they replace.
    for apply_seed in seeders:
        apply_seed(seed)
41 |
42 |
class Logger(object):
    """Tee-style writer that mirrors every message to a stream and a log file.

    Intended as a drop-in replacement for ``sys.stdout``: it provides the
    ``write``/``flush`` pair that ``print`` relies on.
    """

    def __init__(self, filename, stream=sys.stdout):
        """Open ``filename`` for writing (truncating it) and remember ``stream``."""
        self.terminal = stream
        self.log = open(filename, 'w')

    def write(self, message):
        """Write ``message`` to both the wrapped stream and the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both sinks so buffered output reaches the terminal and the file."""
        self.terminal.flush()
        self.log.flush()
54 |
55 |
def load_tokenizer(model_name, args):
    """Load the BERT tokenizer and register the extra prompt tokens.

    Adds '[NSEP]' first, then the continuous tokens '[P1]'..'[Pn]' and
    '[Q1]'..'[Qm]' (n = ``args.num_conti1``, m = ``args.num_conti2``), and
    stores the resulting vocabulary size on ``args.vocab_size``.

    Returns:
        Tuple ``(tokenizer, conti_tokens1, conti_tokens2)``.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)

    conti_tokens1 = ['[P%d]' % idx for idx in range(1, args.num_conti1 + 1)]
    conti_tokens2 = ['[Q%d]' % idx for idx in range(1, args.num_conti2 + 1)]

    # Registration order matters for the assigned ids: separator first, then P/Q tokens.
    tokenizer.add_tokens(['[NSEP]'])
    tokenizer.add_tokens(conti_tokens1 + conti_tokens2)

    args.vocab_size = len(tokenizer)

    return tokenizer, conti_tokens1, conti_tokens2
75 |
76 |
def load_model(model_name, tokenizer, args):
    """Build the prompt model using the ids of the answer words 'bad' and 'good'."""
    answer_ids = tokenizer.encode(['bad', 'good'], add_special_tokens=False)
    return BERTPrompt4NR(model_name, answer_ids, args)
83 |
84 |
def eval(model, rank, world_size, data_loader):
    """Score every batch of ``data_loader`` and gather the results from all ranks.

    NOTE: the name shadows the builtin ``eval``; it is kept because callers
    use it.

    Args:
        model: Module called as ``model(batch_enc, batch_attn, batch_labs)``
            and returning ``(loss, scores)``.
        rank: Rank of this process; tensors are moved with ``.to(rank)``.
        world_size: Number of processes taking part in the collectives.
        data_loader: Iterable yielding ``(batch_enc, batch_attn, batch_labs,
            batch_imp)`` tuples.

    Returns:
        Tuple ``(scores_list, acc, acc_pos, pos_ratio, impids_list,
        labels_list)``. Each list holds one gathered tensor per rank; the
        three ratios are Python floats computed from counts summed over ranks.
    """
    model.eval()
    data_loader = tqdm(data_loader)
    val_scores = []
    # [number correct, number seen] over all samples / over positive samples.
    acc_cnt = torch.zeros(2).to(rank)
    acc_cnt_pos = torch.zeros(2).to(rank)
    imp_ids = []
    labels = []
    for batch_enc, batch_attn, batch_labs, batch_imp in data_loader:
        # Extend in place: rebuilding the lists with ``+`` on every batch
        # copies all previous elements and is quadratic in the number of samples.
        imp_ids.extend(batch_imp)
        labels.extend(batch_labs.cpu().numpy().tolist())

        batch_enc = batch_enc.to(rank)
        batch_attn = batch_attn.to(rank)
        batch_labs = batch_labs.to(rank)

        _, scores = model(batch_enc, batch_attn, batch_labs)

        # Column 1 is used as the ranking score.
        # presumably the score of the positive answer word; verify against the model
        ranking_scores = scores[:, 1].detach()
        val_scores.append(ranking_scores)

        predict = torch.argmax(scores.detach(), dim=1)
        num_correct = (predict == batch_labs).sum()
        acc_cnt[0] += num_correct
        acc_cnt[1] += predict.size(0)

        positive_idx = torch.where(batch_labs == 1)[0]
        num_correct_pos = (predict[positive_idx] == batch_labs[positive_idx]).sum()
        acc_cnt_pos[0] += num_correct_pos
        acc_cnt_pos[1] += positive_idx.size(0)

    # Sum the counters over every rank before forming the ratios.
    dist.all_reduce(acc_cnt, op=dist.ReduceOp.SUM)
    dist.all_reduce(acc_cnt_pos, op=dist.ReduceOp.SUM)

    acc = acc_cnt[0] / acc_cnt[1]
    acc_pos = acc_cnt_pos[0] / acc_cnt_pos[1]
    pos_ratio = acc_cnt_pos[1] / acc_cnt[1]

    val_scores = torch.cat(val_scores, dim=0)
    val_impids = torch.IntTensor(imp_ids).to(rank)
    val_labels = torch.IntTensor(labels).to(rank)

    # Receive buffers are shaped like the local tensors.
    # NOTE(review): assumes every rank holds the same number of samples - confirm.
    val_scores_list = [torch.zeros_like(val_scores).to(rank) for _ in range(world_size)]
    val_impids_list = [torch.zeros_like(val_impids).to(rank) for _ in range(world_size)]
    val_labels_list = [torch.zeros_like(val_labels).to(rank) for _ in range(world_size)]

    dist.all_gather(val_scores_list, val_scores)
    dist.all_gather(val_impids_list, val_impids)
    dist.all_gather(val_labels_list, val_labels)

    return val_scores_list, acc.item(), acc_pos.item(), pos_ratio.item(), val_impids_list, val_labels_list
137 |
138 |
def ddp_main(rank, world_size, args):
    """Per-process entry point: load the model and test data, evaluate, report.

    Args:
        rank: Rank of this process (also used as the device for ``.to(rank)``).
        world_size: Total number of spawned processes.
        args: Parsed command-line namespace; ``rank``, ``world_size`` and
            (through ``load_tokenizer``) ``vocab_size`` are written onto it.
    """
    args.rank = rank
    args.world_size = world_size
    init_seed(rank + 1)
    if rank == 0 and args.log:
        # Only rank 0 mirrors its stdout into the log file.
        sys.stdout = Logger(args.log_file, sys.stdout)
    setup(rank, world_size)

    print('| distributed init rank {}'.format(rank))
    dist.barrier()

    # load tokenizer
    tokenizer, conti_tokens1, conti_tokens2 = load_tokenizer(args.model_name, args)
    conti_tokens = [conti_tokens1, conti_tokens2]

    # load model
    net = load_model(args.model_name, tokenizer, args)

    # load data
    # NOTE: pickle.load can execute arbitrary code; news.txt must come from a trusted source.
    with open(os.path.join(args.data_path, 'news.txt'), 'rb') as news_file:
        news_dict = pickle.load(news_file)
    test_dataset = MyDataset(args, tokenizer, news_dict, conti_tokens, status='test')

    if rank == 0:
        print(args)
        print('Vocabulary size of tokenizer after adding new tokens : %d' % args.vocab_size)
        print(test_dataset[0]['sentence'])
        print('num test: %d' % len(test_dataset))

    # NOTE(review): DistributedSampler is built with its defaults here; check whether
    # its default shuffling/padding is intended for evaluation.
    test_sampler = DistributedSampler(test_dataset,
                                      rank=rank,
                                      num_replicas=world_size)
    nw = 2
    test_kwargs = {'batch_size': args.test_batch_size, 'sampler': test_sampler,
                   'shuffle': False, 'pin_memory': False,
                   'num_workers': nw, 'collate_fn': test_dataset.collate_fn}

    test_loader = DataLoader(test_dataset, **test_kwargs)

    net = net.to(rank)
    net = DDP(net, device_ids=[rank])

    dist.barrier()

    # Remap tensors saved on cuda:0 onto this process's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    net.module.load_state_dict(torch.load(args.model_file, map_location=map_location))

    with torch.no_grad():
        st_test = time.time()
        test_scores, _, _, _, test_impids, test_labels = \
            eval(net, rank, world_size, test_loader)

        # Group the gathered scores/labels by impression id:
        # {imp_id: {'score': [...], 'lab': [...]}}
        impressions = {}
        for i in range(world_size):
            scores, imp_id, labs = test_scores[i], test_impids[i], test_labels[i]
            assert scores.size() == imp_id.size() == labs.size()
            for sco, imp, lab in zip(scores.cpu().numpy().tolist(),
                                     imp_id.cpu().numpy().tolist(),
                                     labs.cpu().numpy().tolist()):
                entry = impressions.get(imp)
                if entry is None:
                    entry = impressions[imp] = {'score': [], 'lab': []}
                entry['score'].append(sco)
                entry['lab'].append(lab)

        # Within each impression, order labels by descending score; the
        # prediction is then simply the rank positions 1..n.
        predicts, truths = [], []
        for entry in impressions.values():
            ranked = sorted(zip(entry['score'], entry['lab']), key=lambda pair: pair[0], reverse=True)
            sort_labs = tuple(lab for _, lab in ranked)
            predicts.append(list(range(1, len(sort_labs) + 1)))
            truths.append(sort_labs)

        auc_test, mrr_test, ndcg5_test, ndcg10_test = evaluate(predicts, truths)
        end_test = time.time()
        test_spend = (end_test - st_test) / 60

        if rank == 0:
            print("Test: AUC: %0.4f\tMRR: %0.4f\tnDCG@5: %0.4f\tnDCG@10: %0.4f\t[Test-Time: %0.2f mim]" %
                  (auc_test, mrr_test, ndcg5_test, ndcg10_test, test_spend))
    cleanup()
222 |
223 |
if __name__ == '__main__':
    def _str2bool(value):
        """Parse a command-line boolean.

        argparse's ``type=bool`` calls ``bool()`` on the raw string, so any
        non-empty value (including 'False') becomes True. This converter
        interprets the usual spellings explicitly.
        """
        if isinstance(value, bool):
            return value
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))

    t0 = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='../DATA/MIND-Small', type=str, help='Path')
    parser.add_argument('--model_name', default='bert-base-uncased', type=str)

    parser.add_argument('--test_batch_size', default=15, type=int, help='test batch_size')
    parser.add_argument('--max_his', default=50, type=int, help='max number of history')
    parser.add_argument('--max_tokens', default=500, type=int, help='max number of tokens')

    parser.add_argument('--max_his_len', default=450, type=int, help='max number of history')

    parser.add_argument('--num_conti1', default=3, type=int, help='number of continuous tokens')
    parser.add_argument('--num_conti2', default=3, type=int, help='number of continuous tokens')

    parser.add_argument('--device', default='cuda', help='device id')
    parser.add_argument('--world_size', default=2, type=int, help='number of distributed processes')

    parser.add_argument('--model_file', default='', type=str, help='model file')
    parser.add_argument('--log', default=False, type=_str2bool, help='whether write log file')

    args = parser.parse_args()

    if args.log:
        # The demo dataset logs to ./log-Test, every other data path to ./log-Test-Small.
        log_dir = './log-Test' if args.data_path == '../DATA/MIND-Demo' else './log-Test-Small'
        os.makedirs(log_dir, exist_ok=True)
        # Name encodes batch size, both continuous-token counts and the last
        # five characters of the current timestamp.
        args.log_file = log_dir + '/' + 'Tbs' + str(args.test_batch_size) + \
            '-n' + str(args.num_conti1) + str(args.num_conti2) + \
            '-' + str(datetime.now())[-5:] + '.txt'

    # One process per visible CUDA device; the --world_size argument is not consulted here.
    WORLD_SIZE = torch.cuda.device_count()
    mp.spawn(ddp_main,
             args=(WORLD_SIZE, args),
             nprocs=WORLD_SIZE,
             join=True)
    t1 = time.time()
    run_time = (t1 - t0) / 3600  # elapsed wall-clock time in hours
    print('Running time: %0.4f' % run_time)
--------------------------------------------------------------------------------
/Hybrid-Relevance/predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import pickle
4 | import time
5 | import sys
6 |
7 | from tqdm import tqdm
8 | from datetime import datetime
9 | import torch.cuda
10 | from torch.utils.data import DataLoader
11 |
12 | from transformers import BertTokenizer
13 | from transformers import AdamW
14 |
15 | import torch.distributed as dist
16 | import torch.multiprocessing as mp
17 | from torch.nn.parallel import DistributedDataParallel as DDP
18 | from torch.utils.data.distributed import DistributedSampler
19 |
20 | from model import BERTPrompt4NR
21 | from prepro_data import *
22 | from utils import evaluate
23 |
24 |
def setup(rank, world_size):
    """Set the rendezvous address/port and join the NCCL process group.

    Args:
        rank: Rank of the calling process.
        world_size: Total number of processes in the group.
    """
    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': '23342'})
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
29 |
30 |
def cleanup():
    """Tear down the default process group if one is currently initialised.

    ``destroy_process_group`` raises when no group exists, so the call is
    guarded; this makes the teardown safe to call more than once or after a
    failed ``setup``.
    """
    if dist.is_initialized():
        dist.destroy_process_group()
33 |
34 |
def init_seed(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    # Same call order as a hand-written sequence of five seed calls.
    for apply_seed in seeders:
        apply_seed(seed)
41 |
42 |
class Logger(object):
    """Write-only file-like object that tees text to a stream and a log file.

    Intended as a replacement for ``sys.stdout``: everything written is sent
    to the wrapped ``stream`` and also to ``filename`` (opened in ``'w'``
    mode, so an existing file is truncated).
    """

    def __init__(self, filename, stream=sys.stdout):
        self.terminal = stream
        self.log = open(filename, 'w')

    def write(self, message):
        """Forward ``message`` to both the stream and the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both sinks.

        A no-op here would leave buffered log text unwritten whenever a
        caller (``print(..., flush=True)``, interpreter shutdown hooks)
        explicitly asks for a flush.
        """
        self.terminal.flush()
        self.log.flush()
54 |
55 |
def load_tokenizer(model_name, args):
    """Load the BERT tokenizer and register the extra prompt tokens.

    Adds ``[NSEP]`` first, then ``args.num_conti1`` tokens ``[P1]..[Pn]``
    followed by ``args.num_conti2`` tokens ``[Q1]..[Qm]``. The resulting
    vocabulary size is stored on ``args.vocab_size``.

    Returns:
        ``(tokenizer, conti_tokens1, conti_tokens2)``.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)

    conti_tokens1 = ['[P{}]'.format(idx) for idx in range(1, args.num_conti1 + 1)]
    conti_tokens2 = ['[Q{}]'.format(idx) for idx in range(1, args.num_conti2 + 1)]

    # Registration order fixes the ids of the new tokens: [NSEP] comes first.
    tokenizer.add_tokens(['[NSEP]'])
    tokenizer.add_tokens(conti_tokens1 + conti_tokens2)

    args.vocab_size = len(tokenizer)
    return tokenizer, conti_tokens1, conti_tokens2
75 |
76 |
def load_model(model_name, tokenizer, args):
    """Build a ``BERTPrompt4NR`` whose answer words are 'unrelated'/'related'.

    The two answer words are converted to ids with ``tokenizer.encode``
    (no special tokens) and handed to the model in that order.
    """
    answer_words = ['unrelated', 'related']
    return BERTPrompt4NR(
        model_name,
        tokenizer.encode(answer_words, add_special_tokens=False),
        args,
    )
83 |
84 |
def eval(model, rank, world_size, data_loader):
    """Score every batch of ``data_loader`` and gather the results from all ranks.

    Must be called by every process of the initialised process group because
    it issues ``all_reduce`` / ``all_gather`` collectives.

    Args:
        model: callable as ``model(batch_enc, batch_attn, batch_labs)`` that
            returns ``(loss, scores)``; ``scores[:, 1]`` is used as the
            ranking score and the argmax over dim 1 as the predicted label.
        rank: device index of this process; tensors are moved with ``.to(rank)``.
        world_size: number of processes taking part in the collectives.
        data_loader: iterable of ``(batch_enc, batch_attn, batch_labs,
            batch_imp)`` batches; ``batch_imp`` is an iterable of integer ids
            (they are packed into an ``IntTensor``).

    Returns:
        ``(scores_list, acc, acc_pos, pos_ratio, impids_list, labels_list)``.
        Each list holds one tensor per rank. ``acc`` is the accuracy over all
        samples, ``acc_pos`` the accuracy over samples labelled 1 and
        ``pos_ratio`` the share of samples labelled 1, each computed from
        counts summed over all ranks.
    """
    model.eval()
    data_loader = tqdm(data_loader)
    val_scores = []
    # Each counter is [hits, total]; kept on the device so it can be all-reduced.
    acc_cnt = torch.zeros(2).to(rank)
    acc_cnt_pos = torch.zeros(2).to(rank)
    imp_ids = []
    labels = []
    for batch_enc, batch_attn, batch_labs, batch_imp in data_loader:
        # Grow the bookkeeping lists in place; rebuilding them with ``+`` on
        # every step is quadratic in the number of samples.
        imp_ids.extend(batch_imp)
        labels.extend(batch_labs.tolist())

        batch_enc = batch_enc.to(rank)
        batch_attn = batch_attn.to(rank)
        batch_labs = batch_labs.to(rank)

        _, scores = model(batch_enc, batch_attn, batch_labs)
        scores = scores.detach()

        val_scores.append(scores[:, 1])

        predict = torch.argmax(scores, dim=1)
        acc_cnt[0] += (predict == batch_labs).sum()
        acc_cnt[1] += predict.size(0)

        positive_idx = torch.where(batch_labs == 1)[0]
        acc_cnt_pos[0] += (predict[positive_idx] == batch_labs[positive_idx]).sum()
        acc_cnt_pos[1] += positive_idx.size(0)

    dist.all_reduce(acc_cnt, op=dist.ReduceOp.SUM)
    dist.all_reduce(acc_cnt_pos, op=dist.ReduceOp.SUM)

    acc = acc_cnt[0] / acc_cnt[1]
    acc_pos = acc_cnt_pos[0] / acc_cnt_pos[1]
    pos_ratio = acc_cnt_pos[1] / acc_cnt[1]

    val_scores = torch.cat(val_scores, dim=0)
    val_impids = torch.IntTensor(imp_ids).to(rank)
    val_labels = torch.IntTensor(labels).to(rank)

    def gather(local):
        # all_gather fills one pre-allocated buffer per rank.
        # NOTE(review): the buffers are shaped like the local tensor, so this
        # presumably relies on every rank holding the same number of samples.
        buffers = [torch.zeros_like(local).to(rank) for _ in range(world_size)]
        dist.all_gather(buffers, local)
        return buffers

    val_scores_list = gather(val_scores)
    val_impids_list = gather(val_impids)
    val_labels_list = gather(val_labels)

    return val_scores_list, acc.item(), acc_pos.item(), pos_ratio.item(), val_impids_list, val_labels_list
137 |
138 |
def ddp_main(rank, world_size, args):
    """Per-process entry point: load the checkpoint and evaluate the test split.

    Args:
        rank: index of this process, also used as its CUDA device index.
        world_size: total number of processes in the group.
        args: parsed command-line namespace; ``rank``, ``world_size`` and
            ``vocab_size`` are written onto it as a side effect.

    Every rank computes the ranking metrics from the gathered scores; only
    rank 0 prints them (and, when ``args.log`` is set, tees stdout to
    ``args.log_file``).
    """
    args.rank = rank
    args.world_size = world_size
    init_seed(rank + 1)
    if rank == 0 and args.log:
        sys.stdout = Logger(args.log_file, sys.stdout)
    setup(rank, world_size)

    print('| distributed init rank {}'.format(rank))
    dist.barrier()

    # load tokenizer
    tokenizer, conti_tokens1, conti_tokens2 = load_tokenizer(args.model_name, args)
    conti_tokens = [conti_tokens1, conti_tokens2]

    # load model
    net = load_model(args.model_name, tokenizer, args)

    # load data
    # NOTE: 'news.txt' is read as a pickle; unpickling can execute arbitrary
    # code, so only point --data_path at trusted data.
    with open(os.path.join(args.data_path, 'news.txt'), 'rb') as news_file:
        news_dict = pickle.load(news_file)
    test_dataset = MyDataset(args, tokenizer, news_dict, conti_tokens, status='test')

    if rank == 0:
        print(args)
        print('Vocabulary size of tokenizer after adding new tokens : %d' % args.vocab_size)
        print(test_dataset[0]['sentence'])
        print('num test: %d' % len(test_dataset))

    # NOTE(review): DistributedSampler shuffles by default and, per the
    # PyTorch docs, pads with repeated indices when the dataset size is not
    # divisible by world_size, so a few samples may be scored twice.
    test_sampler = DistributedSampler(test_dataset,
                                      rank=rank,
                                      num_replicas=world_size)
    nw = 2
    test_kwargs = {'batch_size': args.test_batch_size, 'sampler': test_sampler,
                   'shuffle': False, 'pin_memory': False,
                   'num_workers': nw, 'collate_fn': test_dataset.collate_fn}

    test_loader = DataLoader(test_dataset, **test_kwargs)

    net = net.to(rank)
    net = DDP(net, device_ids=[rank])

    dist.barrier()

    # Remap tensors saved from cuda:0 onto this rank's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    net.module.load_state_dict(torch.load(args.model_file, map_location=map_location))

    with torch.no_grad():
        st_test = time.time()
        test_scores, _, _, _, test_impids, test_labels = \
            eval(net, rank, world_size, test_loader)

        # Group scores and labels by impression id: {imp: {'score': [], 'lab': []}}
        impressions = {}
        for i in range(world_size):
            scores, imp_id, labs = test_scores[i], test_impids[i], test_labels[i]
            assert scores.size() == imp_id.size() == labs.size()
            for sco, imp, lab in zip(scores.tolist(), imp_id.tolist(), labs.tolist()):
                entry = impressions.setdefault(imp, {'score': [], 'lab': []})
                entry['score'].append(sco)
                entry['lab'].append(lab)

        predicts, truths = [], []
        for entry in impressions.values():
            # Order each impression's labels by descending score; the
            # prediction is then simply the rank positions 1..n.
            sl_zip = sorted(zip(entry['score'], entry['lab']), key=lambda x: x[0], reverse=True)
            _, sort_labs = zip(*sl_zip)
            predicts.append(list(range(1, len(sort_labs) + 1, 1)))
            truths.append(sort_labs)

        auc_test, mrr_test, ndcg5_test, ndcg10_test = evaluate(predicts, truths)
        end_test = time.time()
        test_spend = (end_test - st_test) / 60

        if rank == 0:
            print("Test: AUC: %0.4f\tMRR: %0.4f\tnDCG@5: %0.4f\tnDCG@10: %0.4f\t[Test-Time: %0.2f mim]" %
                  (auc_test, mrr_test, ndcg5_test, ndcg10_test, test_spend))
    cleanup()
222 |
223 |
def str2bool(value):
    """Parse a command-line boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so
    ``--log False`` would evaluate to True. This converter understands the
    usual spellings and rejects anything else.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))


if __name__ == '__main__':
    t0 = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='../DATA/MIND-Demo', type=str, help='Path')
    parser.add_argument('--model_name', default='bert-base-uncased', type=str)

    parser.add_argument('--test_batch_size', default=15, type=int, help='test batch_size')
    parser.add_argument('--max_his', default=50, type=int, help='max number of history')
    parser.add_argument('--max_tokens', default=500, type=int, help='max number of tokens')

    parser.add_argument('--max_his_len', default=450, type=int, help='max number of history')

    parser.add_argument('--num_conti1', default=3, type=int, help='number of continuous tokens')
    parser.add_argument('--num_conti2', default=3, type=int, help='number of continuous tokens')

    parser.add_argument('--device', default='cuda', help='device id')
    parser.add_argument('--world_size', default=2, type=int, help='number of distributed processes')

    parser.add_argument('--model_file', default='', type=str, help='model file')
    parser.add_argument('--log', default=True, type=str2bool, help='whether write log file')

    args = parser.parse_args()

    # Create the log file path; the demo dataset logs to its own directory.
    if args.log:
        log_dir = './log-Test' if args.data_path == '../DATA/MIND-Demo' else './log-Test-Small'
        os.makedirs(log_dir, exist_ok=True)
        # The last five characters of the timestamp keep file names distinct.
        args.log_file = log_dir + '/' + 'Tbs' + str(args.test_batch_size) + \
            '-n' + str(args.num_conti1) + str(args.num_conti2) + \
            '-' + str(datetime.now())[-5:] + '.txt'

    # One process per visible GPU; the --world_size option is overridden.
    WORLD_SIZE = torch.cuda.device_count()
    mp.spawn(ddp_main,
             args=(WORLD_SIZE, args),
             nprocs=WORLD_SIZE,
             join=True)
    t1 = time.time()
    run_time = (t1 - t0) / 3600
    print('Running time: %0.4f' % run_time)
--------------------------------------------------------------------------------
/Hybrid-Emotion/predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import pickle
4 | import time
5 | import sys
6 |
7 | from tqdm import tqdm
8 | from datetime import datetime
9 | import torch.cuda
10 | from torch.utils.data import DataLoader
11 |
12 | from transformers import BertTokenizer
13 | from transformers import AdamW
14 |
15 | import torch.distributed as dist
16 | import torch.multiprocessing as mp
17 | from torch.nn.parallel import DistributedDataParallel as DDP
18 | from torch.utils.data.distributed import DistributedSampler
19 |
20 | from model import BERTPrompt4NR
21 | from prepro_data import *
22 | from utils import evaluate
23 |
24 |
def setup(rank, world_size):
    """Join the default NCCL process group as ``rank`` out of ``world_size``.

    The rendezvous endpoint is hard-coded to ``localhost:23342`` and is
    written into the process environment before the group is created.
    """
    rendezvous = {'MASTER_ADDR': 'localhost', 'MASTER_PORT': '23342'}
    os.environ.update(rendezvous)
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
29 |
30 |
def cleanup():
    """Tear down the default process group if one is currently initialised.

    ``destroy_process_group`` raises when no group exists, so the call is
    guarded; this makes the teardown safe to call more than once or after a
    failed ``setup``.
    """
    if dist.is_initialized():
        dist.destroy_process_group()
33 |
34 |
def init_seed(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    # Same call order as a hand-written sequence of five seed calls.
    for apply_seed in seeders:
        apply_seed(seed)
41 |
42 |
class Logger(object):
    """Write-only file-like object that tees text to a stream and a log file.

    Intended as a replacement for ``sys.stdout``: everything written is sent
    to the wrapped ``stream`` and also to ``filename`` (opened in ``'w'``
    mode, so an existing file is truncated).
    """

    def __init__(self, filename, stream=sys.stdout):
        self.terminal = stream
        self.log = open(filename, 'w')

    def write(self, message):
        """Forward ``message`` to both the stream and the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both sinks.

        A no-op here would leave buffered log text unwritten whenever a
        caller (``print(..., flush=True)``, interpreter shutdown hooks)
        explicitly asks for a flush.
        """
        self.terminal.flush()
        self.log.flush()
54 |
55 |
def load_tokenizer(model_name, args):
    """Load the BERT tokenizer and register the extra prompt tokens.

    Adds ``[NSEP]`` first, then ``args.num_conti1`` tokens ``[P1]..[Pn]``
    followed by ``args.num_conti2`` tokens ``[Q1]..[Qm]``. The resulting
    vocabulary size is stored on ``args.vocab_size``.

    Returns:
        ``(tokenizer, conti_tokens1, conti_tokens2)``.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)

    conti_tokens1 = ['[P{}]'.format(idx) for idx in range(1, args.num_conti1 + 1)]
    conti_tokens2 = ['[Q{}]'.format(idx) for idx in range(1, args.num_conti2 + 1)]

    # Registration order fixes the ids of the new tokens: [NSEP] comes first.
    tokenizer.add_tokens(['[NSEP]'])
    tokenizer.add_tokens(conti_tokens1 + conti_tokens2)

    args.vocab_size = len(tokenizer)
    return tokenizer, conti_tokens1, conti_tokens2
75 |
76 |
def load_model(model_name, tokenizer, args):
    """Build a ``BERTPrompt4NR`` whose answer words are 'boring'/'interesting'.

    The two answer words are converted to ids with ``tokenizer.encode``
    (no special tokens) and handed to the model in that order.
    """
    answer_words = ['boring', 'interesting']
    return BERTPrompt4NR(
        model_name,
        tokenizer.encode(answer_words, add_special_tokens=False),
        args,
    )
83 |
84 |
def eval(model, rank, world_size, data_loader):
    """Score every batch of ``data_loader`` and gather the results from all ranks.

    Must be called by every process of the initialised process group because
    it issues ``all_reduce`` / ``all_gather`` collectives.

    Args:
        model: callable as ``model(batch_enc, batch_attn, batch_labs)`` that
            returns ``(loss, scores)``; ``scores[:, 1]`` is used as the
            ranking score and the argmax over dim 1 as the predicted label.
        rank: device index of this process; tensors are moved with ``.to(rank)``.
        world_size: number of processes taking part in the collectives.
        data_loader: iterable of ``(batch_enc, batch_attn, batch_labs,
            batch_imp)`` batches; ``batch_imp`` is an iterable of integer ids
            (they are packed into an ``IntTensor``).

    Returns:
        ``(scores_list, acc, acc_pos, pos_ratio, impids_list, labels_list)``.
        Each list holds one tensor per rank. ``acc`` is the accuracy over all
        samples, ``acc_pos`` the accuracy over samples labelled 1 and
        ``pos_ratio`` the share of samples labelled 1, each computed from
        counts summed over all ranks.
    """
    model.eval()
    data_loader = tqdm(data_loader)
    val_scores = []
    # Each counter is [hits, total]; kept on the device so it can be all-reduced.
    acc_cnt = torch.zeros(2).to(rank)
    acc_cnt_pos = torch.zeros(2).to(rank)
    imp_ids = []
    labels = []
    for batch_enc, batch_attn, batch_labs, batch_imp in data_loader:
        # Grow the bookkeeping lists in place; rebuilding them with ``+`` on
        # every step is quadratic in the number of samples.
        imp_ids.extend(batch_imp)
        labels.extend(batch_labs.tolist())

        batch_enc = batch_enc.to(rank)
        batch_attn = batch_attn.to(rank)
        batch_labs = batch_labs.to(rank)

        _, scores = model(batch_enc, batch_attn, batch_labs)
        scores = scores.detach()

        val_scores.append(scores[:, 1])

        predict = torch.argmax(scores, dim=1)
        acc_cnt[0] += (predict == batch_labs).sum()
        acc_cnt[1] += predict.size(0)

        positive_idx = torch.where(batch_labs == 1)[0]
        acc_cnt_pos[0] += (predict[positive_idx] == batch_labs[positive_idx]).sum()
        acc_cnt_pos[1] += positive_idx.size(0)

    dist.all_reduce(acc_cnt, op=dist.ReduceOp.SUM)
    dist.all_reduce(acc_cnt_pos, op=dist.ReduceOp.SUM)

    acc = acc_cnt[0] / acc_cnt[1]
    acc_pos = acc_cnt_pos[0] / acc_cnt_pos[1]
    pos_ratio = acc_cnt_pos[1] / acc_cnt[1]

    val_scores = torch.cat(val_scores, dim=0)
    val_impids = torch.IntTensor(imp_ids).to(rank)
    val_labels = torch.IntTensor(labels).to(rank)

    def gather(local):
        # all_gather fills one pre-allocated buffer per rank.
        # NOTE(review): the buffers are shaped like the local tensor, so this
        # presumably relies on every rank holding the same number of samples.
        buffers = [torch.zeros_like(local).to(rank) for _ in range(world_size)]
        dist.all_gather(buffers, local)
        return buffers

    val_scores_list = gather(val_scores)
    val_impids_list = gather(val_impids)
    val_labels_list = gather(val_labels)

    return val_scores_list, acc.item(), acc_pos.item(), pos_ratio.item(), val_impids_list, val_labels_list
137 |
138 |
def ddp_main(rank, world_size, args):
    """Per-process entry point: load the checkpoint and evaluate the test split.

    Args:
        rank: index of this process, also used as its CUDA device index.
        world_size: total number of processes in the group.
        args: parsed command-line namespace; ``rank``, ``world_size`` and
            ``vocab_size`` are written onto it as a side effect.

    Every rank computes the ranking metrics from the gathered scores; only
    rank 0 prints them (and, when ``args.log`` is set, tees stdout to
    ``args.log_file``).
    """
    args.rank = rank
    args.world_size = world_size
    init_seed(rank + 1)
    if rank == 0 and args.log:
        sys.stdout = Logger(args.log_file, sys.stdout)
    setup(rank, world_size)

    print('| distributed init rank {}'.format(rank))
    dist.barrier()

    # load tokenizer
    tokenizer, conti_tokens1, conti_tokens2 = load_tokenizer(args.model_name, args)
    conti_tokens = [conti_tokens1, conti_tokens2]

    # load model
    net = load_model(args.model_name, tokenizer, args)

    # load data
    # NOTE: 'news.txt' is read as a pickle; unpickling can execute arbitrary
    # code, so only point --data_path at trusted data.
    with open(os.path.join(args.data_path, 'news.txt'), 'rb') as news_file:
        news_dict = pickle.load(news_file)
    test_dataset = MyDataset(args, tokenizer, news_dict, conti_tokens, status='test')

    if rank == 0:
        print(args)
        print('Vocabulary size of tokenizer after adding new tokens : %d' % args.vocab_size)
        print(test_dataset[0]['sentence'])
        print('num test: %d' % len(test_dataset))

    # NOTE(review): DistributedSampler shuffles by default and, per the
    # PyTorch docs, pads with repeated indices when the dataset size is not
    # divisible by world_size, so a few samples may be scored twice.
    test_sampler = DistributedSampler(test_dataset,
                                      rank=rank,
                                      num_replicas=world_size)
    nw = 2
    test_kwargs = {'batch_size': args.test_batch_size, 'sampler': test_sampler,
                   'shuffle': False, 'pin_memory': False,
                   'num_workers': nw, 'collate_fn': test_dataset.collate_fn}

    test_loader = DataLoader(test_dataset, **test_kwargs)

    net = net.to(rank)
    net = DDP(net, device_ids=[rank])

    dist.barrier()

    # Remap tensors saved from cuda:0 onto this rank's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    net.module.load_state_dict(torch.load(args.model_file, map_location=map_location))

    with torch.no_grad():
        st_test = time.time()
        test_scores, _, _, _, test_impids, test_labels = \
            eval(net, rank, world_size, test_loader)

        # Group scores and labels by impression id: {imp: {'score': [], 'lab': []}}
        impressions = {}
        for i in range(world_size):
            scores, imp_id, labs = test_scores[i], test_impids[i], test_labels[i]
            assert scores.size() == imp_id.size() == labs.size()
            for sco, imp, lab in zip(scores.tolist(), imp_id.tolist(), labs.tolist()):
                entry = impressions.setdefault(imp, {'score': [], 'lab': []})
                entry['score'].append(sco)
                entry['lab'].append(lab)

        predicts, truths = [], []
        for entry in impressions.values():
            # Order each impression's labels by descending score; the
            # prediction is then simply the rank positions 1..n.
            sl_zip = sorted(zip(entry['score'], entry['lab']), key=lambda x: x[0], reverse=True)
            _, sort_labs = zip(*sl_zip)
            predicts.append(list(range(1, len(sort_labs) + 1, 1)))
            truths.append(sort_labs)

        auc_test, mrr_test, ndcg5_test, ndcg10_test = evaluate(predicts, truths)
        end_test = time.time()
        test_spend = (end_test - st_test) / 60

        if rank == 0:
            print("Test: AUC: %0.4f\tMRR: %0.4f\tnDCG@5: %0.4f\tnDCG@10: %0.4f\t[Test-Time: %0.2f mim]" %
                  (auc_test, mrr_test, ndcg5_test, ndcg10_test, test_spend))
    cleanup()
222 |
223 |
def str2bool(value):
    """Parse a command-line boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so
    ``--log False`` would evaluate to True. This converter understands the
    usual spellings and rejects anything else.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))


if __name__ == '__main__':
    t0 = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='../DATA/MIND-Small', type=str, help='Path')
    parser.add_argument('--model_name', default='bert-base-uncased', type=str)

    parser.add_argument('--test_batch_size', default=15, type=int, help='test batch_size')
    parser.add_argument('--max_his', default=50, type=int, help='max number of history')
    parser.add_argument('--max_tokens', default=500, type=int, help='max number of tokens')

    parser.add_argument('--max_his_len', default=450, type=int, help='max number of history')

    parser.add_argument('--num_conti1', default=3, type=int, help='number of continuous tokens')
    parser.add_argument('--num_conti2', default=3, type=int, help='number of continuous tokens')

    parser.add_argument('--device', default='cuda', help='device id')
    parser.add_argument('--world_size', default=2, type=int, help='number of distributed processes')

    parser.add_argument('--model_file', default='', type=str, help='model file')
    parser.add_argument('--log', default=True, type=str2bool, help='whether write log file')

    args = parser.parse_args()

    # Create the log file path; the demo dataset logs to its own directory.
    if args.log:
        log_dir = './log-Test' if args.data_path == '../DATA/MIND-Demo' else './log-Test-Small'
        os.makedirs(log_dir, exist_ok=True)
        # The last five characters of the timestamp keep file names distinct.
        args.log_file = log_dir + '/' + 'Tbs' + str(args.test_batch_size) + \
            '-n' + str(args.num_conti1) + str(args.num_conti2) + \
            '-' + str(datetime.now())[-5:] + '.txt'

    # One process per visible GPU; the --world_size option is overridden.
    WORLD_SIZE = torch.cuda.device_count()
    mp.spawn(ddp_main,
             args=(WORLD_SIZE, args),
             nprocs=WORLD_SIZE,
             join=True)
    t1 = time.time()
    run_time = (t1 - t0) / 3600
    print('Running time: %0.4f' % run_time)
--------------------------------------------------------------------------------
/Discrete-Utility/predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import pickle
4 | import time
5 | import sys
6 |
7 | from tqdm import tqdm
8 | from datetime import datetime
9 | import torch.cuda
10 | from torch.utils.data import DataLoader
11 |
12 | from transformers import BertTokenizer
13 | from transformers import AdamW
14 |
15 | import torch.distributed as dist
16 | import torch.multiprocessing as mp
17 | from torch.nn.parallel import DistributedDataParallel as DDP
18 | from torch.utils.data.distributed import DistributedSampler
19 | from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
20 |
21 | from model import BERTPrompt4NR
22 | from prepro_data import *
23 | from utils import evaluate
24 |
25 |
def setup(rank, world_size):
    """Join the default NCCL process group as ``rank`` out of ``world_size``.

    The rendezvous endpoint is hard-coded to ``localhost:23342`` and is
    written into the process environment before the group is created.
    """
    rendezvous = {'MASTER_ADDR': 'localhost', 'MASTER_PORT': '23342'}
    os.environ.update(rendezvous)
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
30 |
31 |
def cleanup():
    """Tear down the default process group if one is currently initialised.

    ``destroy_process_group`` raises when no group exists, so the call is
    guarded; this makes the teardown safe to call more than once or after a
    failed ``setup``.
    """
    if dist.is_initialized():
        dist.destroy_process_group()
34 |
35 |
def init_seed(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    # Same call order as a hand-written sequence of five seed calls.
    for apply_seed in seeders:
        apply_seed(seed)
42 |
43 |
class Logger(object):
    """Write-only file-like object that tees text to a stream and a log file.

    Intended as a replacement for ``sys.stdout``: everything written is sent
    to the wrapped ``stream`` and also to ``filename`` (opened in ``'w'``
    mode, so an existing file is truncated).
    """

    def __init__(self, filename, stream=sys.stdout):
        self.terminal = stream
        self.log = open(filename, 'w')

    def write(self, message):
        """Forward ``message`` to both the stream and the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both sinks.

        A no-op here would leave buffered log text unwritten whenever a
        caller (``print(..., flush=True)``, interpreter shutdown hooks)
        explicitly asks for a flush.
        """
        self.terminal.flush()
        self.log.flush()
55 |
56 |
def load_model(model_name, args):
    """Create the tokenizer and the ``BERTPrompt4NR`` model for 'bad'/'good'.

    The tokenizer gains the extra ``[NSEP]`` token and its new size is stored
    on ``args.vocab_size`` before the model is constructed.

    Returns:
        ``(net, tokenizer)``.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    tokenizer.add_tokens(['[NSEP]'])
    args.vocab_size = len(tokenizer)

    answer_words = ['bad', 'good']
    net = BERTPrompt4NR(
        model_name,
        tokenizer.encode(answer_words, add_special_tokens=False),
        args,
    )
    return net, tokenizer
70 |
71 |
def eval(model, rank, world_size, data_loader):
    """Score every batch of ``data_loader`` and gather the results from all ranks.

    Must be called by every process of the initialised process group because
    it issues ``all_reduce`` / ``all_gather`` collectives.

    Args:
        model: callable as ``model(batch_enc, batch_attn, batch_labs)`` that
            returns ``(loss, scores)``; ``scores[:, 1]`` is used as the
            ranking score and the argmax over dim 1 as the predicted label.
        rank: device index of this process; tensors are moved with ``.to(rank)``.
        world_size: number of processes taking part in the collectives.
        data_loader: iterable of ``(batch_enc, batch_attn, batch_labs,
            batch_imp)`` batches; ``batch_imp`` is an iterable of integer ids
            (they are packed into an ``IntTensor``).

    Returns:
        ``(scores_list, acc, acc_pos, pos_ratio, impids_list, labels_list)``.
        Each list holds one tensor per rank. ``acc`` is the accuracy over all
        samples, ``acc_pos`` the accuracy over samples labelled 1 and
        ``pos_ratio`` the share of samples labelled 1, each computed from
        counts summed over all ranks.
    """
    model.eval()
    data_loader = tqdm(data_loader)
    val_scores = []
    # Each counter is [hits, total]; kept on the device so it can be all-reduced.
    acc_cnt = torch.zeros(2).to(rank)
    acc_cnt_pos = torch.zeros(2).to(rank)
    imp_ids = []
    labels = []
    for batch_enc, batch_attn, batch_labs, batch_imp in data_loader:
        # Grow the bookkeeping lists in place; rebuilding them with ``+`` on
        # every step is quadratic in the number of samples.
        imp_ids.extend(batch_imp)
        labels.extend(batch_labs.tolist())

        batch_enc = batch_enc.to(rank)
        batch_attn = batch_attn.to(rank)
        batch_labs = batch_labs.to(rank)

        _, scores = model(batch_enc, batch_attn, batch_labs)
        scores = scores.detach()

        val_scores.append(scores[:, 1])

        predict = torch.argmax(scores, dim=1)
        acc_cnt[0] += (predict == batch_labs).sum()
        acc_cnt[1] += predict.size(0)

        positive_idx = torch.where(batch_labs == 1)[0]
        acc_cnt_pos[0] += (predict[positive_idx] == batch_labs[positive_idx]).sum()
        acc_cnt_pos[1] += positive_idx.size(0)

    dist.all_reduce(acc_cnt, op=dist.ReduceOp.SUM)
    dist.all_reduce(acc_cnt_pos, op=dist.ReduceOp.SUM)

    acc = acc_cnt[0] / acc_cnt[1]
    acc_pos = acc_cnt_pos[0] / acc_cnt_pos[1]
    pos_ratio = acc_cnt_pos[1] / acc_cnt[1]

    val_scores = torch.cat(val_scores, dim=0)
    val_impids = torch.IntTensor(imp_ids).to(rank)
    val_labels = torch.IntTensor(labels).to(rank)

    def gather(local):
        # all_gather fills one pre-allocated buffer per rank.
        # NOTE(review): the buffers are shaped like the local tensor, so this
        # presumably relies on every rank holding the same number of samples.
        buffers = [torch.zeros_like(local).to(rank) for _ in range(world_size)]
        dist.all_gather(buffers, local)
        return buffers

    val_scores_list = gather(val_scores)
    val_impids_list = gather(val_impids)
    val_labels_list = gather(val_labels)

    return val_scores_list, acc.item(), acc_pos.item(), pos_ratio.item(), val_impids_list, val_labels_list
124 |
125 |
def ddp_main(rank, world_size, args):
    """Per-process entry point: load the checkpoint and evaluate the test split.

    Args:
        rank: index of this process, also used as its CUDA device index.
        world_size: total number of processes in the group.
        args: parsed command-line namespace; ``rank``, ``world_size`` and
            ``vocab_size`` are written onto it as a side effect.

    Every rank computes the ranking metrics from the gathered scores; only
    rank 0 prints them (and, when ``args.log`` is set, tees stdout to
    ``args.log_file``).
    """
    args.rank = rank
    args.world_size = world_size
    init_seed(rank + 1)
    if rank == 0 and args.log:
        sys.stdout = Logger(args.log_file, sys.stdout)
    setup(rank, world_size)

    print('| distributed init rank {}'.format(rank))
    dist.barrier()

    # load model
    net, tokenizer = load_model(args.model_name, args)

    # load data
    # NOTE: 'news.txt' is read as a pickle; unpickling can execute arbitrary
    # code, so only point --data_path at trusted data.
    with open(os.path.join(args.data_path, 'news.txt'), 'rb') as news_file:
        news_dict = pickle.load(news_file)
    test_dataset = MyDataset(args, tokenizer, news_dict, status='test')

    if rank == 0:
        print(args)
        print('Vocabulary size of tokenizer after adding new tokens : %d' % args.vocab_size)
        print(test_dataset[0]['sentence'])
        print('num test: %d' % len(test_dataset))

    # NOTE(review): DistributedSampler shuffles by default and, per the
    # PyTorch docs, pads with repeated indices when the dataset size is not
    # divisible by world_size, so a few samples may be scored twice.
    test_sampler = DistributedSampler(test_dataset,
                                      rank=rank,
                                      num_replicas=world_size)
    nw = 2
    test_kwargs = {'batch_size': args.test_batch_size, 'sampler': test_sampler,
                   'shuffle': False, 'pin_memory': False,
                   'num_workers': nw, 'collate_fn': test_dataset.collate_fn}

    test_loader = DataLoader(test_dataset, **test_kwargs)

    net = net.to(rank)
    net = DDP(net, device_ids=[rank])

    dist.barrier()

    # Remap tensors saved from cuda:0 onto this rank's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    net.module.load_state_dict(torch.load(args.model_file, map_location=map_location))

    with torch.no_grad():
        st_test = time.time()
        test_scores, _, _, _, test_impids, test_labels = \
            eval(net, rank, world_size, test_loader)

        # Group scores and labels by impression id: {imp: {'score': [], 'lab': []}}
        impressions = {}
        for i in range(world_size):
            scores, imp_id, labs = test_scores[i], test_impids[i], test_labels[i]
            assert scores.size() == imp_id.size() == labs.size()
            for sco, imp, lab in zip(scores.tolist(), imp_id.tolist(), labs.tolist()):
                entry = impressions.setdefault(imp, {'score': [], 'lab': []})
                entry['score'].append(sco)
                entry['lab'].append(lab)

        predicts, truths = [], []
        for entry in impressions.values():
            # Order each impression's labels by descending score; the
            # prediction is then simply the rank positions 1..n.
            sl_zip = sorted(zip(entry['score'], entry['lab']), key=lambda x: x[0], reverse=True)
            _, sort_labs = zip(*sl_zip)
            predicts.append(list(range(1, len(sort_labs) + 1, 1)))
            truths.append(sort_labs)

        auc_test, mrr_test, ndcg5_test, ndcg10_test = evaluate(predicts, truths)
        end_test = time.time()
        test_spend = (end_test - st_test) / 60

        if rank == 0:
            print("Test: AUC: %0.4f\tMRR: %0.4f\tnDCG@5: %0.4f\tnDCG@10: %0.4f\t[Test-Time: %0.2f mim]" %
                  (auc_test, mrr_test, ndcg5_test, ndcg10_test, test_spend))
    cleanup()
205 |
206 |
def str2bool(value):
    """Parse a command-line boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so
    ``--log False`` would evaluate to True. This converter understands the
    usual spellings and rejects anything else.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))


if __name__ == '__main__':
    t0 = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='../DATA/MIND-Small', type=str, help='Path')
    parser.add_argument('--model_name', default='bert-base-uncased', type=str)

    parser.add_argument('--test_batch_size', default=15, type=int, help='test batch_size')
    parser.add_argument('--max_his', default=50, type=int, help='max number of history')
    parser.add_argument('--max_tokens', default=500, type=int, help='max number of tokens')

    parser.add_argument('--max_his_len', default=450, type=int, help='max number of history')

    parser.add_argument('--ratio', default=0.5, type=float, help='ratio of all datasets')

    parser.add_argument('--device', default='cuda', help='device id')
    parser.add_argument('--world_size', default=2, type=int, help='number of distributed processes')

    parser.add_argument('--model_file', default='', type=str, help='model file')
    parser.add_argument('--log', default=False, type=str2bool, help='whether write log file')

    args = parser.parse_args()

    # Create the log file path. Full-data runs (ratio == 1.0) and few-shot
    # runs use different log directories, as do the demo and small datasets.
    if args.log:
        is_demo = args.data_path == '../DATA/MIND-Demo'
        if args.ratio == 1.0:
            log_dir = './log-Test' if is_demo else './log-Test-Small'
            ratio_tag = ''
        else:
            log_dir = './log-Test-Few' if is_demo else './log-Test-Small-Few'
            ratio_tag = '-ratio' + str(args.ratio)
        os.makedirs(log_dir, exist_ok=True)
        # The last five characters of the timestamp keep file names distinct.
        args.log_file = log_dir + '/' + 'Tbs' + str(args.test_batch_size) + \
            ratio_tag + \
            '-' + str(datetime.now())[-5:] + '.txt'

    # One process per visible GPU; the --world_size option is overridden.
    WORLD_SIZE = torch.cuda.device_count()
    mp.spawn(ddp_main,
             args=(WORLD_SIZE, args),
             nprocs=WORLD_SIZE,
             join=True)
    t1 = time.time()
    run_time = (t1 - t0) / 3600
    print('Running time: %0.4f' % run_time)
--------------------------------------------------------------------------------
/Continuous-Action/predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import pickle
4 | import time
5 | import sys
6 |
7 | from tqdm import tqdm
8 | from datetime import datetime
9 | import torch.cuda
10 | from torch.utils.data import DataLoader
11 |
12 | from transformers import BertTokenizer
13 | from transformers import AdamW
14 |
15 | import torch.distributed as dist
16 | import torch.multiprocessing as mp
17 | from torch.nn.parallel import DistributedDataParallel as DDP
18 | from torch.utils.data.distributed import DistributedSampler
19 |
20 | from model import BERTPrompt4NR
21 | from prepro_data import *
22 | from utils import evaluate
23 |
24 |
def setup(rank, world_size):
    """Join the default NCCL process group as ``rank`` out of ``world_size``.

    The rendezvous endpoint is hard-coded to ``localhost:23342`` and is
    written into the process environment before the group is created.
    """
    rendezvous = {'MASTER_ADDR': 'localhost', 'MASTER_PORT': '23342'}
    os.environ.update(rendezvous)
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
29 |
30 |
def cleanup():
    """Tear down the default process group if one is currently initialised.

    ``destroy_process_group`` raises when no group exists, so the call is
    guarded; this makes the teardown safe to call more than once or after a
    failed ``setup``.
    """
    if dist.is_initialized():
        dist.destroy_process_group()
33 |
34 |
def init_seed(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    # Same call order as a hand-written sequence of five seed calls.
    for apply_seed in seeders:
        apply_seed(seed)
41 |
42 |
class Logger(object):
    """Write-only file-like object that tees text to a stream and a log file.

    Intended as a replacement for ``sys.stdout``: everything written is sent
    to the wrapped ``stream`` and also to ``filename`` (opened in ``'w'``
    mode, so an existing file is truncated).
    """

    def __init__(self, filename, stream=sys.stdout):
        self.terminal = stream
        self.log = open(filename, 'w')

    def write(self, message):
        """Forward ``message`` to both the stream and the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both sinks.

        A no-op here would leave buffered log text unwritten whenever a
        caller (``print(..., flush=True)``, interpreter shutdown hooks)
        explicitly asks for a flush.
        """
        self.terminal.flush()
        self.log.flush()
54 |
55 |
def load_tokenizer(model_name, args):
    """Load the BERT tokenizer and register the extra prompt tokens.

    Adds ``[NSEP]`` first, then ``args.num_conti1`` tokens ``[P1]..``,
    ``args.num_conti2`` tokens ``[Q1]..`` and ``args.num_conti3`` tokens
    ``[M1]..`` in that order. The resulting vocabulary size is stored on
    ``args.vocab_size``.

    Returns:
        ``(tokenizer, conti_tokens1, conti_tokens2, conti_tokens3)``.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)

    conti_tokens1 = ['[P{}]'.format(idx) for idx in range(1, args.num_conti1 + 1)]
    conti_tokens2 = ['[Q{}]'.format(idx) for idx in range(1, args.num_conti2 + 1)]
    conti_tokens3 = ['[M{}]'.format(idx) for idx in range(1, args.num_conti3 + 1)]

    # Registration order fixes the ids of the new tokens: [NSEP] comes first.
    tokenizer.add_tokens(['[NSEP]'])
    tokenizer.add_tokens(conti_tokens1 + conti_tokens2 + conti_tokens3)

    args.vocab_size = len(tokenizer)
    return tokenizer, conti_tokens1, conti_tokens2, conti_tokens3
78 |
79 |
def load_model(model_name, tokenizer, args):
    """Build a ``BERTPrompt4NR`` whose answer words are 'no'/'yes'.

    The two answer words are converted to ids with ``tokenizer.encode``
    (no special tokens) and handed to the model in that order.
    """
    answer_words = ['no', 'yes']
    return BERTPrompt4NR(
        model_name,
        tokenizer.encode(answer_words, add_special_tokens=False),
        args,
    )
86 |
87 |
def eval(model, rank, world_size, data_loader):
    """Score every batch of ``data_loader`` and gather the results from all ranks.

    Must be called by every process of the initialised process group because
    it issues ``all_reduce`` / ``all_gather`` collectives.

    Args:
        model: callable as ``model(batch_enc, batch_attn, batch_labs)`` that
            returns ``(loss, scores)``; ``scores[:, 1]`` is used as the
            ranking score and the argmax over dim 1 as the predicted label.
        rank: device index of this process; tensors are moved with ``.to(rank)``.
        world_size: number of processes taking part in the collectives.
        data_loader: iterable of ``(batch_enc, batch_attn, batch_labs,
            batch_imp)`` batches; ``batch_imp`` is an iterable of integer ids
            (they are packed into an ``IntTensor``).

    Returns:
        ``(scores_list, acc, acc_pos, pos_ratio, impids_list, labels_list)``.
        Each list holds one tensor per rank. ``acc`` is the accuracy over all
        samples, ``acc_pos`` the accuracy over samples labelled 1 and
        ``pos_ratio`` the share of samples labelled 1, each computed from
        counts summed over all ranks.
    """
    model.eval()
    data_loader = tqdm(data_loader)
    val_scores = []
    # Each counter is [hits, total]; kept on the device so it can be all-reduced.
    acc_cnt = torch.zeros(2).to(rank)
    acc_cnt_pos = torch.zeros(2).to(rank)
    imp_ids = []
    labels = []
    for batch_enc, batch_attn, batch_labs, batch_imp in data_loader:
        # Grow the bookkeeping lists in place; rebuilding them with ``+`` on
        # every step is quadratic in the number of samples.
        imp_ids.extend(batch_imp)
        labels.extend(batch_labs.tolist())

        batch_enc = batch_enc.to(rank)
        batch_attn = batch_attn.to(rank)
        batch_labs = batch_labs.to(rank)

        _, scores = model(batch_enc, batch_attn, batch_labs)
        scores = scores.detach()

        val_scores.append(scores[:, 1])

        predict = torch.argmax(scores, dim=1)
        acc_cnt[0] += (predict == batch_labs).sum()
        acc_cnt[1] += predict.size(0)

        positive_idx = torch.where(batch_labs == 1)[0]
        acc_cnt_pos[0] += (predict[positive_idx] == batch_labs[positive_idx]).sum()
        acc_cnt_pos[1] += positive_idx.size(0)

    dist.all_reduce(acc_cnt, op=dist.ReduceOp.SUM)
    dist.all_reduce(acc_cnt_pos, op=dist.ReduceOp.SUM)

    acc = acc_cnt[0] / acc_cnt[1]
    acc_pos = acc_cnt_pos[0] / acc_cnt_pos[1]
    pos_ratio = acc_cnt_pos[1] / acc_cnt[1]

    val_scores = torch.cat(val_scores, dim=0)
    val_impids = torch.IntTensor(imp_ids).to(rank)
    val_labels = torch.IntTensor(labels).to(rank)

    def gather(local):
        # all_gather fills one pre-allocated buffer per rank.
        # NOTE(review): the buffers are shaped like the local tensor, so this
        # presumably relies on every rank holding the same number of samples.
        buffers = [torch.zeros_like(local).to(rank) for _ in range(world_size)]
        dist.all_gather(buffers, local)
        return buffers

    val_scores_list = gather(val_scores)
    val_impids_list = gather(val_impids)
    val_labels_list = gather(val_labels)

    return val_scores_list, acc.item(), acc_pos.item(), pos_ratio.item(), val_impids_list, val_labels_list
140 |
141 |
def ddp_main(rank, world_size, args):
    """Per-process entry point: load a checkpoint and evaluate it on the test set.

    Each spawned process seeds its RNGs, joins the process group, builds
    the tokenizer, model and test loader, loads ``args.model_file`` and
    runs ``eval``. Every rank computes the ranking metrics from the
    gathered results; only rank 0 prints them (and, if ``args.log`` is
    set, mirrors stdout to ``args.log_file``).

    Args:
        rank: index of this process; also used as its CUDA device id.
        world_size: total number of processes.
        args: parsed command-line namespace; ``rank``, ``world_size`` and
            (via ``load_tokenizer``) ``vocab_size`` are written onto it.
    """
    args.rank = rank
    args.world_size = world_size
    init_seed(rank + 1)
    if rank == 0 and args.log:
        sys.stdout = Logger(args.log_file, sys.stdout)
    setup(rank, world_size)

    print('| distributed init rank {}'.format(rank))
    dist.barrier()

    # load tokenizer
    tokenizer, conti_tokens1, conti_tokens2, conti_tokens3 = load_tokenizer(args.model_name, args)
    conti_tokens = [conti_tokens1, conti_tokens2, conti_tokens3]

    # load model
    net = load_model(args.model_name, tokenizer, args)

    # load data
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # --data_path at trusted, locally produced files.
    with open(os.path.join(args.data_path, 'news.txt'), 'rb') as news_file:
        news_dict = pickle.load(news_file)
    test_dataset = MyDataset(args, tokenizer, news_dict, conti_tokens, status='test')

    if rank == 0:
        print(args)
        print('Vocabulary size of tokenizer after adding new tokens : %d' % args.vocab_size)
        print(test_dataset[0]['sentence'])
        print('num test: %d' % len(test_dataset))

    test_sampler = DistributedSampler(test_dataset,
                                      rank=rank,
                                      num_replicas=world_size)
    nw = 2
    test_kwargs = {'batch_size': args.test_batch_size, 'sampler': test_sampler,
                   'shuffle': False, 'pin_memory': False,
                   'num_workers': nw, 'collate_fn': test_dataset.collate_fn}

    test_loader = DataLoader(test_dataset, **test_kwargs)

    net = net.to(rank)
    net = DDP(net, device_ids=[rank])

    dist.barrier()

    # Remap tensors saved from cuda:0 onto this rank's device.
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    net.module.load_state_dict(torch.load(args.model_file, map_location=map_location))

    with torch.no_grad():
        st_test = time.time()
        test_scores, acc_test, acc_pos_test, pos_ratio_test, test_impids, test_labels = \
            eval(net, rank, world_size, test_loader)

        # Regroup the per-rank flat results by impression id:
        # {imp_id: {'score': [...], 'lab': [...]}}
        impressions = {}
        for i in range(world_size):
            scores, imp_id, labs = test_scores[i], test_impids[i], test_labels[i]
            assert scores.size() == imp_id.size() == labs.size()
            scores = scores.cpu().numpy().tolist()
            imp_id = imp_id.cpu().numpy().tolist()
            labs = labs.cpu().numpy().tolist()
            for sco, imp, lab in zip(scores, imp_id, labs):
                entry = impressions.setdefault(imp, {'score': [], 'lab': []})
                entry['score'].append(sco)
                entry['lab'].append(lab)

        # For each impression: labels ordered by descending score, paired
        # with the rank positions 1..k.
        predicts, truths = [], []
        for entry in impressions.values():
            sl_zip = sorted(zip(entry['score'], entry['lab']), key=lambda x: x[0], reverse=True)
            _, sort_labs = zip(*sl_zip)
            predicts.append(list(range(1, len(sort_labs) + 1, 1)))
            truths.append(sort_labs)

        auc_test, mrr_test, ndcg5_test, ndcg10_test = evaluate(predicts, truths)
        end_test = time.time()
        test_spend = (end_test - st_test) / 60

        if rank == 0:
            print("Test: AUC: %0.4f\tMRR: %0.4f\tnDCG@5: %0.4f\tnDCG@10: %0.4f\t[Test-Time: %0.2f mim]" %
                  (auc_test, mrr_test, ndcg5_test, ndcg10_test, test_spend))
    cleanup()
225 |
226 |
def str2bool(value):
    """Parse a command-line boolean such as ``--log False``.

    ``argparse`` with ``type=bool`` turns every non-empty string
    (including "False") into ``True``; this converter understands the
    usual spellings instead.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised
            boolean spelling.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays falsy, as it was with ``type=bool``.
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('boolean value expected, got %r' % (value,))


if __name__ == '__main__':
    t0 = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='../DATA/MIND-Small', type=str, help='Path')
    parser.add_argument('--model_name', default='bert-base-uncased', type=str)

    parser.add_argument('--test_batch_size', default=15, type=int, help='test batch_size')
    parser.add_argument('--max_his', default=50, type=int, help='max number of history')
    parser.add_argument('--max_tokens', default=500, type=int, help='max number of tokens')

    parser.add_argument('--max_his_len', default=450, type=int, help='max number of history')

    parser.add_argument('--num_conti1', default=3, type=int, help='number of continuous tokens')
    parser.add_argument('--num_conti2', default=3, type=int, help='number of continuous tokens')
    parser.add_argument('--num_conti3', default=3, type=int, help='number of continuous tokens')

    parser.add_argument('--device', default='cuda', help='device id')
    # Parsed for compatibility; the number of processes actually spawned is
    # torch.cuda.device_count() (see below).
    parser.add_argument('--world_size', default=2, type=int, help='number of distributed processes')

    parser.add_argument('--model_file', default='', type=str, help='model file')
    parser.add_argument('--log', default=True, type=str2bool, help='whether write log file')

    args = parser.parse_args()

    if args.log:
        # The demo dataset logs to ./log-Test, everything else to ./log-Test-Small.
        log_dir = './log-Test' if args.data_path == '../DATA/MIND-Demo' else './log-Test-Small'
        os.makedirs(log_dir, exist_ok=True)
        # File name: batch size, prompt-token counts, and the last five
        # characters of the current timestamp as a cheap uniquifier.
        args.log_file = log_dir + '/' + 'Tbs' + str(args.test_batch_size) + \
            '-n' + str(args.num_conti1) + str(args.num_conti2) + str(args.num_conti3) + \
            '-' + str(datetime.now())[-5:] + '.txt'

    WORLD_SIZE = torch.cuda.device_count()
    mp.spawn(ddp_main,
             args=(WORLD_SIZE, args),
             nprocs=WORLD_SIZE,
             join=True)
    t1 = time.time()
    run_time = (t1 - t0) / 3600
    print('Running time: %0.4f' % run_time)
--------------------------------------------------------------------------------