├── README.md
├── data
    ├── __pycache__
    │   └── data_utils.cpython-37.pyc
    ├── __init__.py
    ├── process.py
    ├── semi-supervise.py
    ├── data_utils.py
    ├── back_translate.py
    └── embed_replace.py
├── .gitignore
├── __init__.py
├── model
    ├── __init__.py
    ├── evaluate.py
    ├── vocab.py
    ├── rouge_eval.py
    ├── config.py
    ├── dataset.py
    ├── train.py
    ├── utils.py
    ├── predict.py
    └── model.py
├── files
    └── HIT_stop_words.txt
├── requirements.txt
└── 运行结果.txt


/README.md:
--------------------------------------------------------------------------------
1 | # marketing_text_generation
2 | 


--------------------------------------------------------------------------------
/data/__pycache__/data_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wasim37/marketing_text_generation/HEAD/data/__pycache__/data_utils.cpython-37.pyc


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.json
 2 | *.whl
 3 | *.iter5
 4 | *.zip
 5 | *.cpython-36.pyc
 6 | *.pdf
 7 | *.ipynb_checkpoints/
 8 | *.DS_Store
 9 | *.txt
10 | *.log
11 | runs/


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | @Author: your name
 3 | @Date: 2020-04-08 16:05:19
 4 | @LastEditTime: 2020-04-08 17:24:47
 5 | @LastEditors: Please set LastEditors
 6 | @Description: In User Settings Edit
 7 | @FilePath: /textClassification/src/data/__init__.py
 8 | '''
import sys
import os
# Append the directory that contains this file to the module search path, so
# modules located in that directory can be imported by their bare names.
curPath = os.path.abspath(os.path.dirname(__file__))
sys.path.append(curPath)


--------------------------------------------------------------------------------
/data/__init__.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | @Author: your name
 3 | @Date: 2020-04-08 16:05:19
 4 | @LastEditTime: 2020-04-08 17:24:47
 5 | @LastEditors: Please set LastEditors
 6 | @Description: In User Settings Edit
 7 | @FilePath: /textClassification/src/data/__init__.py
 8 | '''
import sys
import os
# curPath is this package's directory and rootPath is its parent. The entry
# appended to sys.path is the parent of rootPath, i.e. two levels above this
# package directory.
# NOTE(review): the header above names a different project path, so this looks
# like copied boilerplate -- confirm that two levels up is the intended root.
curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(os.path.split(rootPath)[0])


--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | @Author: your name
 3 | @Date: 2020-04-08 16:05:19
 4 | @LastEditTime: 2020-04-08 17:24:47
 5 | @LastEditors: Please set LastEditors
 6 | @Description: In User Settings Edit
 7 | @FilePath: /textClassification/src/data/__init__.py
 8 | '''
import sys
import os
# curPath is this package's directory and rootPath is its parent. The entry
# appended to sys.path is the parent of rootPath, i.e. two levels above this
# package directory.
# NOTE(review): the header above names a different project path, so this looks
# like copied boilerplate -- confirm that two levels up is the intended root.
curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(os.path.split(rootPath)[0])


--------------------------------------------------------------------------------
/data/process.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | '''
 4 | @Author: author
 5 | @Date: 2020-07-13 20:16:37
 6 | @LastEditTime: 2020-07-18 17:28:41
 7 | @LastEditors: Please set LastEditors
 8 | @Description: Process a raw dataset into a sample file.
 9 | @FilePath: /project_2/data/process.py
10 | '''
11 | 
import sys
import os
import pathlib
import json

import jieba

# Make this directory importable *before* importing the sibling helper module.
# sys.path entries must be strings: the previous nested
# ``sys.path.append(sys.path.append(abs_path))`` added a Path object and then
# ``None`` (the return value of the inner append), neither of which is usable.
abs_path = pathlib.Path(__file__).parent.absolute()
sys.path.append(str(abs_path))
curPath = os.path.abspath(os.path.dirname(__file__)) + '/'

from data_utils import write_samples, partition  # noqa: E402


# A dict is used as an insertion-ordered set: duplicate samples are dropped,
# but the order (and therefore the train/dev/test split) stays reproducible
# between runs, which iterating a plain set does not guarantee.
unique_samples = {}
# Read json file.
json_path = os.path.join(abs_path, '../files/服饰_50k.json')
with open(json_path, 'r', encoding='utf8') as file:
    jsf = json.load(file)

for jsobj in jsf.values():
    title = jsobj['title'] + ' '  # Get title.
    # Merge attributes as "key value key value ...".
    kb_merged = ''.join(
        key + ' ' + val + ' ' for key, val in dict(jsobj['kb']).items())
    ocr = ' '.join(jieba.cut(jsobj['ocr']))  # Get OCR text, segmented.
    text = title + ocr + kb_merged  # Merge them into the source side.
    reference = ' '.join(jieba.cut(jsobj['reference']))
    # Separate source and reference with the <sep> marker.
    unique_samples[text + '<sep>' + reference] = None

samples = list(unique_samples)
# write_samples() resolves its path argument against the data/ directory
# itself, so a path relative to that directory is passed here.
write_path = '../files/samples.txt'
write_samples(samples, write_path)
partition(samples)
48 | 


--------------------------------------------------------------------------------
/data/semi-supervise.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: UTF-8 -*-
 3 | """
 4 | @Author: frank
 5 | @Date: 2020-08-07 16:43:30
 6 | @LastEditTime: 
 7 | @LastEditors: 
 8 | @Description: 
 9 | @File: semi-supervised.py
10 | """
import pathlib
import sys

# Resolve this file's directory to an absolute path so that the sibling
# ``model`` package directory can be located regardless of the directory the
# script is launched from (the former '../model' entry was interpreted relative
# to the current working directory).
abs_path = pathlib.Path(__file__).resolve().parent
sys.path.append(str(abs_path.parent / 'model'))

from predict import Predict  # noqa: E402
from data_utils import write_samples  # noqa: E402
19 | 
20 | 
def semi_supervised(samples_path, write_path, beam_search):
    """Build new samples by predicting a source text from each reference.

    Every line of ``samples_path`` is split on ``<sep>``; the reference half
    is fed to the predictor and the prediction is paired with that reference
    as ``"<prediction> <sep> <reference>"``. Results are appended to
    ``write_path`` in batches of 100, and any remaining partial batch is
    written once the input is exhausted.

    Args:
        samples_path (str): The path of the ``source<sep>reference`` file.
        write_path (str): The path of the new samples file, handed to
            ``write_samples`` in append mode.
        beam_search (bool): Forwarded to ``Predict.predict``.
    """
    pred = Predict()
    print('vocab_size:', len(pred.vocab))
    semi = []

    # The sample files are written as UTF-8, so read them back explicitly as
    # UTF-8 instead of relying on the platform default encoding.
    with open(samples_path, 'r', encoding='utf8') as f:
        for count, picked in enumerate(f, start=1):
            _source, ref = picked.strip().split('<sep>')
            prediction = pred.predict(ref.split(), beam_search=beam_search)
            # Join the prediction made from ref with ref itself to form a new
            # sample.
            semi.append(prediction + ' <sep> ' + ref)

            if count % 100 == 0:
                print(count)
                write_samples(semi, write_path, 'a')
                semi = []

    # Flush the tail: without this, up to 99 trailing samples were lost
    # whenever the input size was not a multiple of 100.
    if semi:
        write_samples(semi, write_path, 'a')
49 | 
50 | 
if __name__ == '__main__':
    # samples_path is opened as given, so it is relative to the current
    # working directory; the write path is handed to write_samples().
    samples_path = 'output/train.txt'
    write_path_greedy = 'output/semi_greedy.txt'
    write_path_beam = 'output/semi_beam.txt'
    # Choose the decoding strategy; the output file name follows the choice.
    beam_search = True
    write_path = write_path_beam if beam_search else write_path_greedy
    semi_supervised(samples_path, write_path, beam_search)
58 | 


--------------------------------------------------------------------------------
/model/evaluate.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | '''
 4 | @Author: author
 5 | @Date: 2020-07-13 11:00:51
 6 | LastEditTime: 2020-10-19 18:45:54
 7 | LastEditors: Please set LastEditors
 8 | @Description: Evaluate the loss in the dev set.
 9 | @FilePath: /project_2/model/evaluate.py
10 | '''
11 | 
12 | import os
13 | import sys
14 | import pathlib
15 | import torch
16 | from tqdm import tqdm
17 | import numpy as np
18 | from torch.utils.data import DataLoader
19 | 
20 | from dataset import collate_fn
21 | import config
22 | 
# sys.path entries must be strings. The previous nested
# ``sys.path.append(sys.path.append(abs_path))`` added a Path object and then
# ``None`` (the return value of the inner call).
abs_path = pathlib.Path(__file__).parent.absolute()
sys.path.append(str(abs_path))
curPath = os.path.abspath(os.path.dirname(__file__)) + '/'
26 | 
27 | 
def evaluate(model, val_data, epoch):
    """Compute the mean per-batch loss of ``model`` over ``val_data``.

    Batches are drawn with ``config.batch_size`` (shuffled, incomplete last
    batch dropped) and the model is called with teacher forcing enabled while
    gradient tracking is disabled.

    Args:
        model (torch.nn.Module): The model to evaluate.
        val_data (dataset.PairDataset): The evaluation data set.
        epoch (int): The epoch number (not used by this function).

    Returns:
        numpy ndarray: The average loss of the dev set.
    """
    print('validating')

    device = config.DEVICE
    batch_losses = []
    with torch.no_grad():
        loader = DataLoader(dataset=val_data,
                            batch_size=config.batch_size,
                            shuffle=True,
                            pin_memory=True,
                            drop_last=True,
                            collate_fn=collate_fn)
        total_batches = len(loader)
        for step, sample in enumerate(tqdm(loader)):
            x, y, x_len, y_len, oov, len_oovs = sample
            if config.is_cuda:
                # Only the tensors consumed by the forward pass are moved.
                x = x.to(device)
                y = y.to(device)
                x_len = x_len.to(device)
                len_oovs = len_oovs.to(device)
            # Forward pass returns the loss for this batch.
            batch_loss = model(x,
                               x_len,
                               y,
                               len_oovs,
                               batch=step,
                               num_batches=total_batches,
                               teacher_forcing=True)
            batch_losses.append(batch_loss.item())
    return np.mean(batch_losses)
69 | 


--------------------------------------------------------------------------------
/model/vocab.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | '''
 4 | @Author: author
 5 | @Date: 2020-07-13 14:18:13
 6 | LastEditTime: 2020-10-17 21:18:35
 7 | LastEditors: Please set LastEditors
 8 | @Description: Define the vocabulary object.
 9 | @FilePath: /project_2/model/vocab.py
10 | '''
11 | 
12 | from collections import Counter
13 | import numpy as np
14 | 
15 | 
class Vocab(object):
    """Mapping between tokens and integer indices with four reserved tokens.

    Indices 0-3 are taken by ``<PAD>``, ``<SOS>``, ``<EOS>`` and ``<UNK>``.
    The reserved tokens live only in ``index2word``; they are not entered in
    ``word2index``.
    """
    PAD = 0
    SOS = 1
    EOS = 2
    UNK = 3

    def __init__(self):
        self.word2index = {}
        self.word2count = Counter()
        self.reserved = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']
        self.index2word = self.reserved[:]
        self.embeddings = None

    def add_words(self, words):
        """Add tokens to the vocab, updating word2index, index2word and
        word2count.

        Args:
            words (list): The list of tokens to be added.
        """
        for word in words:
            if word not in self.word2index:
                self.word2index[word] = len(self.index2word)
                self.index2word.append(word)
        self.word2count.update(words)

    def load_embeddings(self, file_path: str, dtype=np.float32) -> int:
        """Load vectors for known words from a whitespace-separated text file.

        Each line is expected to hold a word followed by its vector
        components. The embedding matrix is created when the first known word
        is met: rows are drawn from a normal distribution and the PAD row is
        zeroed; rows of words found in the file are then overwritten.

        Args:
            file_path (str): Path of the embedding file.
            dtype: Numpy dtype of the embedding matrix.

        Returns:
            int: Number of lines whose word was found in the vocabulary.
        """
        num_embeddings = 0
        vocab_size = len(self)
        with open(file_path, 'rb') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank lines previously raised IndexError on fields[0].
                    continue
                word = fields[0].decode('utf-8')
                idx = self.word2index.get(word)
                if idx is None:
                    continue
                vec = np.array(fields[1:], dtype=dtype)
                if self.embeddings is None:
                    n_dims = len(vec)
                    self.embeddings = np.random.normal(
                        np.zeros((vocab_size, n_dims))).astype(dtype)
                    self.embeddings[self.PAD] = np.zeros(n_dims)
                self.embeddings[idx] = vec
                num_embeddings += 1
        return num_embeddings

    def __getitem__(self, item):
        """Return the word for an integer index, or the index for a word.

        Unknown words map to ``UNK``. Numpy integers are treated as indices
        too; previously they fell through to the word lookup and silently
        returned ``UNK``.
        """
        is_index = isinstance(item, (int, np.integer))
        if is_index and not isinstance(item, bool):
            return self.index2word[item]
        return self.word2index.get(item, self.UNK)

    def __len__(self):
        return len(self.index2word)

    def size(self):
        """Returns the total size of the vocabulary"""
        return len(self.index2word)
72 | 


--------------------------------------------------------------------------------
/data/data_utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | '''
 4 | @Author: author
 5 | @Date: 2020-07-13 11:00:51
 6 | LastEditTime: 2021-08-27 17:20:57
 7 | LastEditors: Please set LastEditors
 8 | @Description: Helper functions or classes used in data processing.
 9 | @FilePath: /project_2/data/data_utils.py
10 | '''
11 | 
12 | import os
13 | import pathlib
14 | import sys
15 | 
# sys.path entries must be strings. The previous nested
# ``sys.path.append(sys.path.append(abs_path))`` added a Path object and then
# ``None`` (the return value of the inner call).
abs_path = pathlib.Path(__file__).parent.absolute()
sys.path.append(str(abs_path))
curPath = os.path.abspath(os.path.dirname(__file__)) + "/"
19 | 
20 | 
def read_samples(filename):
    """Load a UTF-8 text file as a list of whitespace-stripped lines.

    Args:
        filename (str): The path of the txt file.

    Returns:
        list: One entry per line of the file, in file order.
    """
    with open(filename, 'r', encoding='utf8') as handle:
        return [row.strip() for row in handle]
35 | 
36 | 
def write_samples(samples, file_path, opt='w'):
    """Write the samples into a file, one sample per line.

    A relative ``file_path`` is resolved against the directory that contains
    this module. An absolute ``file_path`` is used as is; plain string
    concatenation used to glue the module directory in front of it, producing
    a path that does not exist.

    Args:
        samples (list): The list of samples to write.
        file_path (str): The path of file to write.
        opt (str, optional): The "mode" parameter in open(). Defaults to 'w'.
    """
    # os.path.join discards curPath when file_path is already absolute.
    target = os.path.join(curPath, file_path)
    with open(target, opt, encoding='utf8') as file:
        file.writelines(line + '\n' for line in samples)
49 | 
50 | 
def partition(samples, test_size=1000, dev_size=5000):
    """Partition a whole sample set into training set, dev set and test set.

    The first ``test_size`` samples go to the test set, the next ``dev_size``
    to the dev set and everything after that to the training set. The three
    sets are written to ``../files/{train,dev,test}.txt``.

    Args:
        samples (Iterable): The iterable that holds the whole sample set.
        test_size (int, optional): Number of test samples. Defaults to 1000.
        dev_size (int, optional): Number of dev samples. Defaults to 5000.
    """
    train, dev, test = [], [], []
    dev_end = test_size + dev_size
    for count, sample in enumerate(samples, start=1):
        if count % 1000 == 0:
            print(count)  # Progress indicator.
        if count <= test_size:
            test.append(sample)
        elif count <= dev_end:
            dev.append(sample)
        else:
            train.append(sample)
    print('train: ', len(train))

    # write_samples() resolves paths against this module's directory, so the
    # targets are given relative to it. Passing an already-absolute path made
    # write_samples prepend the directory a second time.
    write_samples(train, '../files/train.txt')
    write_samples(dev, '../files/dev.txt')
    write_samples(test, '../files/test.txt')
74 | 
75 | 
def isChinese(word):
    """Tell whether a word contains at least one Chinese character.

    A character counts as Chinese when it lies in the range U+4E00..U+9FFF.

    Args:
        word (str): The word to be distinguished.

    Returns:
        bool: True if any character of the word is in that range.
    """
    return any('\u4e00' <= char <= '\u9fff' for char in word)
89 | 


--------------------------------------------------------------------------------
/model/rouge_eval.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | '''
 4 | @Author: author
 5 | @Date: 2020-07-13 17:56:13
 6 | LastEditTime: 2021-09-03 12:47:46
 7 | LastEditors: Please set LastEditors
 8 | @Description: Evaluate the model with ROUGE score.
 9 | @FilePath: /project_2/model/rouge_eval.py
10 | '''
11 | 
12 | import os
13 | import sys
14 | import pathlib
15 | 
16 | from rouge import Rouge
17 | import jieba
18 | 
19 | from predict import Predict
20 | from utils import timer
21 | import config
22 | 
# sys.path entries must be strings. The previous nested
# ``sys.path.append(sys.path.append(abs_path))`` added a Path object and then
# ``None`` (the return value of the inner call).
abs_path = pathlib.Path(__file__).parent.absolute()
sys.path.append(str(abs_path))
curPath = os.path.abspath(os.path.dirname(__file__)) + '/'
26 | 
27 | 
class RougeEval():
    """Collects sources, references and hypotheses and scores them with ROUGE.

    The sample file is read on construction; hypotheses are produced later by
    ``build_hypos``.
    """

    def __init__(self, path):
        self.path = path
        self.scores = None
        self.rouge = Rouge()
        self.sources = []
        self.hypos = []
        self.refs = []
        self.process()

    def process(self):
        """Read ``source<sep>reference`` lines and fill sources and refs."""
        print('Reading from ', self.path)
        with open(self.path, 'r', encoding='utf-8') as test:
            for raw in test:
                source, ref = raw.strip().split('<sep>')
                # Segment and re-join the reference, then map the Chinese full
                # stop to an ASCII period.
                rejoined = ''.join(jieba.cut(ref))
                self.sources.append(source)
                self.refs.append(rejoined.replace('。', '.'))
        print(f'Test set contains {len(self.sources)} samples.')

    @timer('building hypotheses')
    def build_hypos(self, predict):
        """Generate hypos for the dataset.

        Args:
            predict (predict.Predict()): The predictor instance.
        """
        print('Building hypotheses.')
        for count, source in enumerate(self.sources, start=1):
            if count % 100 == 0:
                print(count)  # Progress indicator.
            hypothesis = predict.predict(source.split())
            self.hypos.append(hypothesis)

    def get_average(self):
        """Return the averaged ROUGE scores of all hypotheses vs. refs."""
        assert len(self.hypos) > 0, 'Build hypotheses first!'
        print('Calculating average rouge scores.')
        return self.rouge.get_scores(self.hypos, self.refs, avg=True)

    def one_sample(self, hypo, ref):
        """Return the ROUGE scores of a single hypothesis/reference pair."""
        scores = self.rouge.get_scores(hypo, ref)
        return scores[0]
70 | 
71 | 
# Script body: these statements run at module level, i.e. whenever this file
# is executed or imported. They read the test set named by
# config.test_data_path, generate a hypothesis for every source and compute
# the averaged ROUGE scores.
rouge_eval = RougeEval(config.test_data_path)
predict = Predict()
rouge_eval.build_hypos(predict)
result = rouge_eval.get_average()
# Introduction to ROUGE: https://blog.csdn.net/mch2869253130/article/details/89810974
print('rouge1: ', result['rouge-1'])
print('rouge2: ', result['rouge-2'])
print('rougeL: ', result['rouge-l'])
# Append every metric, scaled by 100, to the result file. The path is relative
# to the current working directory.
with open('../files/rouge_result.txt', 'a') as file:
    for r, metrics in result.items():
        file.write(r + '\n')
        for metric, value in metrics.items():
            file.write(metric + ': ' + str(value * 100))
            file.write('\n')
86 | 


--------------------------------------------------------------------------------
/model/config.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | '''
  4 | @Author: author
  5 | @Date: 2020-07-13 11:00:51
  6 | LastEditTime: 2020-10-19 16:16:52
  7 | LastEditors: Please set LastEditors
  8 | @Description: Define configuration parameters.
  9 | @FilePath: /project_2/model/config.py
 10 | '''
 11 | 
 12 | from typing import Optional
 13 | 
 14 | import torch
 15 | 
 16 | # General
 17 | hidden_size: int = 512
 18 | dec_hidden_size: Optional[int] = 512
 19 | embed_size: int = 512
 20 | 
 21 | # Data
 22 | max_vocab_size = 20000
 23 | # embed_file: Optional[str] = '../files/sgns.target.word-ngram.1-2.dynwin5.thr10.neg5.dim300.iter5'  # use pre-trained embeddings
 24 | embed_file: Optional[str] = None  # use pre-trained embeddings
 25 | source = 'train'    # use value: train or  big_samples
 26 | data_path: str = '../files/{}.txt'.format(source)
 27 | val_data_path: Optional[str] = '../files/dev.txt'
 28 | test_data_path: Optional[str] = '../files/test.txt'
 29 | stop_word_file = '../files/HIT_stop_words.txt'
 30 | max_src_len: int = 300  # exclusive of special tokens such as EOS
 31 | max_tgt_len: int = 100  # exclusive of special tokens such as EOS
 32 | truncate_src: bool = True
 33 | truncate_tgt: bool = True
 34 | min_dec_steps: int = 30
 35 | max_dec_steps: int = 100
 36 | enc_rnn_dropout: float = 0.5
 37 | enc_attn: bool = True
 38 | dec_attn: bool = True
 39 | dec_in_dropout = 0
 40 | dec_rnn_dropout = 0
 41 | dec_out_dropout = 0
 42 | 
 43 | 
 44 | # Training
 45 | trunc_norm_init_std = 1e-4
 46 | eps = 1e-31
 47 | learning_rate = 0.001
 48 | lr_decay = 0.0
 49 | initial_accumulator_value = 0.1
 50 | epochs = 8
 51 | batch_size = 32
 52 | max_grad_norm = 2.0
 53 | is_cuda = True
 54 | DEVICE = torch.device("cuda" if is_cuda else "cpu")
 55 | LAMBDA = 1
 56 | 
 57 | # 通过以下配置可以训练以下模型：
 58 | # 1. PGN，令 pointer = True（默认 source = 'train'） 即可。
 59 | # 2. PGN (with coverage)，令 pointer = True 以及 coverage = True
 60 | # 3. PGN (fine-tuned with coverage)，令 pointer = True ， coverage = True 以及 fine_tune = True
 61 | # 4. PGN (with Weight tying)，令 pointer = True 以及 weight_tying = True
 62 | # 5. PGN (with Scheduled sampling)，令 pointer = True ， scheduled_sampling= True
 63 | # 6. PGN (training with big_samples.txt)，令 pointer = True 以及 source = 'big_samples'
 64 | pointer = True
 65 | coverage = False
 66 | fine_tune = False
 67 | scheduled_sampling = False
 68 | weight_tying = False
 69 | 
 70 | if pointer:
 71 |     if coverage:
 72 |         if fine_tune:
 73 |             model_name = 'ft_pgn'
 74 |         else:
 75 |             model_name = 'cov_pgn'
 76 |     elif scheduled_sampling:
 77 |         model_name = 'ss_pgn'
 78 |     elif weight_tying:
 79 |         model_name = 'wt_pgn'
 80 |     else:
 81 |         if source == 'big_samples':
 82 |             model_name = 'pgn_big_samples'
 83 |         else:
 84 |             model_name = 'pgn'
 85 | else:
 86 |     model_name = 'baseline'
 87 | 
 88 | encoder_save_name = '../saved_model/' + model_name + '/encoder.pt'
 89 | decoder_save_name = '../saved_model/' + model_name + '/decoder.pt'
 90 | attention_save_name = '../saved_model/' + model_name + '/attention.pt'
 91 | reduce_state_save_name = '../saved_model/' + model_name + '/reduce_state.pt'
 92 | losses_path = '../saved_model/' + model_name + '/val_losses.pkl'
 93 | log_path = '../runs/' + model_name
 94 | 
 95 | 
 96 | # Beam search
 97 | beam_size: int = 3
 98 | alpha = 0.2
 99 | beta = 0.2
100 | gamma = 0.6
101 | 


--------------------------------------------------------------------------------
/data/back_translate.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding: UTF-8 -*-
  3 | """
  4 | @Author: frank
  5 | @Date: 2020-08-08 22:40:43
  6 | @File: back_translate.py
  7 | """
  8 | # !pip3 install jieba==0.36.2
  9 | import jieba
 10 | import http.client
 11 | import hashlib
 12 | import urllib
 13 | import random
 14 | import json
 15 | import time
 16 | from data_utils import write_samples
 17 | 
 18 | import os
 19 | 
 20 | curPath = os.path.abspath(os.path.dirname(__file__)) + "/"
 21 | 
 22 | 
 23 | def translate(q, source, target):
 24 |     """translate q from source language to target language
 25 |     Please refer to the official documentation   https://api.fanyi.baidu.com/ 通用翻译API
 26 |     There are demo on the website ,  register on the web site ,and get AppID, key, python3 demo.
 27 | 
 28 |     Args:
 29 |         q (str): sentence
 30 |         source(str): The language code
 31 |         target(str): The language code
 32 |     Returns:
 33 |         (str): result of translation
 34 |     """
 35 |     ###########################################
 36 |     #          TODO: module 2 task 1          #
 37 |     ###########################################
 38 |     appid = '20201019000593790'  # Fill in your AppID
 39 |     secretKey = 'meLMnlr5lBpDjpL5kLIr'  # Fill in your key
 40 | 
 41 |     httpClient = None
 42 |     myurl = '/api/trans/vip/translate'
 43 | 
 44 |     fromLang = source  # 原文语种
 45 |     toLang = target  # 译文语种
 46 |     salt = random.randint(32768, 65536)
 47 |     sign = appid + q + str(salt) + secretKey
 48 |     sign = hashlib.md5(sign.encode()).hexdigest()
 49 |     myurl = myurl + '?appid=' + appid + '&q=' + urllib.parse.quote(
 50 |         q) + '&from=' + fromLang + '&to=' + toLang + '&salt=' + str(
 51 |             salt) + '&sign=' + sign
 52 | 
 53 |     try:
 54 |         httpClient = http.client.HTTPConnection('api.fanyi.baidu.com')
 55 |         httpClient.request('GET', myurl)
 56 |         response = httpClient.getresponse()
 57 |         result_all = response.read().decode("utf-8")
 58 |         result = json.loads(result_all)
 59 |         return result
 60 |     except Exception as e:
 61 |         print(e)
 62 |     finally:
 63 |         if httpClient:
 64 |             httpClient.close()
 65 | 
 66 | 
 67 | def back_translate(q):
 68 |     """back_translate
 69 | 
 70 |     Args:
 71 |         q (str): sentence
 72 | 
 73 |     Returns:
 74 |         (str): result of back translation
 75 |     """
 76 |     ###########################################
 77 |     #          TODO: module 2 task 2          #
 78 |     ###########################################
 79 |     en = translate(q, "zh", "en")['trans_result'][0]['dst']
 80 |     time.sleep(2)
 81 |     target = translate(en, "en", "zh")['trans_result'][0]['dst']
 82 |     time.sleep(2)
 83 |     return target
 84 | 
 85 | 
 86 | def translate_continue(sample_path, translate_path):
 87 |     """translate  original file to new file
 88 | 
 89 |     Args:
 90 |         sample_path (str): original file path
 91 |         translate_path (str): target file path
 92 |     Returns:
 93 |         (str): result of back translation
 94 |     """
 95 |     ###########################################
 96 |     #          TODO: module 2 task 3          #
 97 |     ###########################################
 98 |     if os.path.exists(translate_path):
 99 |         with open(translate_path, 'r+', encoding='urf-8') as file:
100 |             exit_len = len(list(file))
101 |     else:
102 |         # with open(translate_path, 'w', encoding='urf-8') as file:
103 |         exit_len = 0
104 | 
105 |     translated = []
106 |     count = 0
107 |     with open(curPath + sample_path, 'r', encoding='utf-8') as file:
108 |         for line in file:
109 |             count += 1
110 |             print(count)
111 |             if count <= exit_len or count == 21585:
112 |                 continue
113 |             source, ref = tuple(line.strip().split('<sep>'))
114 |             source = back_translate(source.strip())
115 |             ref = back_translate(ref.strip())
116 |             source = ' '.join(list(jieba.cut(source)))
117 |             ref = ' '.join(list(jieba.cut(ref)))
118 |             translated.append(source + ' <sep> ' + ref)
119 |             if count % 10 == 0:
120 |                 print(count)
121 |                 write_samples(translated, translate_path, 'a')
122 |                 translated = []
123 |                 if count == 1000:
124 |                     break
125 | 
126 | 
if __name__ == '__main__':
    # sample_path is opened relative to this file's directory (curPath);
    # translate_path is the output path passed on to write_samples().
    sample_path = 'output/train.txt'
    translate_path = 'output/translated.txt'
    translate_continue(sample_path, translate_path)
131 | 


--------------------------------------------------------------------------------
/data/embed_replace.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | '''
  4 | @Author: author
  5 | @Date: 2020-07-13 20:16:37
  6 | LastEditTime: 2021-08-27 17:21:59
  7 | @FilePath: /project_2/data/embed_replace.py
  8 | '''
  9 | 
 10 | from gensim.models import KeyedVectors, TfidfModel
 11 | from gensim.corpora import Dictionary
 12 | from data_utils import read_samples, isChinese, write_samples
 13 | import os
 14 | from gensim import matutils
 15 | from itertools import islice
 16 | import numpy as np
 17 | 
 18 | 
 19 | class EmbedReplace():
 20 |     def __init__(self, sample_path, wv_path):
 21 |         print("read sample file ...")
 22 |         self.samples = read_samples(sample_path)
 23 |         self.refs = [sample.split('<sep>')[1].split() for sample in self.samples]
 24 |         print("load word_vectors file ...")
 25 |         self.wv = KeyedVectors.load_word2vec_format(wv_path, binary=False)
 26 | 
 27 |         if os.path.exists('saved/tfidf.model'):
 28 |             print("load tfidf model..")
 29 |             self.tfidf_model = TfidfModel.load('saved/tfidf.model')
 30 |             self.dct = Dictionary.load('saved/tfidf.dict')
 31 |             self.corpus = [self.dct.doc2bow(doc) for doc in self.refs]
 32 |         else:
 33 |             # 训练tfidf，后续单词替换时，用来排除核心词汇
 34 |             print("train tfidf model..")
 35 |             self.dct = Dictionary(self.refs)
 36 |             self.corpus = [self.dct.doc2bow(doc) for doc in self.refs]
 37 |             self.tfidf_model = TfidfModel(self.corpus)
 38 |             self.dct.save('saved/tfidf.dict')
 39 |             self.tfidf_model.save('saved/tfidf.model')
 40 |             self.vocab_size = len(self.dct.token2id)
 41 | 
 42 |     def vectorize(self, docs, vocab_size):
 43 |         '''
 44 |         docs :: iterable of iterable of (int, number)
 45 |         '''
 46 |         return matutils.corpus2dense(docs, vocab_size)
 47 | 
 48 |     def extract_keywords(self, dct, tfidf, threshold=0.2, topk=5):
 49 | 
 50 |         """find high TFIDF socore keywords
 51 |         根据TFIDF确认需要排除的核心词汇.
 52 |         注意：为了防止将体现关键卖点的词给替换掉，导致核心语义丢失，
 53 |         所以通过 tfidf 权重对词表的词进行排序，然后替换排序靠后的词
 54 | 
 55 |         Args:
 56 |             dct (Dictionary): gensim.corpora Dictionary  a reference Dictionary
 57 |             tfidf (list of tfidf):  model[doc]  [(int, number)]
 58 |             threshold (float) : high TFIDF socore must be greater than the threshold
 59 |             topk(int): num of highest TFIDF socore
 60 |         Returns:
 61 |             (list): A list of keywords
 62 |         """
 63 | 
 64 |         ###########################################
 65 |         #          TODO: module 1 task 1          #
 66 |         ###########################################
 67 |         # 降序。sort 是应用在 list 上的方法，sorted 可以对所有可迭代的对象进行排序操作
 68 |         tfidf = sorted(tfidf, key=lambda x: x[1], reverse=True)
 69 |         # islice()获取迭代器的切片，消耗迭代器. islice(iterable, [start, ] stop [, step])
 70 |         return list(islice([dct[w] for w, score in tfidf if score > threshold], topk))
 71 | 
 72 |     def replace(self, token_list, doc):
 73 |         """replace token by another token which is similar in wordvector
 74 |         在 embedding 的词向量空间中寻找语义最接近的词进行替换
 75 | 
 76 |         Args:
 77 |             token_list (list): reference token list
 78 |             doc (list): A reference represented by a word bag model
 79 |         Returns:
 80 |             (str):  new reference str
 81 |         """
 82 | 
 83 |         ###########################################
 84 |         #          TODO: module 1 task 2          #
 85 |         ###########################################
 86 |         keywords = self.extract_keywords(self.dct, self.tfidf_model[doc])
 87 |         num = int(len(token_list) * 0.3)
 88 |         new_tokens = token_list.copy()
 89 |         while num == int(len(token_list) * 0.3):
 90 |             indexes = np.random.choice(len(token_list), num)
 91 |             for index in indexes:
 92 |                 token = token_list[index]
 93 |                 if isChinese(token) and token not in keywords and token in self.wv:
 94 |                     new_tokens[index] = self.wv.most_similar(positive=token, negative=None, topn=1)[0][0]
 95 |             num -= 1
 96 |         return ' '.join(new_tokens)
 97 | 
 98 |     def generate_samples(self, write_path):
 99 |         """generate new samples file
100 |         通过替换reference中的词生成新的reference样本
101 | 
102 |         Args:
103 |             write_path (str):  new samples file path
104 | 
105 |         """
106 |         ###########################################
107 |         #          TODO: module 1 task 3          #
108 |         ###########################################
109 |         replaced = []
110 |         count = 0
111 |         for sample, token_list, doc in zip(self.samples, self.refs, self.corpus):
112 |             replaced.append(
113 |                 sample.split('<sep>')[0] + ' <sep> ' +
114 |                 self.replace(token_list, doc))
115 |             count += 1
116 |             if count % 100 == 0:
117 |                 print(count)
118 |                 write_samples(replaced, write_path, 'a')
119 |                 replaced = []
120 | 
121 | 
if __name__ == '__main__':
    # Script entry point: build the replacer from the training samples and
    # the pre-trained word vectors, then write the augmented samples.
    sample_path, wv_path = (
        'output/train.txt',
        'word_vectors/merge_sgns_bigram_char300.txt',
    )
    output_path = 'output/replaced.txt'
    replacer = EmbedReplace(sample_path, wv_path)
    replacer.generate_samples(output_path)
127 | 


--------------------------------------------------------------------------------
/model/dataset.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | '''
  4 | @Author: author
  5 | @Date: 2020-07-13 11:00:51
  6 | LastEditTime: 2021-08-27 17:19:29
  7 | LastEditors: Please set LastEditors
  8 | @Description: Define the format of data used in the model.
  9 | @FilePath: /project_2/model/dataset.py
 10 | '''
 11 | 
 12 | 
 13 | import sys
 14 | import os
 15 | import pathlib
 16 | from collections import Counter
 17 | from typing import Callable
 18 | 
 19 | import torch
 20 | from torch.utils.data import Dataset
 21 | 
 22 | from utils import simple_tokenizer, count_words, sort_batch_by_len, source2ids, abstract2ids
 23 | from vocab import Vocab
 24 | import config
 25 | 
 26 | abs_path = pathlib.Path(__file__).parent.absolute()
 27 | sys.path.append(sys.path.append(abs_path))
 28 | curPath = os.path.abspath(os.path.dirname(__file__)) + '/'
 29 | 
 30 | 
 31 | class PairDataset(object):
 32 |     """The class represents source-reference pairs.
 33 | 
 34 |     """
 35 |     def __init__(self,
 36 |                  filename,
 37 |                  tokenize: Callable = simple_tokenizer,
 38 |                  max_src_len: int = None,
 39 |                  max_tgt_len: int = None,
 40 |                  truncate_src: bool = False,
 41 |                  truncate_tgt: bool = False):
 42 |         print("Reading dataset %s..." % filename, end=' ', flush=True)
 43 |         self.filename = filename
 44 |         self.pairs = []
 45 | 
 46 |         with open(filename, 'rt', encoding='utf-8') as f:
 47 |             next(f)
 48 |             for i, line in enumerate(f):
 49 |                 # Split the source and reference by the <sep> tag.
 50 |                 pair = line.strip().split('<sep>')
 51 |                 if len(pair) != 2:
 52 |                     print("Line %d of %s is malformed." % (i, filename))
 53 |                     print(line)
 54 |                     continue
 55 |                 src = tokenize(pair[0])
 56 |                 if max_src_len and len(src) > max_src_len:
 57 |                     if truncate_src:
 58 |                         src = src[:max_src_len]
 59 |                     else:
 60 |                         continue
 61 |                 tgt = tokenize(pair[1])
 62 |                 if max_tgt_len and len(tgt) > max_tgt_len:
 63 |                     if truncate_tgt:
 64 |                         tgt = tgt[:max_tgt_len]
 65 |                     else:
 66 |                         continue
 67 |                 self.pairs.append((src, tgt))
 68 |         print("%d pairs." % len(self.pairs))
 69 | 
 70 |     def build_vocab(self, embed_file: str = None):
 71 |         """Build the vocabulary for the data set.
 72 | 
 73 |         Args:
 74 |             embed_file (str, optional):
 75 |             The file path of the pre-trained embedding word vector.
 76 |             Defaults to None.
 77 | 
 78 |         Returns:
 79 |             vocab.Vocab: The vocab object.
 80 |         """
 81 |         # word frequency
 82 |         word_counts = Counter()
 83 |         count_words(word_counts,
 84 |                     [src + tgr for src, tgr in self.pairs])
 85 |         vocab = Vocab()
 86 |         # Filter the vocabulary by keeping only the top k tokens in terms of
 87 |         # word frequncy in the data set, where k is the maximum vocab size set
 88 |         # in "config.py".
 89 |         for word, count in word_counts.most_common(config.max_vocab_size):
 90 |             vocab.add_words([word])
 91 |         if embed_file is not None:
 92 |             count = vocab.load_embeddings(embed_file)
 93 |             print("%d pre-trained embeddings loaded." % count)
 94 | 
 95 |         return vocab
 96 | 
 97 | 
 98 | class SampleDataset(Dataset):
 99 |     """The class represents a sample set for training.
100 | 
101 |     """
102 |     def __init__(self, data_pair, vocab):
103 |         self.src_sents = [x[0] for x in data_pair]
104 |         self.trg_sents = [x[1] for x in data_pair]
105 |         self.vocab = vocab
106 |         # Keep track of how many data points.
107 |         self._len = len(data_pair)
108 | 
109 |     def __getitem__(self, index):
110 |         x, oov = source2ids(self.src_sents[index], self.vocab)
111 |         return {
112 |             'x': [self.vocab.SOS] + x + [self.vocab.EOS],
113 |             'OOV': oov,
114 |             'len_OOV': len(oov),
115 |             'y': [self.vocab.SOS] +
116 |             abstract2ids(self.trg_sents[index],
117 |                          self.vocab, oov) + [self.vocab.EOS],
118 |             'x_len': len(self.src_sents[index]),
119 |             'y_len': len(self.trg_sents[index])
120 |         }
121 | 
122 |     def __len__(self):
123 |         return self._len
124 | 
125 | 
def collate_fn(batch):
    """Reorder a batch with ``sort_batch_by_len`` and pad it into tensors.

    Args:
        batch: the samples of one batch, handed unchanged to
            ``sort_batch_by_len``.

    Returns:
        tuple: ``(x_padded, y_padded, x_len, y_len, OOV, len_OOV)`` where
            x_padded (Tensor): Source sequences padded to the longest one.
            y_padded (Tensor): Reference sequences padded to the longest one.
            x_len (Tensor): The "x_len" values of the batch.
            y_len (Tensor): The "y_len" values of the batch.
            OOV: The "OOV" entries of the batch, passed through as-is.
            len_OOV (Tensor): The "len_OOV" values of the batch.
    """
    def padding(indice, max_length, pad_idx=0):
        # A negative repeat count yields an empty list, so sequences that are
        # already long enough are left unchanged.
        rows = []
        for seq in indice:
            rows.append(seq + [pad_idx] * (max_length - len(seq)))
        return torch.tensor(rows)

    data_batch = sort_batch_by_len(batch)

    sources = data_batch["x"]
    longest_source = max(map(len, sources))
    targets = data_batch["y"]
    longest_target = max(map(len, targets))

    OOV = data_batch["OOV"]
    len_OOV = torch.tensor(data_batch["len_OOV"])

    x_padded = padding(sources, longest_source)
    y_padded = padding(targets, longest_target)

    x_len = torch.tensor(data_batch["x_len"])
    y_len = torch.tensor(data_batch["y_len"])
    return x_padded, y_padded, x_len, y_len, OOV, len_OOV
158 | 


--------------------------------------------------------------------------------
/files/HIT_stop_words.txt:
--------------------------------------------------------------------------------
  1 | ———
  2 | 》），
  3 | ）÷（１－
  4 | ”，
  5 | ）、
  6 | ＝（
  7 | :
  8 | →
  9 | ℃ 
 10 | &
 11 | *
 12 | 一一
 13 | ~~~~
 14 | ’
 15 | . 
 16 | 『
 17 | .一
 18 | ./
 19 | -- 
 20 | 』
 21 | ＝″
 22 | 【
 23 | ［＊］
 24 | ｝＞
 25 | ［⑤］］
 26 | ［①Ｄ］
 27 | ｃ］
 28 | ｎｇ昉
 29 | ＊
 30 | //
 31 | ［
 32 | ］
 33 | ［②ｅ］
 34 | ［②ｇ］
 35 | ＝｛
 36 | }
 37 | ，也 
 38 | ‘
 39 | Ａ
 40 | ［①⑥］
 41 | ［②Ｂ］ 
 42 | ［①ａ］
 43 | ［④ａ］
 44 | ［①③］
 45 | ［③ｈ］
 46 | ③］
 47 | １． 
 48 | －－ 
 49 | ［②ｂ］
 50 | ’‘ 
 51 | ××× 
 52 | ［①⑧］
 53 | ０：２ 
 54 | ＝［
 55 | ［⑤ｂ］
 56 | ［②ｃ］ 
 57 | ［④ｂ］
 58 | ［②③］
 59 | ［③ａ］
 60 | ［④ｃ］
 61 | ［①⑤］
 62 | ［①⑦］
 63 | ［①ｇ］
 64 | ∈［ 
 65 | ［①⑨］
 66 | ［①④］
 67 | ［①ｃ］
 68 | ［②ｆ］
 69 | ［②⑧］
 70 | ［②①］
 71 | ［①Ｃ］
 72 | ［③ｃ］
 73 | ［③ｇ］
 74 | ［②⑤］
 75 | ［②②］
 76 | 一.
 77 | ［①ｈ］
 78 | .数
 79 | ［］
 80 | ［①Ｂ］
 81 | 数/
 82 | ［①ｉ］
 83 | ［③ｅ］
 84 | ［①①］
 85 | ［④ｄ］
 86 | ［④ｅ］
 87 | ［③ｂ］
 88 | ［⑤ａ］
 89 | ［①Ａ］
 90 | ［②⑧］
 91 | ［②⑦］
 92 | ［①ｄ］
 93 | ［②ｊ］
 94 | 〕〔
 95 | ］［
 96 | ://
 97 | ′∈
 98 | ［②④
 99 | ［⑤ｅ］
100 | １２％
101 | ｂ］
102 | ...
103 | ...................
104 | …………………………………………………③
105 | ＺＸＦＩＴＬ
106 | ［③Ｆ］
107 | 」
108 | ［①ｏ］
109 | ］∧′＝［ 
110 | ∪φ∈
111 | ′｜
112 | ｛－
113 | ②ｃ
114 | ｝
115 | ［③①］
116 | Ｒ．Ｌ．
117 | ［①Ｅ］
118 | Ψ
119 | －［＊］－
120 | ↑
121 | .日 
122 | ［②ｄ］
123 | ［②
124 | ［②⑦］
125 | ［②②］
126 | ［③ｅ］
127 | ［①ｉ］
128 | ［①Ｂ］
129 | ［①ｈ］
130 | ［①ｄ］
131 | ［①ｇ］
132 | ［①②］
133 | ［②ａ］
134 | ｆ］
135 | ［⑩］
136 | ａ］
137 | ［①ｅ］
138 | ［②ｈ］
139 | ［②⑥］
140 | ［③ｄ］
141 | ［②⑩］
142 | ｅ］
143 | 〉
144 | 】
145 | 元／吨
146 | ［②⑩］
147 | ２．３％
148 | ５：０  
149 | ［①］
150 | ::
151 | ［②］
152 | ［③］
153 | ［④］
154 | ［⑤］
155 | ［⑥］
156 | ［⑦］
157 | ［⑧］
158 | ［⑨］ 
159 | ……
160 | ——
161 | ?
162 | 、
163 | 。
164 | “
165 | ”
166 | 《
167 | 》
168 | ！
169 | ，
170 | ：
171 | ；
172 | ？
173 | ．
174 | ,
175 | ．
176 | '
177 | ? 
178 | ·
179 | ———
180 | ──
181 | ? 
182 | —
183 | <
184 | >
185 | （
186 | ）
187 | 〔
188 | 〕
189 | [
190 | ]
191 | (
192 | )
193 | -
194 | +
195 | ～
196 | ×
197 | ／
198 | /
199 | ①
200 | ②
201 | ③
202 | ④
203 | ⑤
204 | ⑥
205 | ⑦
206 | ⑧
207 | ⑨
208 | ⑩
209 | Ⅲ
210 | В
211 | "
212 | ;
213 | #
214 | @
215 | γ
216 | μ
217 | φ
218 | φ．
219 | × 
220 | Δ
221 | ■
222 | ▲
223 | sub
224 | exp 
225 | sup
226 | sub
227 | Lex 
228 | ＃
229 | ％
230 | ＆
231 | ＇
232 | ＋
233 | ＋ξ
234 | ＋＋
235 | －
236 | －β
237 | ＜
238 | ＜±
239 | ＜Δ
240 | ＜λ
241 | ＜φ
242 | ＜＜
243 | =
244 | ＝
245 | ＝☆
246 | ＝－
247 | ＞
248 | ＞λ
249 | ＿
250 | ～±
251 | ～＋
252 | ［⑤ｆ］
253 | ［⑤ｄ］
254 | ［②ｉ］
255 | ≈ 
256 | ［②Ｇ］
257 | ［①ｆ］
258 | ＬＩ
259 | ㈧ 
260 | ［－
261 | ......
262 | 〉
263 | ［③⑩］
264 | 第二
265 | 一番
266 | 一直
267 | 一个
268 | 一些
269 | 许多
270 | 种
271 | 有的是
272 | 也就是说
273 | 末##末
274 | 啊
275 | 阿
276 | 哎
277 | 哎呀
278 | 哎哟
279 | 唉
280 | 俺
281 | 俺们
282 | 按
283 | 按照
284 | 吧
285 | 吧哒
286 | 把
287 | 罢了
288 | 被
289 | 本
290 | 本着
291 | 比
292 | 比方
293 | 比如
294 | 鄙人
295 | 彼
296 | 彼此
297 | 边
298 | 别
299 | 别的
300 | 别说
301 | 并
302 | 并且
303 | 不比
304 | 不成
305 | 不单
306 | 不但
307 | 不独
308 | 不管
309 | 不光
310 | 不过
311 | 不仅
312 | 不拘
313 | 不论
314 | 不怕
315 | 不然
316 | 不如
317 | 不特
318 | 不惟
319 | 不问
320 | 不只
321 | 朝
322 | 朝着
323 | 趁
324 | 趁着
325 | 乘
326 | 冲
327 | 除
328 | 除此之外
329 | 除非
330 | 除了
331 | 此
332 | 此间
333 | 此外
334 | 从
335 | 从而
336 | 打
337 | 待
338 | 但
339 | 但是
340 | 当
341 | 当着
342 | 到
343 | 得
344 | 的
345 | 的话
346 | 等
347 | 等等
348 | 地
349 | 第
350 | 叮咚
351 | 对
352 | 对于
353 | 多
354 | 多少
355 | 而
356 | 而况
357 | 而且
358 | 而是
359 | 而外
360 | 而言
361 | 而已
362 | 尔后
363 | 反过来
364 | 反过来说
365 | 反之
366 | 非但
367 | 非徒
368 | 否则
369 | 嘎
370 | 嘎登
371 | 该
372 | 赶
373 | 个
374 | 各
375 | 各个
376 | 各位
377 | 各种
378 | 各自
379 | 给
380 | 根据
381 | 跟
382 | 故
383 | 故此
384 | 固然
385 | 关于
386 | 管
387 | 归
388 | 果然
389 | 果真
390 | 过
391 | 哈
392 | 哈哈
393 | 呵
394 | 和
395 | 何
396 | 何处
397 | 何况
398 | 何时
399 | 嘿
400 | 哼
401 | 哼唷
402 | 呼哧
403 | 乎
404 | 哗
405 | 还是
406 | 还有
407 | 换句话说
408 | 换言之
409 | 或
410 | 或是
411 | 或者
412 | 极了
413 | 及
414 | 及其
415 | 及至
416 | 即
417 | 即便
418 | 即或
419 | 即令
420 | 即若
421 | 即使
422 | 几
423 | 几时
424 | 己
425 | 既
426 | 既然
427 | 既是
428 | 继而
429 | 加之
430 | 假如
431 | 假若
432 | 假使
433 | 鉴于
434 | 将
435 | 较
436 | 较之
437 | 叫
438 | 接着
439 | 结果
440 | 借
441 | 紧接着
442 | 进而
443 | 尽
444 | 尽管
445 | 经
446 | 经过
447 | 就
448 | 就是
449 | 就是说
450 | 据
451 | 具体地说
452 | 具体说来
453 | 开始
454 | 开外
455 | 靠
456 | 咳
457 | 可
458 | 可见
459 | 可是
460 | 可以
461 | 况且
462 | 啦
463 | 来
464 | 来着
465 | 离
466 | 例如
467 | 哩
468 | 连
469 | 连同
470 | 两者
471 | 了
472 | 临
473 | 另
474 | 另外
475 | 另一方面
476 | 论
477 | 嘛
478 | 吗
479 | 慢说
480 | 漫说
481 | 冒
482 | 么
483 | 每
484 | 每当
485 | 们
486 | 莫若
487 | 某
488 | 某个
489 | 某些
490 | 拿
491 | 哪
492 | 哪边
493 | 哪儿
494 | 哪个
495 | 哪里
496 | 哪年
497 | 哪怕
498 | 哪天
499 | 哪些
500 | 哪样
501 | 那
502 | 那边
503 | 那儿
504 | 那个
505 | 那会儿
506 | 那里
507 | 那么
508 | 那么些
509 | 那么样
510 | 那时
511 | 那些
512 | 那样
513 | 乃
514 | 乃至
515 | 呢
516 | 能
517 | 你
518 | 你们
519 | 您
520 | 宁
521 | 宁可
522 | 宁肯
523 | 宁愿
524 | 哦
525 | 呕
526 | 啪达
527 | 旁人
528 | 呸
529 | 凭
530 | 凭借
531 | 其
532 | 其次
533 | 其二
534 | 其他
535 | 其它
536 | 其一
537 | 其余
538 | 其中
539 | 起
540 | 起见
541 | 起见
542 | 岂但
543 | 恰恰相反
544 | 前后
545 | 前者
546 | 且
547 | 然而
548 | 然后
549 | 然则
550 | 让
551 | 人家
552 | 任
553 | 任何
554 | 任凭
555 | 如
556 | 如此
557 | 如果
558 | 如何
559 | 如其
560 | 如若
561 | 如上所述
562 | 若
563 | 若非
564 | 若是
565 | 啥
566 | 上下
567 | 尚且
568 | 设若
569 | 设使
570 | 甚而
571 | 甚么
572 | 甚至
573 | 省得
574 | 时候
575 | 什么
576 | 什么样
577 | 使得
578 | 是
579 | 是的
580 | 首先
581 | 谁
582 | 谁知
583 | 顺
584 | 顺着
585 | 似的
586 | 虽
587 | 虽然
588 | 虽说
589 | 虽则
590 | 随
591 | 随着
592 | 所
593 | 所以
594 | 他
595 | 他们
596 | 他人
597 | 它
598 | 它们
599 | 她
600 | 她们
601 | 倘
602 | 倘或
603 | 倘然
604 | 倘若
605 | 倘使
606 | 腾
607 | 替
608 | 通过
609 | 同
610 | 同时
611 | 哇
612 | 万一
613 | 往
614 | 望
615 | 为
616 | 为何
617 | 为了
618 | 为什么
619 | 为着
620 | 喂
621 | 嗡嗡
622 | 我
623 | 我们
624 | 呜
625 | 呜呼
626 | 乌乎
627 | 无论
628 | 无宁
629 | 毋宁
630 | 嘻
631 | 吓
632 | 相对而言
633 | 像
634 | 向
635 | 向着
636 | 嘘
637 | 呀
638 | 焉
639 | 沿
640 | 沿着
641 | 要
642 | 要不
643 | 要不然
644 | 要不是
645 | 要么
646 | 要是
647 | 也
648 | 也罢
649 | 也好
650 | 一
651 | 一般
652 | 一旦
653 | 一方面
654 | 一来
655 | 一切
656 | 一样
657 | 一则
658 | 依
659 | 依照
660 | 矣
661 | 以
662 | 以便
663 | 以及
664 | 以免
665 | 以至
666 | 以至于
667 | 以致
668 | 抑或
669 | 因
670 | 因此
671 | 因而
672 | 因为
673 | 哟
674 | 用
675 | 由
676 | 由此可见
677 | 由于
678 | 有
679 | 有的
680 | 有关
681 | 有些
682 | 又
683 | 于
684 | 于是
685 | 于是乎
686 | 与
687 | 与此同时
688 | 与否
689 | 与其
690 | 越是
691 | 云云
692 | 哉
693 | 再说
694 | 再者
695 | 在
696 | 在下
697 | 咱
698 | 咱们
699 | 则
700 | 怎
701 | 怎么
702 | 怎么办
703 | 怎么样
704 | 怎样
705 | 咋
706 | 照
707 | 照着
708 | 者
709 | 这
710 | 这边
711 | 这儿
712 | 这个
713 | 这会儿
714 | 这就是说
715 | 这里
716 | 这么
717 | 这么点儿
718 | 这么些
719 | 这么样
720 | 这时
721 | 这些
722 | 这样
723 | 正如
724 | 吱
725 | 之
726 | 之类
727 | 之所以
728 | 之一
729 | 只是
730 | 只限
731 | 只要
732 | 只有
733 | 至
734 | 至于
735 | 诸位
736 | 着
737 | 着呢
738 | 自
739 | 自从
740 | 自个儿
741 | 自各儿
742 | 自己
743 | 自家
744 | 自身
745 | 综上所述
746 | 总的来看
747 | 总的来说
748 | 总的说来
749 | 总而言之
750 | 总之
751 | 纵
752 | 纵令
753 | 纵然
754 | 纵使
755 | 遵照
756 | 作为
757 | 兮
758 | 呃
759 | 呗
760 | 咚
761 | 咦
762 | 喏
763 | 啐
764 | 喔唷
765 | 嗬
766 | 嗯
767 | 嗳
768 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
  1 | Package                           Version
  2 | --------------------------------- ----------------------
  3 | absl-py                           0.9.0
  4 | alembic                           1.0.5
  5 | annoy                             1.16.3
  6 | astor                             0.7.1
  7 | async-generator                   1.10
  8 | attrs                             18.2.0
  9 | awscli                            1.16.170
 10 | backcall                          0.1.0
 11 | backports.functools-lru-cache     1.5
 12 | bayesian-optimization             1.2.0
 13 | beautifulsoup4                    4.6.3
 14 | bleach                            1.5.0
 15 | bokeh                             1.0.1
 16 | boto                              2.49.0
 17 | boto3                             1.14.16
 18 | botocore                          1.17.16
 19 | certifi                           2018.10.15
 20 | cffi                              1.11.5
 21 | chardet                           3.0.4
 22 | cheroot                           6.5.2
 23 | CherryPy                          18.0.1
 24 | Click                             7.0
 25 | cloudpickle                       0.6.1
 26 | colorama                          0.3.9
 27 | cycler                            0.10.0
 28 | Cython                            0.29
 29 | dask                              0.20.2
 30 | dataclasses                       0.7
 31 | decorator                         4.3.0
 32 | defusedxml                        0.5.0
 33 | dill                              0.2.8.2
 34 | docutils                          0.14
 35 | dowser                            0.2
 36 | easydict                          1.9
 37 | entrypoints                       0.2.3
 38 | faiss                             1.6.3
 39 | fasttext                          0.9.2
 40 | filelock                          3.0.12
 41 | Flask                             1.0.2
 42 | future                            0.17.1
 43 | gast                              0.2.0
 44 | gensim                            3.8.3
 45 | glog                              0.3.1
 46 | glove-python                      0.1.0
 47 | google-pasta                      0.2.0
 48 | graphviz                          0.9
 49 | grpcio                            1.16.1
 50 | h5py                              2.8.0
 51 | html5lib                          0.9999999
 52 | hypothesis                        3.82.1
 53 | idna                              2.7
 54 | imbalanced-learn                  0.7.0
 55 | infi.recipe.console-scripts       0.5.4
 56 | ipykernel                         5.1.0
 57 | ipympl                            0.2.1
 58 | ipython                           7.0.1
 59 | ipython-genutils                  0.2.0
 60 | ipywidgets                        7.4.2
 61 | itsdangerous                      1.1.0
 62 | jaraco.functools                  1.20
 63 | jedi                              0.13.1
 64 | jieba                             0.39
 65 | Jinja2                            2.10
 66 | jmespath                          0.9.4
 67 | joblib                            0.16.0
 68 | jsonschema                        2.6.0
 69 | jupyter-client                    5.2.3
 70 | jupyter-contrib-core              0.3.3
 71 | jupyter-contrib-nbextensions      0.5.0
 72 | jupyter-core                      4.4.0
 73 | jupyter-highlight-selected-word   0.2.0
 74 | jupyter-latex-envs                1.4.4
 75 | jupyter-nbextensions-configurator 0.4.0
 76 | jupyter-tensorboard               0.1.8
 77 | jupyterhub                        0.9.4
 78 | jupyterlab                        0.35.4
 79 | jupyterlab-server                 0.2.0
 80 | Keras                             2.2.4
 81 | Keras-Applications                1.0.8
 82 | Keras-Preprocessing               1.1.2
 83 | kiwisolver                        1.0.1
 84 | lightgbm                          2.3.2
 85 | llvmlite                          0.25.0
 86 | lmdb                              0.94
 87 | lxml                              4.2.5
 88 | Mako                              1.0.7
 89 | Markdown                          3.0.1
 90 | MarkupSafe                        1.1.0
 91 | matplotlib                        2.2.3
 92 | mistune                           0.8.4
 93 | more-itertools                    4.3.0
 94 | mpmath                            1.0.0
 95 | nbconvert                         5.4.0
 96 | nbformat                          4.4.0
 97 | networkx                          2.2
 98 | nltk                              3.3
 99 | notebook                          5.7.0
100 | numba                             0.40.1
101 | numexpr                           2.6.8
102 | numpy                             1.19.2
103 | packaging                         18.0
104 | pamela                            0.3.0
105 | pandarallel                       1.5.1
106 | pandas                            1.1.2
107 | pandocfilters                     1.4.2
108 | parso                             0.3.1
109 | pathlib                           1.0.1
110 | patsy                             0.5.1
111 | pexpect                           4.6.0
112 | pickleshare                       0.7.5
113 | Pillow                            5.3.0
114 | pip                               18.1
115 | plotly                            3.4.2
116 | portend                           2.3
117 | prettytable                       0.7.2
118 | prometheus-client                 0.4.2
119 | prompt-toolkit                    2.0.7
120 | protobuf                          3.6.1
121 | ptyprocess                        0.6.0
122 | pyaml                             20.4.0
123 | pyasn1                            0.4.5
124 | pybind11                          2.5.0
125 | pycparser                         2.19
126 | pycurl                            7.43.0
127 | pydot                             1.2.4
128 | pyemd                             0.5.1
129 | Pygments                          2.2.0
130 | pygobject                         3.20.0
131 | pyparsing                         2.3.0
132 | python-apt                        1.1.0b1+ubuntu0.16.4.2
133 | python-dateutil                   2.7.5
134 | python-editor                     1.0.3
135 | python-gflags                     3.1.2
136 | python-magic                      0.4.15
137 | python-nvd3                       0.15.0
138 | python-oauth2                     1.1.0
139 | python-slugify                    1.2.6
140 | pytorch-ignite                    0.1.2
141 | pytz                              2018.7
142 | PyWavelets                        1.0.1
143 | PyYAML                            3.13
144 | pyzmq                             17.1.2
145 | regex                             2020.6.8
146 | requests                          2.20.1
147 | retrying                          1.3.3
148 | rouge                             1.0.0
149 | rsa                               3.4.2
150 | s3cmd                             2.0.2
151 | s3transfer                        0.3.3
152 | sacremoses                        0.0.43
153 | scikit-image                      0.14.1
154 | scikit-learn                      0.23.1
155 | scikit-multilearn                 0.2.0
156 | scikit-optimize                   0.7.4
157 | scipy                             1.1.0
158 | seaborn                           0.9.0
159 | Send2Trash                        1.5.0
160 | sentencepiece                     0.1.91
161 | setuptools                        40.6.2
162 | Shapely                           1.6.4.post2
163 | simplegeneric                     0.8.1
164 | simplejson                        3.16.0
165 | six                               1.11.0
166 | smart-open                        2.1.0
167 | SQLAlchemy                        1.2.14
168 | statsmodels                       0.9.0
169 | sympy                             1.3
170 | synonyms                          3.15.0
171 | tempora                           1.14
172 | tensorboard                       1.12.2
173 | tensorboardX                      1.8
174 | tensorflow-estimator              1.14.0
175 | tensorflow-gpu                    1.12.3
176 | termcolor                         1.1.0
177 | terminado                         0.8.1
178 | testpath                          0.4.2
179 | textbrewer                        0.2.0.1
180 | threadpoolctl                     2.1.0
181 | tokenizers                        0.7.0
182 | toolz                             0.9.0
183 | torch                             1.1.0
184 | torchvision                       0.3.0
185 | tornado                           5.1.1
186 | tqdm                              4.51.0
187 | traitlets                         4.3.2
188 | transformers                      2.11.0
189 | typing                            3.6.6
190 | Unidecode                         1.0.23
191 | urllib3                           1.24.1
192 | vincent                           0.4.0
193 | Wand                              0.4.5
194 | wcwidth                           0.1.7
195 | webencodings                      0.5.1
196 | Werkzeug                          0.14.1
197 | wheel                             0.32.3
198 | widgetsnbextension                3.4.2
199 | wrapt                             1.12.1
200 | xarray                            0.16.1
201 | xlrd                              1.1.0
202 | xlwt                              1.3.0
203 | yacs                              0.1.5
204 | zc.buildout                       2.12.2
205 | zc.lockfile                       1.4


--------------------------------------------------------------------------------
/model/train.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | '''
  4 | @Author: author
  5 | @Date: 2020-07-13 12:31:25
  6 | LastEditTime: 2021-09-03 17:09:08
  7 | LastEditors: Please set LastEditors
  8 | @Description: Train the model.
  9 | @FilePath: /project_2/model/train.py
 10 | '''
 11 | 
 12 | 
 13 | import pickle
 14 | import os
 15 | import sys
 16 | import pathlib
 17 | 
 18 | import numpy as np
 19 | from torch import optim
 20 | from torch.utils.data import DataLoader
 21 | import torch
 22 | from torch.nn.utils import clip_grad_norm_
 23 | from tqdm import tqdm
 24 | from tensorboardX import SummaryWriter
 25 | 
 26 | from dataset import PairDataset
 27 | from model import PGN
 28 | import config
 29 | from evaluate import evaluate
 30 | from dataset import collate_fn, SampleDataset
 31 | from utils import ScheduledSampler, config_info
 32 | 
 33 | abs_path = pathlib.Path(__file__).parent.absolute()
 34 | sys.path.append(sys.path.append(abs_path))
 35 | curPath = os.path.abspath(os.path.dirname(__file__)) + '/'
 36 | 
 37 | 
def train(dataset, val_dataset, v, start_epoch=0):
    """Train the model, evaluate it and store it.

    Each epoch runs over the shuffled training batches, then calls
    ``evaluate`` on the validation data. Whenever the validation loss is lower
    than the best value seen so far, the encoder, decoder, attention and
    reduce_state modules are saved to the paths named in ``config``. The best
    validation loss is pickled to ``config.losses_path`` after every epoch and
    is read back from that file, if it exists, when training starts.

    Args:
        dataset (dataset.PairDataset): The training dataset.
        val_dataset (dataset.PairDataset): The evaluation dataset.
        v (vocab.Vocab): The vocabulary built from the training dataset.
        start_epoch (int, optional): The starting epoch number. Defaults to 0.
    """

    DEVICE = torch.device("cuda" if config.is_cuda else "cpu")

    model = PGN(v)
    # NOTE(review): presumably restores previously saved weights when they
    # exist - confirm in model.py.
    model.load_model()
    model.to(DEVICE)
    if config.fine_tune:
        # In fine-tuning mode, we fix the weights of all parameters except attention.wc.
        print('Fine-tuning mode.')
        for name, params in model.named_parameters():
            if name != 'attention.wc.weight':
                params.requires_grad = False
    # Wrap the token pairs in datasets that convert them to ids on access.
    print("loading data ...")
    train_data = SampleDataset(dataset.pairs, v)
    val_data = SampleDataset(val_dataset.pairs, v)

    print("initializing optimizer")
    # Define the optimizer.
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    train_dataloader = DataLoader(dataset=train_data,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn)

    # Best validation loss so far; resumed from disk when a previous run
    # left a losses file behind.
    val_losses = np.inf
    if (os.path.exists(config.losses_path)):
        with open(config.losses_path, 'rb') as f:
            val_losses = pickle.load(f)

    # torch.cuda.empty_cache()
    # SummaryWriter: Log writer used for TensorboardX visualization.
    writer = SummaryWriter(config.log_path)
    # scheduled_sampler : A tool for choosing teacher_forcing or not
    ###########################################
    #          TODO: module 5 task 3          #
    ###########################################
    # Number of epochs that will actually be run from start_epoch.
    num_epochs = len(range(start_epoch, config.epochs))
    scheduled_sampler = ScheduledSampler(num_epochs)

    if config.scheduled_sampling:
        print('scheduled_sampling mode.')
    #  teacher_forcing = True

    # tqdm: A tool for drawing progress bars during training.
    # Detailed introduction to using the tqdm progress bar in Python:
    # https://blog.csdn.net/kdongyi/article/details/101547216
    with tqdm(total=config.epochs) as epoch_progress:
        for epoch in range(start_epoch, config.epochs):
            print(config_info(config))
            batch_losses = []  # Get loss of each batch.
            num_batches = len(train_dataloader)
            # set a teacher_forcing signal
            ###########################################
            #          TODO: module 5 task 3          #
            ###########################################
            if config.scheduled_sampling:
                # The sampler is indexed by the number of epochs run so far,
                # not by the absolute epoch number.
                teacher_forcing = scheduled_sampler.teacher_forcing(epoch - start_epoch)
            else:
                teacher_forcing = True

            print('teacher_forcing = {}'.format(teacher_forcing))
            # batch_progress is advanced once every 100 batches (see below).
            with tqdm(total=num_batches // 100) as batch_progress:
                for batch, data in enumerate(tqdm(train_dataloader)):
                    # y_len and oov are unpacked but not used in this loop.
                    x, y, x_len, y_len, oov, len_oovs = data
                    assert not np.any(np.isnan(x.numpy()))
                    if config.is_cuda:  # Training with GPUs.
                        x = x.to(DEVICE)
                        y = y.to(DEVICE)
                        x_len = x_len.to(DEVICE)
                        len_oovs = len_oovs.to(DEVICE)

                    model.train()  # Sets the module in training mode.

                    # https://www.zhihu.com/question/303070254
                    # 1. Why zero the gradients before backpropagation?
                    # 2. How should gradient accumulation be handled?
                    optimizer.zero_grad()  # Clear gradients.

                    ###########################################
                    #          TODO: module 5 task 3          #
                    ###########################################
                    # Calculate loss.  Call model forward propagation
                    loss = model(x,
                                 x_len,
                                 y,
                                 len_oovs,
                                 batch=batch,
                                 num_batches=num_batches,
                                 teacher_forcing=teacher_forcing)

                    # Why loss.item() is appended rather than loss:
                    # https://blog.csdn.net/StarfishCu/article/details/112473856
                    batch_losses.append(loss.item())
                    loss.backward()  # Backpropagation.

                    # Do gradient clipping to prevent gradient explosion.
                    # Each submodule is clipped separately to max_grad_norm.
                    clip_grad_norm_(model.encoder.parameters(),
                                    config.max_grad_norm)
                    clip_grad_norm_(model.decoder.parameters(),
                                    config.max_grad_norm)
                    clip_grad_norm_(model.attention.parameters(),
                                    config.max_grad_norm)
                    optimizer.step()  # Update weights.

                    # Output and record epoch loss every 100 batches.
                    if (batch % 100) == 0:
                        # Set the text shown on the left of the progress bar.
                        batch_progress.set_description(f'Epoch {epoch}')
                        # Set the text shown on the right of the progress bar.
                        batch_progress.set_postfix(Batch=batch,
                                                   Loss=loss.item())
                        batch_progress.update()  # Advance the progress bar.
                        # Write loss for tensorboard.
                        writer.add_scalar(f'Average loss for epoch {epoch}',
                                          np.mean(batch_losses),
                                          global_step=batch)
            # Calculate average loss over all batches in an epoch.
            epoch_loss = np.mean(batch_losses)

            epoch_progress.set_description(f'Epoch {epoch}')
            epoch_progress.set_postfix(Loss=epoch_loss)
            epoch_progress.update()

            avg_val_loss = evaluate(model, val_data, epoch)

            print('training loss:{}'.format(epoch_loss),
                  'validation loss:{}'.format(avg_val_loss))

            # Update minimum evaluating loss.
            if (avg_val_loss < val_losses):
                # The complete modules are saved here. To save only the
                # parameters, use:
                # torch.save(model.encoder.state_dict(), config.encoder_save_name)
                # The two ways of saving a model: https://www.jianshu.com/p/6ba95579082c
                torch.save(model.encoder, config.encoder_save_name)
                torch.save(model.decoder, config.decoder_save_name)
                torch.save(model.attention, config.attention_save_name)
                torch.save(model.reduce_state, config.reduce_state_save_name)
                val_losses = avg_val_loss
            # Persist the best validation loss after every epoch.
            with open(config.losses_path, 'wb') as f:
                pickle.dump(val_losses, f)

    writer.close()
187 | 
188 | 
if __name__ == "__main__":
    # The training and validation splits share the same length limits and
    # truncation flags, so collect them once.
    shared_options = {
        'max_src_len': config.max_src_len,
        'max_tgt_len': config.max_tgt_len,
        'truncate_src': config.truncate_src,
        'truncate_tgt': config.truncate_tgt,
    }
    dataset = PairDataset(config.data_path, **shared_options)
    val_dataset = PairDataset(config.val_data_path, **shared_options)

    # The vocabulary is built from the training split only.
    vocab = dataset.build_vocab(embed_file=config.embed_file)

    # Train from the first epoch.
    train(dataset, val_dataset, vocab, start_epoch=0)
205 | 


--------------------------------------------------------------------------------
/model/utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | '''
  4 | @Author: author
  5 | @Date: 2020-07-13 11:07:48
  6 | LastEditTime: 2020-10-20 15:11:03
  7 | LastEditors: Please set LastEditors
  8 | @Description: Helper functions or classes used for the model.
  9 | @FilePath: /project_2/model/utils.py
 10 | '''
 11 | 
 12 | 
 13 | import numpy as np
 14 | import time
 15 | import heapq
 16 | import random
 17 | import sys
 18 | import os
 19 | import pathlib
 20 | import torch
 21 | import config
 22 | 
 23 | abs_path = pathlib.Path(__file__).parent.absolute()
 24 | sys.path.append(sys.path.append(abs_path))
 25 | curPath = os.path.abspath(os.path.dirname(__file__)) + '/'
 26 | 
 27 | 
 28 | def timer(module):
 29 |     """Decorator function for a timer.
 30 | 
 31 |     Args:
 32 |         module (str): Description of the function being timed.
 33 |     """
 34 |     def wrapper(func):
 35 |         """Wrapper of the timer function.
 36 | 
 37 |         Args:
 38 |             func (function): The function to be timed.
 39 |         """
 40 |         def cal_time(*args, **kwargs):
 41 |             """The timer function.
 42 | 
 43 |             Returns:
 44 |                 res (any): The returned value of the function being timed.
 45 |             """
 46 |             t1 = time.time()
 47 |             res = func(*args, **kwargs)
 48 |             t2 = time.time()
 49 |             cost_time = t2 - t1
 50 |             print(f'{cost_time} secs used for ', module)
 51 |             return res
 52 |         return cal_time
 53 |     return wrapper
 54 | 
 55 | 
 56 | def simple_tokenizer(text):
 57 |     return text.split()
 58 | 
 59 | 
 60 | def count_words(counter, text):
 61 |     '''Count the number of occurrences of each word in a set of text'''
 62 |     for sentence in text:
 63 |         for word in sentence:
 64 |             counter[word] += 1
 65 | 
 66 | 
 67 | def sort_batch_by_len(data_batch):
 68 |     res = {'x': [],
 69 |            'y': [],
 70 |            'x_len': [],
 71 |            'y_len': [],
 72 |            'OOV': [],
 73 |            'len_OOV': []}
 74 |     for i in range(len(data_batch)):
 75 |         res['x'].append(data_batch[i]['x'])
 76 |         res['y'].append(data_batch[i]['y'])
 77 |         res['x_len'].append(len(data_batch[i]['x']))
 78 |         res['y_len'].append(len(data_batch[i]['y']))
 79 |         res['OOV'].append(data_batch[i]['OOV'])
 80 |         res['len_OOV'].append(data_batch[i]['len_OOV'])
 81 | 
 82 |     # Sort indices of data in batch by lengths.
 83 |     sorted_indices = np.array(res['x_len']).argsort()[::-1].tolist()
 84 | 
 85 |     data_batch = {
 86 |         name: [_tensor[i] for i in sorted_indices]
 87 |         for name, _tensor in res.items()
 88 |     }
 89 |     return data_batch
 90 | 
 91 | 
 92 | def outputids2words(id_list, source_oovs, vocab):
 93 |     """
 94 |         Maps output ids to words, including mapping in-source OOVs from
 95 |         their temporary ids to the original OOV string (applicable in
 96 |         pointer-generator mode).
 97 |         Args:
 98 |             id_list: list of ids (integers)
 99 |             vocab: Vocabulary object
100 |             source_oovs:
101 |                 list of OOV words (strings) in the order corresponding to
102 |                 their temporary source OOV ids (that have been assigned in
103 |                 pointer-generator mode), or None (in baseline mode)
104 |         Returns:
105 |             words: list of words (strings)
106 |     """
107 |     words = []
108 |     for i in id_list:
109 |         try:
110 |             w = vocab.index2word[i]  # might be [UNK]
111 |         except IndexError:  # w is OOV
112 |             assert_msg = "Error: cannot find the ID the in the vocabulary."
113 |             assert source_oovs is not None, assert_msg
114 |             source_oov_idx = i - vocab.size()
115 |             try:
116 |                 w = source_oovs[source_oov_idx]
117 |             except ValueError:  # i doesn't correspond to an source oov
118 |                 raise ValueError(
119 |                     'Error: model produced word ID %i corresponding to source OOV %i \
120 |                      but this example only has %i source OOVs'
121 |                     % (i, source_oov_idx, len(source_oovs)))
122 |         words.append(w)
123 |     return ' '.join(words)
124 | 
125 | 
def source2ids(source_words, vocab):
    """Map the source words to their ids and return a list of OOVs in the source.

    At test time the samples often contain OOV tokens. In-vocabulary tokens
    are mapped to their vocabulary index; OOV tokens are recorded, given a
    temporary id past the end of the vocabulary, and returned.

    Args:
        source_words: list of words (strings)
        vocab: Vocabulary object
    Returns:
        ids:
            A list of word ids (integers); OOVs are represented by their
            temporary source OOV number. If the vocabulary size is 50k and
            the source has 3 OOV tokens, then these temporary OOV numbers
            will be 50000, 50001, 50002.
        oovs:
            A list of the OOV words in the source (strings), in the order
            corresponding to their temporary source OOV numbers.
    """
    ids = []
    oovs = []
    # OOV word -> its position in ``oovs``; gives O(1) lookups instead of
    # scanning the list for every OOV occurrence.
    oov_positions = {}
    unk_id = vocab.UNK
    vocab_size = vocab.size()
    for w in source_words:
        i = vocab[w]
        if i != unk_id:  # In-vocabulary word
            ids.append(i)
            continue
        oov_num = oov_positions.get(w)
        if oov_num is None:  # First time this OOV is seen
            oov_num = len(oovs)
            oov_positions[w] = oov_num
            oovs.append(w)
        # E.g. 50000 for the first source OOV, 50001 for the second
        ids.append(vocab_size + oov_num)
    return ids, oovs
159 | 
160 | 
def abstract2ids(abstract_words, vocab, source_oovs):
    """Map the tokens of the abstract (reference) to ids.

    Because PGN can generate OOV tokens that appeared in the source, the
    reference needs a different mapping: an OOV token that also occurs in
    the source keeps its temporary id instead of being replaced by the
    <UNK> id, so the training loss can be computed accurately. Different
    unknown words mean different things; giving them all the same id would
    make the error larger.

    Args:
        abstract_words (list): Tokens in the reference.
        vocab (vocab.Vocab): The vocabulary.
        source_oovs (list): OOV tokens in the source.

    Returns:
        list: The reference with tokens mapped into ids.
    """
    unk = vocab.UNK
    mapped = []
    for token in abstract_words:
        token_id = vocab[token]
        if token_id != unk:
            # Regular in-vocabulary token.
            mapped.append(token_id)
            continue
        if token not in source_oovs:
            # OOV that the source does not contain: fall back to <UNK>.
            mapped.append(unk)
            continue
        # In-source OOV: use its temporary id past the vocabulary.
        mapped.append(vocab.size() + source_oovs.index(token))
    return mapped
191 | 
192 | 
class Beam(object):
    """One (partial) hypothesis tracked during beam search."""

    def __init__(self,
                 tokens,
                 log_probs,
                 decoder_states,
                 coverage_vector):
        """Store the decoding state of the hypothesis.

        Args:
            tokens (list): Token ids generated so far.
            log_probs (list): Log probability of each generated token.
            decoder_states: Decoder states after the last token.
            coverage_vector (Tensor): Coverage after the last token.
        """
        self.tokens = tokens
        self.log_probs = log_probs
        self.decoder_states = decoder_states
        self.coverage_vector = coverage_vector

    def extend(self,
               token,
               log_prob,
               decoder_states,
               coverage_vector):
        """Return a new Beam with one more token; this Beam is left untouched."""
        longer_tokens = self.tokens + [token]
        longer_log_probs = self.log_probs + [log_prob]
        return Beam(longer_tokens,
                    longer_log_probs,
                    decoder_states,
                    coverage_vector)

    def seq_score(self):
        """Score the current sequence.

        The score follows the definitions in
        https://opennmt.net/OpenNMT/translation/beam_search/.
        1. Length normalization normalizes the cumulative log probability
        of the whole sequence.
        2. Coverage normalization favours sequences that fully cover the
        information in the source. (This serves a different purpose from
        the coverage mechanism defined in PGN.)
        3. ``config.alpha`` and ``config.beta`` control the strengths of the
        length and coverage terms.
        """
        coverage = self.coverage_vector
        # Length normalization.
        length_norm = (5 + len(self.tokens))**config.alpha / (5 + 1)**config.alpha
        # Coverage normalization: coverage values are capped at 1 before
        # taking the log.
        ceiling = torch.ones((1, coverage.shape[1])).to(torch.device(config.DEVICE))
        capped = torch.where(coverage < 1.0, coverage, ceiling)
        coverage_norm = config.beta * torch.sum(torch.log(config.eps + capped))

        return sum(self.log_probs) / length_norm + coverage_norm

    def __lt__(self, other):
        """Order beams by their sequence score."""
        return self.seq_score() < other.seq_score()

    def __le__(self, other):
        """Order beams by their sequence score."""
        return self.seq_score() <= other.seq_score()
249 | 
250 | 
def add2heap(heap, item, k):
    """Keep a min-heap of at most k items, dropping the smallest on overflow.

    Args:
        heap (list): The heap, modified in place.
        item (tuple):
            The tuple to store. Items are compared by the value in the
            first position; ties are broken by the second position, and
            so on.
        k (int): The capacity of the heap.
    """
    # While there is room just push; once the heap holds k items, push the
    # new item and pop the smallest one in a single step.
    insert = heapq.heappush if len(heap) < k else heapq.heappushpop
    insert(heap, item)
267 | 
268 | 
def replace_oovs(in_tensor, vocab):
    """Replace oov tokens in a tensor with the <UNK> token.

    Every id that is not a valid vocabulary index (``id > len(vocab) - 1``)
    is replaced by ``vocab.UNK``. The input tensor is not modified.

    Args:
        in_tensor (Tensor): The tensor of token ids before replacement.
        vocab (vocab.Vocab): The vocabulary.

    Returns:
        Tensor: The tensor after replacement, with the same shape, dtype
        and device as ``in_tensor``.
    """
    # full_like allocates the filler on in_tensor's own device, so the
    # function also works for tensors that do not live on the globally
    # configured device.
    oov_token = torch.full_like(in_tensor, vocab.UNK)
    out_tensor = torch.where(in_tensor > len(vocab) - 1, oov_token, in_tensor)
    return out_tensor
282 | 
283 | 
class ScheduledSampler():
    """Decide per step whether to use teacher forcing (scheduled sampling)."""

    def __init__(self, phases):
        """Pre-compute the sampling threshold of every phase.

        Args:
            phases (int): Number of phases. The thresholds grow linearly
                from 0.0 (first phase) to 1.0 (last phase).
        """
        self.phases = phases
        if phases > 1:
            self.scheduled_probs = [i / (phases - 1) for i in range(phases)]
        else:
            # With a single phase the linear schedule would divide by zero;
            # use threshold 0.0 (always teacher forcing) instead. Zero or
            # negative phase counts yield an empty schedule.
            self.scheduled_probs = [0.0] * max(phases, 0)

    def teacher_forcing(self, phase):
        """Randomly decide whether to apply teacher forcing in ``phase``.

        At each time step teacher forcing is applied with probability p and
        skipped with probability 1 - p. p decays over the phases: early in
        training the ground truth is always used to speed up convergence,
        and later it is gradually replaced by the model's own output, so
        that late training matches the prediction setting.
        Paper: https://arxiv.org/pdf/1506.03099.pdf
        Article: https://blog.csdn.net/qq_30219017/article/details/89090690

        Args:
            phase (int): Index into the schedule. With phase = 0 teacher
                forcing is always applied; with phase = self.phases - 1
                (and more than one phase) it is never applied.

        Returns:
            bool: teacher_forcing or not
        """
        # random.random() is in [0, 1), so a threshold of 0.0 always passes
        # and a threshold of 1.0 never does.
        return random.random() >= self.scheduled_probs[phase]
311 | 
312 | 
def config_info(config):
    """Summarize the main model switches of a configuration in one line.

    Args:
        config (module): The configuration defined in model/config.py.
    Returns:
        string: config information
    """
    template = 'model_name = {}, pointer = {}, coverage = {}, fine_tune = {}, scheduled_sampling = {}, weight_tying = {},' + 'source = {}  '
    # Values in the order in which the template expects them.
    values = (
        config.model_name,
        config.pointer,
        config.coverage,
        config.fine_tune,
        config.scheduled_sampling,
        config.weight_tying,
        config.source,
    )
    return template.format(*values)
325 | 


--------------------------------------------------------------------------------
/model/predict.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | '''
  4 | @Author: author
  5 | @Date: 2020-07-13 11:00:51
  6 | LastEditTime: 2020-10-17 21:23:34
  7 | LastEditors: Please set LastEditors
  8 | @Description: Generate a summary.
  9 | @FilePath: /project_2/model/predict.py
 10 | '''
 11 | 
 12 | import random
 13 | import os
 14 | import sys
 15 | import pathlib
 16 | 
 17 | import torch
 18 | import jieba
 19 | 
 20 | import config
 21 | from model import PGN
 22 | from dataset import PairDataset
 23 | from utils import source2ids, outputids2words, Beam, timer, add2heap, replace_oovs
 24 | 
 25 | abs_path = pathlib.Path(__file__).parent.absolute()
 26 | sys.path.append(sys.path.append(abs_path))
 27 | curPath = os.path.abspath(os.path.dirname(__file__)) + '/'
 28 | 
 29 | 
 30 | class Predict():
 31 |     @timer(module='initalize predicter')
 32 |     def __init__(self):
 33 |         self.DEVICE = config.DEVICE
 34 | 
 35 |         dataset = PairDataset(config.data_path,
 36 |                               max_src_len=config.max_src_len,
 37 |                               max_tgt_len=config.max_tgt_len,
 38 |                               truncate_src=config.truncate_src,
 39 |                               truncate_tgt=config.truncate_tgt)
 40 | 
 41 |         self.vocab = dataset.build_vocab(embed_file=config.embed_file)
 42 | 
 43 |         self.model = PGN(self.vocab)
 44 |         self.stop_word = list(
 45 |             set([
 46 |                 self.vocab[x.strip()] for x in
 47 |                 open(config.stop_word_file, encoding='utf-8').readlines()
 48 |             ]))
 49 |         self.model.load_model()
 50 |         self.model.to(self.DEVICE)
 51 | 
 52 |     def greedy_search(self,
 53 |                       x,
 54 |                       max_sum_len,
 55 |                       len_oovs,
 56 |                       x_padding_masks):
 57 |         """Function which returns a summary by always picking
 58 |            the highest probability option conditioned on the previous word.
 59 | 
 60 |         Args:
 61 |             x (Tensor): Input sequence as the source.
 62 |             max_sum_len (int): The maximum length a summary can have.
 63 |             len_oovs (Tensor): Numbers of out-of-vocabulary tokens.
 64 |             x_padding_masks (Tensor):
 65 |                 The padding masks for the input sequences
 66 |                 with shape (batch_size, seq_len).
 67 | 
 68 |         Returns:
 69 |             summary (list): The token list of the result summary.
 70 |         """
 71 | 
 72 |         # Get encoder output and states.Call encoder forward propagation
 73 |         ###########################################
 74 |         #          TODO: module 4 task 2          #
 75 |         ###########################################
 76 |         # use decoder to generate vocab distribution for the next token
 77 |         encoder_output, encoder_states = self.model.encoder(
 78 |             replace_oovs(x, self.vocab), self.model.decoder.embedding)
 79 | 
 80 |         # Initialize decoder's hidden states with encoder's hidden states.
 81 |         decoder_states = self.model.reduce_state(encoder_states)
 82 | 
 83 |         # Initialize decoder's input at time step 0 with the SOS token.
 84 |         x_t = torch.ones(1) * self.vocab.SOS
 85 |         x_t = x_t.to(self.DEVICE, dtype=torch.int64)
 86 |         summary = [self.vocab.SOS]
 87 |         coverage_vector = torch.zeros((1, x.shape[1])).to(self.DEVICE)
 88 |         # Generate hypothesis with maximum decode step.
 89 |         while int(x_t.item()) != (self.vocab.EOS) \
 90 |                 and len(summary) < max_sum_len:
 91 | 
 92 |             context_vector, attention_weights, coverage_vector = \
 93 |                 self.model.attention(decoder_states,
 94 |                                      encoder_output,
 95 |                                      x_padding_masks,
 96 |                                      coverage_vector)
 97 |             p_vocab, decoder_states, p_gen = \
 98 |                 self.model.decoder(x_t.unsqueeze(1),
 99 |                                    decoder_states,
100 |                                    context_vector)
101 |             final_dist = self.model.get_final_distribution(x,
102 |                                                            p_gen,
103 |                                                            p_vocab,
104 |                                                            attention_weights,
105 |                                                            torch.max(len_oovs))
106 |             # Get next token with maximum probability.
107 |             x_t = torch.argmax(final_dist, dim=1).to(self.DEVICE)
108 |             decoder_word_idx = x_t.item()
109 |             summary.append(decoder_word_idx)
110 |             x_t = replace_oovs(x_t, self.vocab)
111 | 
112 |         return summary
113 | 
114 | #     @timer('best k')
115 |     def best_k(self, beam, k, encoder_output, x_padding_masks, x, len_oovs):
116 |         """Get best k tokens to extend the current sequence at the current time step.
117 | 
118 |         Args:
119 |             beam (untils.Beam): The candidate beam to be extended.
120 |             k (int): Beam size.
121 |             encoder_output (Tensor): The lstm output from the encoder.
122 |             x_padding_masks (Tensor):
123 |                 The padding masks for the input sequences.
124 |             x (Tensor): Source token ids.
125 |             len_oovs (Tensor): Number of oov tokens in a batch.
126 | 
127 |         Returns:
128 |             best_k (list(Beam)): The list of best k candidates.
129 | 
130 |         """
131 |         # use decoder to generate vocab distribution for the next token
132 |         x_t = torch.tensor(beam.tokens[-1]).reshape(1, 1)
133 |         x_t = x_t.to(self.DEVICE)
134 | 
135 |         # Get context vector from attention network.
136 |         context_vector, attention_weights, coverage_vector = \
137 |             self.model.attention(beam.decoder_states,
138 |                                  encoder_output,
139 |                                  x_padding_masks,
140 |                                  beam.coverage_vector)
141 | 
142 |         # Replace the indexes of OOV words with the index of OOV token
143 |         # to prevent index-out-of-bound error in the decoder.
144 | 
145 |         p_vocab, decoder_states, p_gen = \
146 |             self.model.decoder(replace_oovs(x_t, self.vocab),
147 |                                beam.decoder_states,
148 |                                context_vector)
149 | 
150 |         final_dist = self.model.get_final_distribution(x,
151 |                                                        p_gen,
152 |                                                        p_vocab,
153 |                                                        attention_weights,
154 |                                                        torch.max(len_oovs))
155 |         # Calculate log probabilities.
156 |         log_probs = torch.log(final_dist.squeeze())
157 |         # Filter forbidden tokens.
158 |         if len(beam.tokens) == 1:
159 |             forbidden_ids = [
160 |                 self.vocab[u"台独"],
161 |                 self.vocab[u"吸毒"],
162 |                 self.vocab[u"黄赌毒"]
163 |             ]
164 |             log_probs[forbidden_ids] = -float('inf')
165 |         # EOS token penalty. Follow the definition in
166 |         # https://opennmt.net/OpenNMT/translation/beam_search/.
167 |         log_probs[self.vocab.EOS] *= \
168 |             config.gamma * x.size()[1] / len(beam.tokens)
169 | 
170 |         log_probs[self.vocab.UNK] = -float('inf')
171 |         # Get top k tokens and the corresponding logprob.
172 |         topk_probs, topk_idx = torch.topk(log_probs, k)
173 | 
174 |         # Extend the current hypo with top k tokens, resulting k new hypos.
175 |         best_k = [beam.extend(x,
176 |                   log_probs[x],
177 |                   decoder_states,
178 |                   coverage_vector) for x in topk_idx.tolist()]
179 | 
180 |         return best_k
181 | 
182 |     def beam_search(self,
183 |                     x,
184 |                     max_sum_len,
185 |                     beam_width,
186 |                     len_oovs,
187 |                     x_padding_masks):
188 |         """Using beam search to generate summary.
189 | 
190 |         Args:
191 |             x (Tensor): Input sequence as the source.
192 |             max_sum_len (int): The maximum length a summary can have.
193 |             beam_width (int): Beam size.
194 |             max_oovs (int): Number of out-of-vocabulary tokens.
195 |             x_padding_masks (Tensor):
196 |                 The padding masks for the input sequences.
197 | 
198 |         Returns:
199 |             result (list(Beam)): The list of best k candidates.
200 |         """
201 |         # run body_sequence input through encoder. Call encoder forward propagation
202 |         ###########################################
203 |         #          TODO: module 4 task 2          #
204 |         ###########################################
205 |         encoder_output, encoder_states = self.model.encoder(
206 |             replace_oovs(x, self.vocab), self.model.decoder.embedding)
207 |         coverage_vector = torch.zeros((1, x.shape[1])).to(self.DEVICE)
208 |         # initialize decoder states with encoder forward states
209 |         decoder_states = self.model.reduce_state(encoder_states)
210 | 
211 |         # initialize the hypothesis with a class Beam instance.
212 | 
213 |         init_beam = Beam([self.vocab.SOS],
214 |                          [0],
215 |                          decoder_states,
216 |                          coverage_vector)
217 | 
218 |         # get the beam size and create a list for stroing current candidates
219 |         # and a list for completed hypothesis
220 |         k = beam_width
221 |         curr, completed = [init_beam], []
222 | 
223 |         # use beam search for max_sum_len (maximum length) steps
224 |         for _ in range(max_sum_len):
225 |             # get k best hypothesis when adding a new token
226 | 
227 |             topk = []
228 |             for beam in curr:
229 |                 # When an EOS token is generated, add the hypo to the completed
230 |                 # list and decrease beam size.
231 |                 if beam.tokens[-1] == self.vocab.EOS:
232 |                     completed.append(beam)
233 |                     k -= 1
234 |                     continue
235 |                 for can in self.best_k(beam, k,
236 |                                        encoder_output, x_padding_masks, x,
237 |                                        torch.max(len_oovs)):
238 |                     # Using topk as a heap to keep track of top k candidates.
239 |                     # Using the sequence scores of the hypos to campare
240 |                     # and object ids to break ties.
241 |                     add2heap(topk, (can.seq_score(), id(can), can), k)
242 | 
243 |             curr = [items[2] for items in topk]
244 |             # stop when there are enough completed hypothesis
245 |             if len(completed) == beam_width:
246 |                 break
247 |         # When there are not engouh completed hypotheses,
248 |         # take whatever when have in current best k as the final candidates.
249 |         completed += curr
250 |         # sort the hypothesis by normalized probability and choose the best one
251 |         result = sorted(completed,
252 |                         key=lambda x: x.seq_score(),
253 |                         reverse=True)[0].tokens
254 |         return result
255 | 
256 |     @timer(module='doing prediction')
257 |     def predict(self, text, tokenize=True, beam_search=True):
258 |         """Generate summary.
259 | 
260 |         Args:
261 |             text (str or list): Source.
262 |             tokenize (bool, optional):
263 |                 Whether to do tokenize or not. Defaults to True.
264 |             beam_search (bool, optional):
265 |                 Whether to use beam search or not.
266 |                 Defaults to True (means using greedy search).
267 | 
268 |         Returns:
269 |             str: The final summary.
270 |         """
271 |         if isinstance(text, str) and tokenize:
272 |             text = list(jieba.cut(text))
273 |         x, oov = source2ids(text, self.vocab)
274 |         x = torch.tensor(x).to(self.DEVICE)
275 |         len_oovs = torch.tensor([len(oov)]).to(self.DEVICE)
276 |         x_padding_masks = torch.ne(x, 0).byte().float()
277 |         if beam_search:
278 |             summary = self.beam_search(x.unsqueeze(0),
279 |                                        max_sum_len=config.max_dec_steps,
280 |                                        beam_width=config.beam_size,
281 |                                        len_oovs=len_oovs,
282 |                                        x_padding_masks=x_padding_masks)
283 |         else:
284 |             summary = self.greedy_search(x.unsqueeze(0),
285 |                                          max_sum_len=config.max_dec_steps,
286 |                                          len_oovs=len_oovs,
287 |                                          x_padding_masks=x_padding_masks)
288 |         summary = outputids2words(summary,
289 |                                   oov,
290 |                                   self.vocab)
291 |         return summary.replace('<SOS>', '').replace('<EOS>', '').strip()
292 | 
293 | 
if __name__ == "__main__":
    pred = Predict()
    print('vocab_size: ', len(pred.vocab))
    # Randomly pick a sample in test set to predict. Each line holds the
    # source and the reference separated by '<sep>'.
    with open(config.test_data_path, 'r', encoding='utf-8') as test:
        samples = test.readlines()
        picked = random.choice(samples)
        source, ref = picked.strip().split('<sep>')

    print('source: ', source, '\n')
    # Decode the same source with both strategies for comparison.
    greedy_prediction = pred.predict(source.split(), beam_search=False)
    print('greedy: ', greedy_prediction, '\n')
    beam_prediction = pred.predict(source.split(), beam_search=True)
    print('beam: ', beam_prediction, '\n')
    print('ref: ', ref, '\n')
308 | 


--------------------------------------------------------------------------------
/model/model.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | '''
  4 | @Author: your name
  5 | @Date: 2020-07-13 11:00:51
  6 | LastEditTime: 2021-09-03 16:02:14
  7 | LastEditors: Please set LastEditors
  8 | @Description: Define the model.
  9 | @FilePath: /project_2/model/model.py
 10 | '''
 11 | 
 12 | 
 13 | import os
 14 | import sys
 15 | import pathlib
 16 | 
 17 | import torch
 18 | import torch.nn as nn
 19 | import torch.nn.functional as F
 20 | 
 21 | import config
 22 | from utils import timer, replace_oovs
 23 | 
 24 | abs_path = pathlib.Path(__file__).parent.absolute()
 25 | sys.path.append(sys.path.append(abs_path))
 26 | curPath = os.path.abspath(os.path.dirname(__file__)) + '/'
 27 | 
 28 | 
 29 | class Encoder(nn.Module):
 30 |     def __init__(self,
 31 |                  vocab_size,
 32 |                  embed_size,
 33 |                  hidden_size,
 34 |                  rnn_drop: float = 0):
 35 |         super(Encoder, self).__init__()
 36 |         self.embedding = nn.Embedding(vocab_size, embed_size)
 37 |         self.hidden_size = hidden_size
 38 |         self.lstm = nn.LSTM(embed_size,
 39 |                             hidden_size,
 40 |                             bidirectional=True,
 41 |                             dropout=rnn_drop,
 42 |                             batch_first=True)
 43 | 
 44 | #     @timer('encoder')
 45 | # forward调用链 https://blog.csdn.net/u011501388/article/details/84062483
 46 |     def forward(self, x, decoder_embedding):
 47 |         """Define forward propagation for the endoer.
 48 | 
 49 |         Args:
 50 |             x (Tensor): The input samples as shape (batch_size, seq_len).
 51 |             decoder_embedding (torch.nn.modules): The input embedding layer from decoder
 52 |         Returns:
 53 |             output (Tensor):
 54 |                 The output of lstm with shape
 55 |                 (batch_size, seq_len, 2 * hidden_units).
 56 |             hidden (tuple):
 57 |                 The hidden states of lstm (h_n, c_n).
 58 |                 Each with shape (2, batch_size, hidden_units)
 59 |         """
 60 |         ###########################################
 61 |         #          TODO: module 4 task 1          #
 62 |         ###########################################
 63 |         if config.weight_tying:
 64 |             embedded = decoder_embedding(x)
 65 |         else:
 66 |             embedded = self.embedding(x)
 67 |         output, hidden = self.lstm(embedded)
 68 |         return output, hidden
 69 | 
 70 | 
 71 | class Attention(nn.Module):
 72 |     def __init__(self, hidden_units):
 73 |         super(Attention, self).__init__()
 74 |         # Define feed-forward layers.
 75 |         self.Wh = nn.Linear(2 * hidden_units, 2 * hidden_units, bias=False)
 76 |         self.Ws = nn.Linear(2 * hidden_units, 2 * hidden_units)
 77 |         # wc for coverage feature
 78 |         self.wc = nn.Linear(1, 2 * hidden_units, bias=False)
 79 |         self.v = nn.Linear(2 * hidden_units, 1, bias=False)
 80 | 
 81 | #     @timer('attention')
 82 |     def forward(self,
 83 |                 decoder_states,
 84 |                 encoder_output,
 85 |                 x_padding_masks,
 86 |                 coverage_vector):
 87 |         """Define forward propagation for the attention network.
 88 | 
 89 |         Args:
 90 |             decoder_states (tuple):
 91 |                 The hidden states from lstm (h_n, c_n) in the decoder,
 92 |                 each with shape (1, batch_size, hidden_units)
 93 |             encoder_output (Tensor):
 94 |                 The output from the lstm in the decoder with
 95 |                 shape (batch_size, seq_len, hidden_units).
 96 |             x_padding_masks (Tensor):
 97 |                 The padding masks for the input sequences
 98 |                 with shape (batch_size, seq_len).
 99 |             coverage_vector (Tensor):
100 |                 The coverage vector from last time step.
101 |                 with shape (batch_size, seq_len).
102 | 
103 |         Returns:
104 |             context_vector (Tensor):
105 |                 Dot products of attention weights and encoder hidden states.
106 |                 The shape is (batch_size, 2*hidden_units).
107 |             attention_weights (Tensor): The shape is (batch_size, seq_length).
108 |             coverage_vector (Tensor): The shape is (batch_size, seq_length).
109 |         """
110 |         # Concatenate h and c to get s_t and expand the dim of s_t.
111 |         h_dec, c_dec = decoder_states
112 |         # (1, batch_size, 2*hidden_units)
113 |         s_t = torch.cat([h_dec, c_dec], dim=2)
114 |         # (batch_size, 1, 2*hidden_units)
115 |         s_t = s_t.transpose(0, 1)
116 |         # (batch_size, seq_length, 2*hidden_units)
117 |         # contiguous 把tensor变成在内存中连续分布的形式
118 |         # https://blog.csdn.net/Z199448Y/article/details/89384158
119 |         s_t = s_t.expand_as(encoder_output).contiguous()
120 | 
121 |         # calculate attention scores
122 |         # Equation(11).
123 |         # Wh h_* (batch_size, seq_length, 2*hidden_units)
124 |         encoder_features = self.Wh(encoder_output.contiguous())
125 |         # Ws s_t (batch_size, seq_length, 2*hidden_units)
126 |         decoder_features = self.Ws(s_t)
127 |         # (batch_size, seq_length, 2*hidden_units)
128 |         att_inputs = encoder_features + decoder_features
129 | 
130 |         # Add coverage feature.
131 |         if config.coverage:
132 |             coverage_features = self.wc(coverage_vector.unsqueeze(2))  # wc c
133 |             # 论文公式11相对公式1，新增 coverage_features
134 |             att_inputs = att_inputs + coverage_features
135 | 
136 |         # 论文公式11
137 |         # (batch_size, seq_length, 1)
138 |         score = self.v(torch.tanh(att_inputs))
139 |         # (batch_size, seq_length)
140 |         attention_weights = F.softmax(score, dim=1).squeeze(2)
141 |         attention_weights = attention_weights * x_padding_masks
142 |         # Normalize attention weights after excluding padded positions.
143 |         # 此处的标准化不一定需要，因为前面已经经过softmax归一化处理了
144 |         normalization_factor = attention_weights.sum(1, keepdim=True)
145 |         attention_weights = attention_weights / normalization_factor
146 |         # (batch_size, 1, 2*hidden_units)
147 |         # torch.bmm() 与 torch.matmul() 区别 https://blog.csdn.net/foneone/article/details/103876519
148 |         # bmm 强制规定维度和大小相同
149 |         context_vector = torch.bmm(attention_weights.unsqueeze(1),
150 |                                    encoder_output)
151 |         # (batch_size, 2*hidden_units)
152 |         context_vector = context_vector.squeeze(1)
153 | 
154 |         # Update coverage vector.
155 |         if config.coverage:
156 |             # 论文公式10。解码的每个时间步，通过attention维护coverage_vector
157 |             coverage_vector = coverage_vector + attention_weights
158 | 
159 |         return context_vector, attention_weights, coverage_vector
160 | 
161 | 
class Decoder(nn.Module):
    """Single-step LSTM decoder producing a vocabulary distribution and p_gen."""

    def __init__(self,
                 vocab_size,
                 embed_size,
                 hidden_size,
                 enc_hidden_size=None,
                 is_cuda=True):
        """Create the embedding, LSTM and output projections.

        Args:
            vocab_size: Size of the output vocabulary / embedding table.
            embed_size: Dimension of each token embedding.
            hidden_size: Hidden units of the decoder LSTM.
            enc_hidden_size: Accepted for interface compatibility; not used
                by this class.
            is_cuda: Selects the value stored in ``self.DEVICE``.
        """
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.DEVICE = torch.device('cuda') if is_cuda else torch.device('cpu')
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)

        # W1 consumes [decoder output; context vector] = 3 * hidden_size.
        self.W1 = nn.Linear(self.hidden_size * 3, self.hidden_size)
        self.W2 = nn.Linear(self.hidden_size, vocab_size)
        if config.pointer:
            # Input is [context (2h); s_t = [h; c] (2h); embedding (embed)].
            self.w_gen = nn.Linear(self.hidden_size * 4 + embed_size, 1)

    def forward(self, x_t, decoder_states, context_vector):
        """Run one decoding step.

        Args:
            x_t (Tensor):
                The input of the decoder x_t of shape (batch_size, 1).
            decoder_states (tuple):
                The lstm states (h_n, c_n) of the decoder from the last time
                step. The shapes are (1, batch_size, hidden_units) for each.
            context_vector (Tensor):
                The context vector from the attention network
                of shape (batch_size, 2*hidden_units).

        Returns:
            p_vocab (Tensor):
                The vocabulary distribution of shape (batch_size, vocab_size).
            decoder_states (tuple):
                The updated lstm states of the decoder.
                The shapes are (1, batch_size, hidden_units) for each.
            p_gen (Tensor or None):
                The generation probabilities of shape (batch_size, 1), or
                None when ``config.pointer`` is off.
        """
        decoder_emb = self.embedding(x_t)

        decoder_output, decoder_states = self.lstm(decoder_emb, decoder_states)

        # Flatten the single time step: (batch_size, hidden_units).
        # Use the instance's own hidden size rather than the global config so
        # the reshape always agrees with how this decoder was constructed.
        decoder_output = decoder_output.view(-1, self.hidden_size)
        # Concatenate decoder output and context: (batch_size, 3*hidden_units)
        concat_vector = torch.cat(
            [decoder_output,
             context_vector],
            dim=-1)

        # Vocabulary distribution, paper equation (4).
        # (batch_size, hidden_units)
        FF1_out = self.W1(concat_vector)
        # (batch_size, vocab_size)
        if config.weight_tying:
            # Three-way tying: the encoder input embedding, the decoder input
            # embedding and the decoder output projection share one weight
            # matrix. The matrix product requires hidden_size == embed_size.
            FF2_out = torch.mm(FF1_out, torch.t(self.embedding.weight))
        else:
            FF2_out = self.W2(FF1_out)
        p_vocab = F.softmax(FF2_out, dim=1)

        # Concatenate h and c to get s_t: (1, batch_size, 2*hidden_units)
        h_dec, c_dec = decoder_states
        s_t = torch.cat([h_dec, c_dec], dim=2)

        p_gen = None
        if config.pointer:
            # Generation probability, paper equation (8). The paper sums the
            # projected terms; projecting their concatenation with a single
            # linear layer combines the same information.
            x_gen = torch.cat(
                [context_vector,
                 s_t.squeeze(0),
                 decoder_emb.squeeze(1)],
                dim=-1)
            p_gen = torch.sigmoid(self.w_gen(x_gen))

        return p_vocab, decoder_states, p_gen
251 | 
252 | 
class ReduceState(nn.Module):
    """Bridge between the bidirectional encoder and the unidirectional decoder.

    The encoder states have one slice per direction while the decoder LSTM
    expects a single one, so the two directions are merged by summation
    before the states are handed to the decoder.
    """

    def __init__(self):
        super().__init__()

    def forward(self, hidden):
        """Sum the encoder states over the direction axis.

        Args:
            hidden (tuple):
                Hidden states of encoder,
                each with shape (2, batch_size, hidden_units).

        Returns:
            tuple:
                Reduced hidden states,
                each with shape (1, batch_size, hidden_units).
        """
        hidden_state, cell_state = hidden
        # keepdim=True keeps the leading axis so the result has the
        # (1, batch_size, hidden_units) layout.
        merged_hidden = hidden_state.sum(dim=0, keepdim=True)
        merged_cell = cell_state.sum(dim=0, keepdim=True)
        return (merged_hidden, merged_cell)
279 | 
280 | 
class PGN(nn.Module):
    """Pointer-generator network built from Encoder, Attention, Decoder
    and ReduceState, with optional pointer and coverage mechanisms selected
    through ``config``.
    """

    def __init__(self, v):
        """Build the sub-modules.

        Args:
            v: Vocabulary object. ``len(v)`` is used as the vocabulary size
                and the object itself is handed to ``replace_oovs``.
        """
        super(PGN, self).__init__()
        self.v = v
        self.DEVICE = config.DEVICE
        self.attention = Attention(config.hidden_size)
        self.encoder = Encoder(
            len(v),
            config.embed_size,
            config.hidden_size,
        )
        self.decoder = Decoder(len(v),
                               config.embed_size,
                               config.hidden_size,
                               )
        self.reduce_state = ReduceState()

    def load_model(self):
        """Replace the sub-modules with previously saved ones when available.

        If ``config.encoder_save_name`` exists, all four sub-modules are
        loaded from the paths configured in ``config``. Otherwise, when
        ``config.fine_tune`` is set, they are loaded from the fixed
        ``../saved_model/pgn/`` directory. In any other case the freshly
        initialised sub-modules are kept.

        NOTE: torch.load unpickles whole module objects; only load
        checkpoints from trusted sources.
        """
        # map_location lets modules saved on one device (e.g. a GPU) be
        # restored on the device this model is configured to run on.
        if os.path.exists(config.encoder_save_name):
            print('Loading model: ', config.encoder_save_name)
            # The two ways of saving/loading models (whole module vs.
            # state_dict): https://www.jianshu.com/p/6ba95579082c
            self.encoder = torch.load(config.encoder_save_name,
                                      map_location=self.DEVICE)
            self.decoder = torch.load(config.decoder_save_name,
                                      map_location=self.DEVICE)
            self.attention = torch.load(config.attention_save_name,
                                        map_location=self.DEVICE)
            self.reduce_state = torch.load(config.reduce_state_save_name,
                                           map_location=self.DEVICE)

        elif config.fine_tune:
            print('Loading model: ', '../saved_model/pgn/encoder.pt')
            self.encoder = torch.load('../saved_model/pgn/encoder.pt',
                                      map_location=self.DEVICE)
            self.decoder = torch.load('../saved_model/pgn/decoder.pt',
                                      map_location=self.DEVICE)
            self.attention = torch.load('../saved_model/pgn/attention.pt',
                                        map_location=self.DEVICE)
            self.reduce_state = torch.load('../saved_model/pgn/reduce_state.pt',
                                           map_location=self.DEVICE)

    def get_final_distribution(self, x, p_gen, p_vocab, attention_weights,
                               max_oov):
        """Calculate the final distribution for the model.

        The pointer picks output tokens from the source according to the
        attention distribution, while the generator picks them from the
        vocabulary according to p_vocab. The two distributions have different
        lengths and their positions refer to different tokens, so they cannot
        be added directly.

        To align them, p_vocab is first extended with ``max_oov`` zero
        columns, one slot for every possible source OOV, so that every source
        token has a position in the extended distribution. The attention
        weights are then accumulated into those positions with
        ``scatter_add_``, using the indexed source ``x`` as the target
        positions.

        Args:
            x: (batch_size, seq_len) source token ids.
            p_gen: (batch_size, 1)
            p_vocab: (batch_size, vocab_size)
            attention_weights: (batch_size, seq_len)
            max_oov: (Tensor or int): Number of extra columns appended to
                p_vocab; callers pass the largest OOV count among the samples
                of the batch.

        Returns:
            final_distribution (Tensor):
                The final distribution over the extended vocabulary with
                shape (batch_size, vocab_size + max_oov). When
                ``config.pointer`` is off, ``p_vocab`` is returned unchanged.
        """

        if not config.pointer:
            return p_vocab

        batch_size = x.size()[0]
        # Clip p_gen into [0.001, 0.999] so neither branch is switched off
        # completely.
        p_gen = torch.clamp(p_gen, 0.001, 0.999)
        # Weighted probabilities, paper equation (9).
        p_vocab_weighted = p_gen * p_vocab
        # (batch_size, seq_len)
        attention_weighted = (1 - p_gen) * attention_weights

        # Extended-vocab distribution: vocab_size + max_oov columns.
        extension = torch.zeros((batch_size, max_oov)).float().to(self.DEVICE)
        # (batch_size, extended_vocab_size)
        p_vocab_extended = torch.cat([p_vocab_weighted, extension], dim=1)

        # Add the attention weights to the corresponding vocab positions,
        # paper equation (9). Illustrated scatter_add example:
        # https://www.cnblogs.com/dogecheng/p/11938009.html
        final_distribution = \
            p_vocab_extended.scatter_add_(dim=1,
                                          index=x,
                                          src=attention_weighted)

        return final_distribution

    @timer('model forward')
    def forward(self, x, x_len, y, len_oovs, batch, num_batches, teacher_forcing):
        """Define the forward propagation for the seq2seq model.

        Args:
            x (Tensor):
                Input sequences as source with shape (batch_size, seq_len)
            x_len ([int): Sequence length of the current batch.
                Not used in this method.
            y (Tensor):
                Input sequences as reference with shape (batch_size, y_len)
            len_oovs (Tensor):
                The numbers of out-of-vocabulary words for samples in this batch.
            batch (int): The number of the current batch.
                Not used in this method.
            num_batches(int): Number of batches in the epoch.
                Not used in this method.
            teacher_forcing(bool): teacher_forcing or not

        Returns:
            batch_loss (Tensor): The average loss of the current batch.
        """

        x_copy = replace_oovs(x, self.v)
        # 1.0 at non-padding source positions (token id != 0), 0.0 elsewhere.
        x_padding_masks = torch.ne(x, 0).byte().float()
        # Call encoder forward propagation.
        encoder_output, encoder_states = self.encoder(x_copy, self.decoder.embedding)
        # Reduce encoder hidden states.
        decoder_states = self.reduce_state(encoder_states)
        # Initialize coverage vector.
        coverage_vector = torch.zeros(x.size()).to(self.DEVICE)
        # Size of the OOV extension; identical for every decoding step.
        max_oov = torch.max(len_oovs)
        # Calculate loss for every step.
        step_losses = []
        # Use the ground truth as the decoder input of the first step.
        x_t = y[:, 0]

        for t in range(y.shape[1] - 1):

            # With teacher forcing the ground truth token is fed at every
            # step; otherwise the previous prediction is kept as input.
            if teacher_forcing:
                x_t = y[:, t]

            x_t = replace_oovs(x_t, self.v)

            y_t = y[:, t + 1]
            # Coverage accumulated over the previous steps only. The
            # attention module returns the vector already updated with the
            # current weights, so keep a handle on the old one for the
            # coverage loss below.
            prev_coverage = coverage_vector
            # Get context vector from the attention network.
            context_vector, attention_weights, coverage_vector = \
                self.attention(decoder_states,
                               encoder_output,
                               x_padding_masks,
                               coverage_vector)
            # Get vocab distribution and hidden states from the decoder.
            p_vocab, decoder_states, p_gen = self.decoder(x_t.unsqueeze(1),
                                                          decoder_states,
                                                          context_vector)

            final_dist = self.get_final_distribution(x,
                                                     p_gen,
                                                     p_vocab,
                                                     attention_weights,
                                                     max_oov)
            # The prediction of step t is the input of step t+1.
            x_t = torch.argmax(final_dist, dim=1).to(self.DEVICE)

            # Without the pointer the distribution has no OOV slots, so map
            # OOV targets back into the vocabulary.
            if not config.pointer:
                y_t = replace_oovs(y_t, self.v)

            # Pick the probability the model assigns to each target token.
            # gather: https://blog.csdn.net/cpluss/article/details/90260550
            target_probs = torch.gather(final_dist, 1, y_t.unsqueeze(1))
            target_probs = target_probs.squeeze(1)

            # Apply a mask such that pad zeros do not affect the loss.
            mask = torch.ne(y_t, 0).byte()
            # Do smoothing to prevent getting NaN loss because of log(0).
            loss = -torch.log(target_probs + config.eps)

            if config.coverage:
                # Coverage loss, paper equation (12):
                #   covloss_t = sum_i min(a_i^t, c_i^t)
                # where c^t is the coverage BEFORE the current attention is
                # added. Using the updated vector (c^t + a^t) would make the
                # minimum always equal a^t, turning the term into a constant
                # that carries no gradient.
                ct_min = torch.min(attention_weights, prev_coverage)
                cov_loss = torch.sum(ct_min, dim=1)
                # Paper equation (13): the penalty discourages attending to
                # the same source positions again, i.e. repeated output.
                loss = loss + config.LAMBDA * cov_loss

            mask = mask.float()
            loss = loss * mask

            step_losses.append(loss)

        sample_losses = torch.sum(torch.stack(step_losses, 1), 1)
        # Get the non-padded length of each sequence in the batch.
        seq_len_mask = torch.ne(y, 0).byte().float()
        batch_seq_len = torch.sum(seq_len_mask, dim=1)

        # Normalise each sample loss by its target length, then average
        # over the batch.
        batch_loss = torch.mean(sample_losses / batch_seq_len)
        return batch_loss
481 | 


--------------------------------------------------------------------------------
/运行结果.txt:
--------------------------------------------------------------------------------
  1 | ######################################################################################################################
  2 | #         PGN													     #
  3 | #         pointer = True、coverage = False、fine_tune = False、scheduled_sampling = False、weight_tying = False      #
  4 | ######################################################################################################################
  5 | 
  6 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 train.py
  7 | Epoch 0: training loss:3.88209558283199 validation loss:3.504600860369511
  8 | Epoch 1: training loss:3.2578509818207135 validation loss:3.3797141286807184
  9 | Epoch 2: training loss:2.9778695575323972 validation loss:3.3768604500935626
 10 | Epoch 3: training loss:2.742982468214902 validation loss:3.427774029664504
 11 | Epoch 4: training loss:2.5292034436139192 validation loss:3.523524146813613
 12 | Epoch 5: training loss:2.3421160951094193 validation loss:3.6446718180026765
 13 | Epoch 6: training loss:2.178770269393921 validation loss:3.7735544939835868
 14 | Epoch 7: training loss:2.0416961255940524 validation loss:3.916442552056068
 15 | 
 16 | 
 17 | ser10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 rouge_eval.py
 18 | rouge1:  {'f': 0.2447488664574899, 'p': 0.3247486290344969, 'r': 0.2015132759240438}
 19 | rouge2:  {'f': 0.047085553397356525, 'p': 0.06234250283983856, 'r': 0.03891577878627682}
 20 | rougeL:  {'f': 0.15853867659583096, 'p': 0.22493607674945412, 'r': 0.12608364916245707}
 21 | 
 22 | 
 23 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
 24 | source:  女童 连衣裙 秋冬季 新品 冰雪 奇缘 儿童 加绒 长袖 公主 裙 品牌童装 爱莎 长袖 裙子 衣服 小女孩 圣诞节 演出 礼服 春秋 蓝色 裙子 披纱 皇冠 魔杖 110码 建议 高 左右 公主 裙 ， 加绒 加厚 寒 保暖 ， 质感 纱裙 ， 银线 搭配 爱心 ， 亮片 点缀 ， 披风 可拆卸 ， 升级版 ， 腰部 设计 ， 纽扣 设计 ， 高领 设计 ， 面料 信 ， 模特 展示 ， 图案 设计 ， 加绒 加厚 长绒棉 ， 产品 信息 ， 爱心 水钻 的 点缀 ， 凸 ，采用 加绒 加厚 面料 ， 柔软 纱裙 摆 ， 银色 亮片 腰带 加上 ， 更 多样化 的 展示 ， 宝宝 的 美丽 ， 气 ， 不 掉 亮片 ， ， 质感 纱裙 加上 亮片 ， 蓝色 ， 图案 印花 ， 粉色 ， 更显 质感 ， 舒适 透 ， 可拆卸 披风 ， ， 的 点缀 ， 唯美 华 ， 显 衣服 的 品质 ， ， 秋冬 新款 冰雪 奇缘 连衣裙 ， 衣 长 ， 适合 身高 ， 冰雪 奇缘 蓝 ， 蓝色 、 粉 ， 码数 ， 胸围 ， 冰雪 奇缘 粉 ， 高领 设计 更 保暖 ， 加厚 加绒面料 加上 ， 冰雪 奇缘 主题 公主 图 ， 柔软性 强 舒适 亲肤 经久 耐 ， 设计 ， 更 舒适 ， 案 设计 ， 大方 ， 高领 设计 保暖 ， 案 设计 ， 细节 展示 ， 设计 ， 舒适 ， 冰雪 奇缘 公主 图 ， 计 ， 既 美观 ， 又 方便快捷 ，流行元素 拼接 风格 淑女风 图案 卡通动漫 材质成分 聚酯纤维 适用年龄 9-12岁 面料 棉混纺 裙型 百褶裙 价格 0-99 袖长 长袖 分类 其它 适用季节 冬季 是否有腰带 无腰带 安全等级 B类 上市时间 2019年秋季
 25 | greedy:  <UNK> 的 连衣裙 ， 采用 了 亮片 的 点缀 ， 让 衣服 更加 的 时尚 ， 同时 也 能 彰显 出 孩子 的 童真 活力 。
 26 | beam:  可爱 的 主题 印花 图案 ， 甜美 可爱 ， 充满 了 灵动 的 美感 。 宽松 的 版型 设计 ， 包容性 好 ， 不 挑 身材 ， 适合 不同 体型 的 人 穿着 。
 27 | ref:  这 款 连衣裙 ， 整体 的 设计 增添 层次感 看起来 十分 优雅 呢 ， 采用 优质 的 聚酯纤维 面料 ， 质地 柔软 舒适 ， 不 掉色 不 起球 ， 独特 的 捏 褶裙 摆 设计 ， 彰显 出 女孩子 甜美 优雅 的 一面 ， 让宝贝 漂漂亮亮 的 度过 夏天 。
 28 | 
 29 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
 30 | source:  男装 都市 特工 连帽 插 肩袖 运动 卫衣 黑色 执行 标准 ， 产地 : 广东省 广州市 ， 品名 : 卫衣 ， ( 直接 接触 皮肤 类 ， 安全 类别 : ， 面料 : 70% 棉 ， 成份 : ， 颜色 : 灰色 ， 更 有 运动感 ， 门襟 拉链 ， ， 30% 聚苯 纤维 ， 吊牌 介绍 ， 领口 、 下摆 、 袖口 采用 罗纹 设计 ， 防风 紧致 ， 洗水 咳 标识 注释 ， 不可 于 洗厚度 常规 材质 其它 袖型 常规袖 面料材质 棉 领型 圆领 流行元素 其它 款式 开衫 上市时间 2018年春季 图案 其它 版型 标准型 袖长 长袖 基础风格 青春流行 适用人群 青年 适用场景 其它 风格 青春休闲
 31 | greedy:  卫衣 采用 连帽 的 设计 ， 可以 很 好 的 防风 保暖 ， 同时 也 能 为 你 的 头部 带来 防风 保暖 的 效果 。 宽松 的 版型 设计 ， 包容性 强 ， 适合 更多 身型 穿着 。
 32 | beam:  运动 休闲 风 的 设计 ， 采用 连帽 的 设计 ， 防风 保暖 的 同时 ， 还 能 防风 保暖 。 胸前 的 品牌 logo ， 彰显 品质 。
 33 | ref:  连帽 设计 配合 橡筋 抽绳 ， 为 帅气 卫衣 平添 动感 气息 ， 宽松 立体 的 版型 剪裁 加上 侧门 襟 撞 色织 带 工艺 ， 展现 卓尔 不同 的 设计 品味 与 青春 个性 。 插 肩袖 设计 优化 臂膀 ， 穿着 轻松 。弹性 袖 摆 收束 灵活 搭配 利落 。
 34 | 
 35 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
 36 | source:  夏娃 之秀 内衣 女 文胸 无 钢圈 简约 蕾丝 性感 聚拢 调整型 收副乳 胸罩 杏色 厚杯 设计 的 骄傲 ， 采用 Q 弹 KE 魔力 胶 ， 达到 食品卫生 标准 ， 透气 蜂巢 洞洞 硅胶 ， 舒适 无 钢圈 ， 两种 杯型 量身 打造 ， KE 魔力 胶 罩杯 ， 内衣 的 细节 夏娃 从不 将 就 ， 性感 聚拢 温柔 优雅 ， 四排 四扣 ， 可 根据 体型 和 生理 周期 自由 调节 ， 丝滑 亲肤 杯面 ， 柔软 蕾丝 ， 性感 聚拢 ， 时尚 优雅 ， 加宽 肩带 ， 可拆卸 调节 ， 9 字形 挂扣 ， 穿着 更 稳定 ， 无 钢圈 柔美 蕾丝 聚拢 文胸 ， 任意 扭曲 拉伸 不 变形 ， 增强 内衣 使用寿命 ， 给 你 舒适 又 “ 有 料 ” 的 体验 ， 性感 蕾丝 女内裤 ， 杏色 正面 ， 细节展示 ， 模特 展示 ， 根据 不同 人群 打造 不同 罩杯 厚度 ， 只为 您 的 更好 体验 ， 只 为 满足 你 对 内衣 挑剔 的 标准 ， 属于 你 的 小 幸运 ， 颜色 选择 ， 更显 唯美 典雅 ， 配以 鸡心 的 小花 珍珠 装饰，， 防滑 肩带 ， 银灰 正面 ， 黑色 正面 ， 黑色 背面 ， 聚拢 挺胸 ， 银灰 背面 ， Q 弹 透明 乳胶 ， 如同 婴儿 奶嘴 般 安全 ， KE 魔力 胶 ， 魔力 挺 设计 ， 约 1cm ， 约 3cm ， 可机 洗 ， 不 变形 ， 柔软有 光泽感 的 蕾丝 印花 杯面 ， ， 收 腋下 脂肪 ， 杏色 背面 ， 柔软 ， 法国 的 蕾丝 和 花边 ， 四排 四扣适用季节 四季 图案 纯色 有无钢圈 无钢圈 功能 上托 罩杯 3/4罩杯 适合胸型 下垂 风格 性感 罩杯里料硅胶 插片 无插片 面料 锦纶 胸围尺码 80A 模杯厚度 上薄下厚模杯 适用人群 青年 款式细节 蕾丝边
 37 | greedy:  <UNK> 蕾丝 蕾丝 ， 杯面 的 蕾丝 花边 ， 性感 不失 性感 ， 让 你 轻松 穿 出 浪漫 唯美 的 氛围 。
 38 | beam:  精致 的 蕾丝 面料 ， 柔软 亲肤 ， 给 你 带来 不 一样 的 穿着 体验 。 无 钢圈 的 设计 ， 可以 很 好 的 承托 胸部 ， 穿着 舒适 不 紧绷 。
 39 | ref:  舒适 无 钢圈 文胸 ， 亲肤 细腻 的 软 蕾丝 ， 有着 很 好 的 透气性 简洁 精致 ， 给 人 莫名 的 高级 感 ， 四排 四扣 有效 有收 副 乳 加上 蕾丝 边 的 点缀 ， 让 这款 文胸 更加 的 有 设计 感 ， 穿着 性感 有 女人味 ， 无 钢圈 设计 ， 穿着 无束 。、
 40 | 
 41 | 
 42 | 
 43 | 
 44 | ######################################################################################################################
 45 | #         PGN (with coverage)											     #
 46 | #         pointer = True、coverage = True、fine_tune = False、scheduled_sampling = False、weight_tying = False       #
 47 | ######################################################################################################################
 48 | 
 49 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 train.py
 50 | Epoch 0:  training loss:4.855625686818903 validation loss:4.487559935221305
 51 | Epoch 1:  training loss:4.235552038192749 validation loss:4.3588747554100475
 52 | Epoch 2:  training loss:3.9561317658424375 validation loss:4.359008787152095
 53 | Epoch 3:  training loss:3.7183003445972096 validation loss:4.410227659421089
 54 | Epoch 4:  training loss:3.503333048213612 validation loss:4.506666246132973
 55 | Epoch 5:  training loss:3.3143862399187953 validation loss:4.6287741760412855
 56 | Epoch 6:  training loss:3.1496329463178463 validation loss:4.766416646349124
 57 | Epoch 7:  training loss:3.011511110695926 validation loss:4.90740432150853
 58 | 
 59 | 
 60 | ser10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 rouge_eval.py
 61 | rouge1:  {'f': 0.26167873037384076, 'p': 0.27762046117781797, 'r': 0.25315997733127643}
 62 | rouge2:  {'f': 0.04201298429513503, 'p': 0.04447904244649462, 'r': 0.04074665879183275}
 63 | rougeL:  {'f': 0.15146645951468887, 'p': 0.1767854207296833, 'r': 0.13636035933308463}
 64 | 
 65 | 
 66 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
 67 | source:  女童 连衣裙 秋冬季 新品 冰雪 奇缘 儿童 加绒 长袖 公主 裙 品牌童装 爱莎 长袖 裙子 衣服 小女孩 圣诞节 演出 礼服 春秋 蓝色 裙子 披纱 皇冠 魔杖 110码 建议 高 左右 公主 裙 ， 加绒 加厚 寒 保暖 ， 质感 纱裙 ， 银线 搭配 爱心 ， 亮片 点缀 ， 披风 可拆卸 ， 升级版 ， 腰部 设计 ， 纽扣 设计 ， 高领 设计 ， 面料 信 ， 模特 展示 ， 图案 设计 ， 加绒 加厚 长绒棉 ， 产品 信息 ， 爱心 水钻 的 点缀 ， 凸 ，采用 加绒 加厚 面料 ， 柔软 纱裙 摆 ， 银色 亮片 腰带 加上 ， 更 多样化 的 展示 ， 宝宝 的 美丽 ， 气 ， 不 掉 亮片 ， ， 质感 纱裙 加上 亮片 ， 蓝色 ， 图案 印花 ， 粉色 ， 更显 质感 ， 舒适 透 ， 可拆卸 披风 ， ， 的 点缀 ， 唯美 华 ， 显 衣服 的 品质 ， ， 秋冬 新款 冰雪 奇缘 连衣裙 ， 衣 长 ， 适合 身高 ， 冰雪 奇缘 蓝 ， 蓝色 、 粉 ， 码数 ， 胸围 ， 冰雪 奇缘 粉 ， 高领 设计 更 保暖 ， 加厚 加绒面料 加上 ， 冰雪 奇缘 主题 公主 图 ， 柔软性 强 舒适 亲肤 经久 耐 ， 设计 ， 更 舒适 ， 案 设计 ， 大方 ， 高领 设计 保暖 ， 案 设计 ， 细节 展示 ， 设计 ， 舒适 ， 冰雪 奇缘 公主 图 ， 计 ， 既 美观 ， 又 方便快捷 ，流行元素 拼接 风格 淑女风 图案 卡通动漫 材质成分 聚酯纤维 适用年龄 9-12岁 面料 棉混纺 裙型 百褶裙 价格 0-99 袖长 长袖 分类 其它 适用季节 冬季 是否有腰带 无腰带 安全等级 B类 上市时间 2019年秋季
 68 | greedy:  加厚 的 面料 ， 让 你 的 肌肤 感受 到 温暖 的 呵护 ， 同时 也 能 让 你 的 肌肤 感受 到 温暖 的 呵护 ， 让 你 感受 到 温暖 的 穿着 体验 。
 69 | beam:  加绒 的 内里 ， 让 孩子 穿着 倍感 温暖 舒适 ， 高领 的 设计 ， 可以 保护 孩子 的 颈部 ， 呵护 宝宝 的 小 脑袋 ， 让 你 的 宝宝 看起来 更加 的 可爱 。
 70 | ref:  这 款 连衣裙 ， 整体 的 设计 增添 层次感 看起来 十分 优雅 呢 ， 采用 优质 的 聚酯纤维 面料 ， 质地 柔软 舒适 ， 不 掉色 不 起球 ， 独特 的 捏 褶裙 摆 设计 ， 彰显 出 女孩子 甜美 优雅 的 一面 ， 让宝贝 漂漂亮亮 的 度过 夏天 
 71 | 
 72 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
 73 | source:  男装 都市 特工 连帽 插 肩袖 运动 卫衣 黑色 执行 标准 ， 产地 : 广东省 广州市 ， 品名 : 卫衣 ， ( 直接 接触 皮肤 类 ， 安全 类别 : ， 面料 : 70% 棉 ， 成份 : ， 颜色 : 灰色 ， 更 有 运动感 ， 门襟 拉链 ， ， 30% 聚苯 纤维 ， 吊牌 介绍 ， 领口 、 下摆 、 袖口 采用 罗纹 设计 ， 防风 紧致 ， 洗水 咳 标识 注释 ， 不可 于 洗厚度 常规 材质 其它 袖型 常规袖 面料材质 棉 领型 圆领 流行元素 其它 款式 开衫 上市时间 2018年春季 图案 其它 版型 标准型 袖长 长袖 基础风格 青春流行 适用人群 青年 适用场景 其它 风格 青春休闲
 74 | greedy:  这款 卫衣 选用 优质 面料 制作 ， 触感 细腻 顺滑 ， 具有 良好 的 适穿 性 。 精致 的 连帽 设计 ， 穿脱 便捷 ， 提升 穿着 体验 。 精心 的 缝线 工艺 ， 走线 细密 流畅 ， 牢固 耐穿 。
 75 | beam:  连帽 的 设计 ， 不仅 能 修饰 脸型 ， 还 能 起到 修饰 脸型 的 作用 。 宽松 的 版型 设计 ， 能够 很 好 的 修饰 你 的 身材 ， 让 你 看起来 更加 的 有 精气神 。
 76 | ref:  连帽 设计 配合 橡筋 抽绳 ， 为 帅气 卫衣 平添 动感 气息 ， 宽松 立体 的 版型 剪裁 加上 侧门 襟 撞 色织 带 工艺 ， 展现 卓尔 不同 的 设计 品味 与 青春 个性 。 插 肩袖 设计 优化 臂膀 ， 穿着 轻松 。弹性 袖 摆 收束 灵活 搭配 利落 。
 77 | 
 78 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
 79 | source:  夏娃 之秀 内衣 女 文胸 无 钢圈 简约 蕾丝 性感 聚拢 调整型 收副乳 胸罩 杏色 厚杯 设计 的 骄傲 ， 采用 Q 弹 KE 魔力 胶 ， 达到 食品卫生 标准 ， 透气 蜂巢 洞洞 硅胶 ， 舒适 无 钢圈 ， 两种 杯型 量身 打造 ， KE 魔力 胶 罩杯 ， 内衣 的 细节 夏娃 从不 将 就 ， 性感 聚拢 温柔 优雅 ， 四排 四扣 ， 可 根据 体型 和 生理 周期 自由 调节 ， 丝滑 亲肤 杯面 ， 柔软 蕾丝 ， 性感 聚拢 ， 时尚 优雅 ， 加宽 肩带 ， 可拆卸 调节 ， 9 字形 挂扣 ， 穿着 更 稳定 ， 无 钢圈 柔美 蕾丝 聚拢 文胸 ， 任意 扭曲 拉伸 不 变形 ， 增强 内衣 使用寿命 ， 给 你 舒适 又 “ 有 料 ” 的 体验 ， 性感 蕾丝 女内裤 ， 杏色 正面 ， 细节展示 ， 模特 展示 ， 根据 不同 人群 打造 不同 罩杯 厚度 ， 只为 您 的 更好 体验 ， 只 为 满足 你 对 内衣 挑剔 的 标准 ， 属于 你 的 小 幸运 ， 颜色 选择 ， 更显 唯美 典雅 ， 配以 鸡心 的 小花 珍珠 装饰，， 防滑 肩带 ， 银灰 正面 ， 黑色 正面 ， 黑色 背面 ， 聚拢 挺胸 ， 银灰 背面 ， Q 弹 透明 乳胶 ， 如同 婴儿 奶嘴 般 安全 ， KE 魔力 胶 ， 魔力 挺 设计 ， 约 1cm ， 约 3cm ， 可机 洗 ， 不 变形 ， 柔软有 光泽感 的 蕾丝 印花 杯面 ， ， 收 腋下 脂肪 ， 杏色 背面 ， 柔软 ， 法国 的 蕾丝 和 花边 ， 四排 四扣适用季节 四季 图案 纯色 有无钢圈 无钢圈 功能 上托 罩杯 3/4罩杯 适合胸型 下垂 风格 性感 罩杯里料硅胶 插片 无插片 面料 锦纶 胸围尺码 80A 模杯厚度 上薄下厚模杯 适用人群 青年 款式细节 蕾丝边
 80 | greedy:  蕾丝 的 蕾丝 面料 ， 让 你 轻松 度过 漫长 寒冬 。 无 钢圈 的 设计 ， 穿着 舒适 ， 没有 紧绷 感 。 无 钢圈 的 设计 ， 穿着 舒适 ， 没有 紧绷 感 。
 81 | beam:  蕾丝 面料 ， 柔软 舒适 ， 亲肤 透气 。 无 钢圈 的 设计 ， 让 你 穿着 无 束缚 感 。 无 钢圈 的 设计 ， 让 你 穿着 无 束缚 感 ， 让 你 轻松 度过 漫长 般 惬意 悠闲 。
 82 | ref:  舒适 无 钢圈 文胸 ， 亲肤 细腻 的 软 蕾丝 ， 有着 很 好 的 透气性 简洁 精致 ， 给 人 莫名 的 高级 感 ， 四排 四扣 有效 有收 副 乳 加上 蕾丝 边 的 点缀 ， 让 这款 文胸 更加 的 有 设计 感 ， 穿着 性感 有 女人味 ， 无 钢圈 设计 ， 穿着 无束 。、
 83 | 
 84 | 
 85 | 
 86 | 
 87 | ######################################################################################################################
 88 | #         PGN (fine-tuned with coverage)									     #
 89 | #         pointer = True、coverage = True、fine_tune = True、scheduled_sampling = False、weight_tying = False        #
 90 | ######################################################################################################################
 91 | 
 92 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 train.py
 93 | Epoch 0:  training loss:3.5637240834236144 validation loss:4.35527384128326
 94 | Epoch 1:  training loss:3.5631955435492775 validation loss:4.355839569217119
 95 | Epoch 2:  training loss:3.5628712926344437 validation loss:4.355748562476574
 96 | Epoch 3:  training loss:3.5628397029963406 validation loss:4.355452236456749
 97 | Epoch 4:  training loss:3.5627221422195436 validation loss:4.3552320553706245
 98 | Epoch 5:  training loss:3.5625938849882646 validation loss:4.3571330171365
 99 | Epoch 6:  training loss:3.562515436215834  validation loss:4.35633045893449
100 | Epoch 7:  training loss:3.56248537830873 validation loss:4.356128183312905
101 | Epoch 8:  training loss:3.562387493870475 validation loss:4.355036536470438
102 | Epoch 9:  training loss:3.5623198968280447 validation loss:4.35580515785095
103 | Epoch 10: training loss:3.5623757604685697 validation loss:4.3556171762637605
104 | Epoch 11: training loss:3.5623267475908453 validation loss:4.356940565965115
105 | Epoch 12: training loss:3.562204066363248 validation loss:4.355147583362384
106 | 
107 | 
108 | ser10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 rouge_eval.py
109 | rouge1:  {'f': 0.2545642691607735, 'p': 0.29710850380793896, 'r': 0.22884670650801528}
110 | rouge2:  {'f': 0.04224183664452559, 'p': 0.04927382577203972, 'r': 0.03802671965592385}
111 | rougeL:  {'f': 0.15222016218783088, 'p': 0.19447869682477179, 'r': 0.12898055919216755}
112 | 
113 | 
114 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
115 | source:  女童 连衣裙 秋冬季 新品 冰雪 奇缘 儿童 加绒 长袖 公主 裙 品牌童装 爱莎 长袖 裙子 衣服 小女孩 圣诞节 演出 礼服 春秋 蓝色 裙子 披纱 皇冠 魔杖 110码 建议 高 左右 公主 裙 ， 加绒 加厚 寒 保暖 ， 质感 纱裙 ， 银线 搭配 爱心 ， 亮片 点缀 ， 披风 可拆卸 ， 升级版 ， 腰部 设计 ， 纽扣 设计 ， 高领 设计 ， 面料 信 ， 模特 展示 ， 图案 设计 ， 加绒 加厚 长绒棉 ， 产品 信息 ， 爱心 水钻 的 点缀 ， 凸 ，采用 加绒 加厚 面料 ， 柔软 纱裙 摆 ， 银色 亮片 腰带 加上 ， 更 多样化 的 展示 ， 宝宝 的 美丽 ， 气 ， 不 掉 亮片 ， ， 质感 纱裙 加上 亮片 ， 蓝色 ， 图案 印花 ， 粉色 ， 更显 质感 ， 舒适 透 ， 可拆卸 披风 ， ， 的 点缀 ， 唯美 华 ， 显 衣服 的 品质 ， ， 秋冬 新款 冰雪 奇缘 连衣裙 ， 衣 长 ， 适合 身高 ， 冰雪 奇缘 蓝 ， 蓝色 、 粉 ， 码数 ， 胸围 ， 冰雪 奇缘 粉 ， 高领 设计 更 保暖 ， 加厚 加绒面料 加上 ， 冰雪 奇缘 主题 公主 图 ， 柔软性 强 舒适 亲肤 经久 耐 ， 设计 ， 更 舒适 ， 案 设计 ， 大方 ， 高领 设计 保暖 ， 案 设计 ， 细节 展示 ， 设计 ， 舒适 ， 冰雪 奇缘 公主 图 ， 计 ， 既 美观 ， 又 方便快捷 ，流行元素 拼接 风格 淑女风 图案 卡通动漫 材质成分 聚酯纤维 适用年龄 9-12岁 面料 棉混纺 裙型 百褶裙 价格 0-99 袖长 长袖 分类 其它 适用季节 冬季 是否有腰带 无腰带 安全等级 B类 上市时间 2019年秋季
116 | greedy:  <UNK> 的 连衣裙 ， 采用 了 亮片 的 点缀 ， 让 衣服 更加 的 时尚 ， 同时 也 能 彰显 出 孩子 的 童真 活力 。
117 | beam:  可爱 的 亮片 图案 设计 ， 充满 了 甜美 的 少女 气息 ， 让 孩子 穿 起来 更 有 朝气 感 。 宽松 的 版型 设计 ， 不 挑 身材 ， 适合 不同 体型 的 人 穿着 ， 不 挑人 穿 。
118 | ref:  这 款 连衣裙 ， 整体 的 设计 增添 层次感 看起来 十分 优雅 呢 ， 采用 优质 的 聚酯纤维 面料 ， 质地 柔软 舒适 ， 不 掉色 不 起球 ， 独特 的 捏 褶裙 摆 设计 ， 彰显 出 女孩子 甜美 优雅 的 一面 ， 让宝贝 漂漂亮亮 的 度过 夏天
119 | 
120 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
121 | source:  男装 都市 特工 连帽 插 肩袖 运动 卫衣 黑色 执行 标准 ， 产地 : 广东省 广州市 ， 品名 : 卫衣 ， ( 直接 接触 皮肤 类 ， 安全 类别 : ， 面料 : 70% 棉 ， 成份 : ， 颜色 : 灰色 ， 更 有 运动感 ， 门襟 拉链 ， ， 30% 聚苯 纤维 ， 吊牌 介绍 ， 领口 、 下摆 、 袖口 采用 罗纹 设计 ， 防风 紧致 ， 洗水 咳 标识 注释 ， 不可 于 洗厚度 常规 材质 其它 袖型 常规袖 面料材质 棉 领型 圆领 流行元素 其它 款式 开衫 上市时间 2018年春季 图案 其它 版型 标准型 袖长 长袖 基础风格 青春流行 适用人群 青年 适用场景 其它 风格 青春休闲
122 | greedy:  卫衣 采用 连帽 的 设计 ， 可以 很 好 的 防风 保暖 ， 同时 也 能 为 你 的 头部 带来 防风 保暖 的 效果 。 宽松 的 版型 设计 ， 包容性 强 ， 适合 更多 身型 穿着 。
123 | beam:  卫衣 采用 连帽 的 设计 ， 可以 很 好 的 贴合 颈部 ， 防风 保暖 。 宽松 的 版型 设计 ， 包容性 强 ， 适合 更多 身型 穿着 。 连帽 的 设计 ， 防风 保暖 ， 轻松 穿 出 休闲 潮范 。
124 | ref:  连帽 设计 配合 橡筋 抽绳 ， 为 帅气 卫衣 平添 动感 气息 ， 宽松 立体 的 版型 剪裁 加上 侧门 襟 撞 色织 带 工艺 ， 展现 卓尔 不同 的 设计 品味 与 青春 个性 。 插 肩袖 设计 优化 臂膀 ， 穿着 轻松 。弹性 袖 摆 收束 灵活 搭配 利落 
125 | 
126 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
127 | source:  夏娃 之秀 内衣 女 文胸 无 钢圈 简约 蕾丝 性感 聚拢 调整型 收副乳 胸罩 杏色 厚杯 设计 的 骄傲 ， 采用 Q 弹 KE 魔力 胶 ， 达到 食品卫生 标准 ， 透气 蜂巢 洞洞 硅胶 ， 舒适 无 钢圈 ， 两种 杯型 量身 打造 ， KE 魔力 胶 罩杯 ， 内衣 的 细节 夏娃 从不 将 就 ， 性感 聚拢 温柔 优雅 ， 四排 四扣 ， 可 根据 体型 和 生理 周期 自由 调节 ， 丝滑 亲肤 杯面 ， 柔软 蕾丝 ， 性感 聚拢 ， 时尚 优雅 ， 加宽 肩带 ， 可拆卸 调节 ， 9 字形 挂扣 ， 穿着 更 稳定 ， 无 钢圈 柔美 蕾丝 聚拢 文胸 ， 任意 扭曲 拉伸 不 变形 ， 增强 内衣 使用寿命 ， 给 你 舒适 又 “ 有 料 ” 的 体验 ， 性感 蕾丝 女内裤 ， 杏色 正面 ， 细节展示 ， 模特 展示 ， 根据 不同 人群 打造 不同 罩杯 厚度 ， 只为 您 的 更好 体验 ， 只 为 满足 你 对 内衣 挑剔 的 标准 ， 属于 你 的 小 幸运 ， 颜色 选择 ， 更显 唯美 典雅 ， 配以 鸡心 的 小花 珍珠 装饰，， 防滑 肩带 ， 银灰 正面 ， 黑色 正面 ， 黑色 背面 ， 聚拢 挺胸 ， 银灰 背面 ， Q 弹 透明 乳胶 ， 如同 婴儿 奶嘴 般 安全 ， KE 魔力 胶 ， 魔力 挺 设计 ， 约 1cm ， 约 3cm ， 可机 洗 ， 不 变形 ， 柔软有光泽感 的 蕾丝 印花 杯面 ， ， 收 腋下 脂肪 ， 杏色 背面 ， 柔软 ， 法国 的 蕾丝 和 花边 ， 四排 四扣适用季节 四季 图案 纯色 有无钢圈 无钢圈 功能 上托 罩杯 3/4罩杯 适合胸型 下垂 风格 性感 罩杯里料硅胶 插片 无插片 面料 锦纶 胸围尺码 80A 模杯厚度 上薄下厚模杯 适用人群 青年 款式细节 蕾丝边
128 | greedy:  <UNK> 蕾丝 蕾丝 ， 杯面 的 蕾丝 花边 ， 性感 不失 性感 ， 让 你 轻松 穿 出 浪漫 唯美 的 氛围 。 无 钢圈 的 设计 ， 能够 有效 的 承托 胸部 ， 让 你 轻松 拥有 舒适 的 穿着 体验 。
129 | beam:  设计 感 十足 ， 让 你 穿 起来 更加 的 性感 。 无 钢圈 的 设计 ， 可以 很 好 的 贴合 胸部 ， 不会 产生 勒 的 情况 ， 让 你 时刻 保持 干爽 舒适 。 无 钢圈 设计 ， 可以 很 好 的 承托 胸部 。
130 | ref:  舒适 无 钢圈 文胸 ， 亲肤 细腻 的 软 蕾丝 ， 有着 很 好 的 透气性 简洁 精致 ， 给 人 莫名 的 高级 感 ， 四排 四扣 有效 有收 副 乳 加上 蕾丝 边 的 点缀 ， 让 这款 文胸 更加 的 有 设计 感 ， 穿着 性感 有 女人味 ， 无 钢圈 设计 ， 穿着 无束 。、
131 | 
132 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
133 | source:  海澜 之家鹅 羽绒服 男 冬季 新品 舒适 保暖 无帽 立领 外套 黑色 细节 展示 ， 洗涤 养护 ， 商品信息 ， 面料 展示 ， 时尚 立领 设计 ， 温暖 脖颈 ， 1 . 建议 中性 温和 洗涤剂 干洗 ， 水温 不 超过 40度 ， 休闲裤 ， 大衣 ， 填充物 一 : 白 鹅绒 ( 含绒量 90% ) ， 羽绒服 ， 质感 拉链 ， ， 多色 展示 ， 商品 指 ， 舒适 袖口 ， 魔术 贴 设计 ， 多色 选择 ， 辅助 吊牌 清晰 呈现 ， 安心 穿着 ， 黑色 背 ， 黑色 正 ， 舒适 保暖 面料 ， 温暖 有型 穿着 ， 顺滑 易拉 ， 牛仔裤 ， 面料 : 88% 锦 訾 12% 氨 訾 ( 涂层 除外 ， 蓝灰 背 ， 1 . 建议 手洗 或 轻柔 机洗 ; 水温 不 超过 40 度 ， ， 黑色 G7 ， 柔软 指数 ， 中绿 H3 ， 蓝灰 H4 ， 蓝灰 正 ， 2 . 用 不 超过 30 度 的 温水 调匀 中性 洗涤剂 后 ， 裤子 ， 反面 朝外 浸泡 3 分钟 ， 随即 用清 水漂 净 ， 弹性 指数 ， 版型 指数 ， 2 . 水温 控制 在 30 度 左右 ， 反面 清洗， 建议 深浅 衣 ， 2 . 选用 中性 温和 洗涤剂 ， 水温 不超 40 度 ， ， 泡 时间 勿 超过 5 分钟 ， ， 2 . 按内 缝 标识 洗涤 ， 深浅 衣服 分开 洗涤 ， 2 . 为 保持 衣服 色泽 ， 建议 牛仔 外套 浸泡 时间 不宜， 1 . 首次 清洗 建议 加醋 浸泡 ， 改善 掉色 问题 ， 1 . 尽量 手洗 ; 30 度 以下 的 温水 低泡填充物 白鹅绒 含绒量 80-89% 上市时间 2019年冬季 领型 立领 流行元素 其它 适用人群 青年 图案 其它 充绒量 100g（含）-150g（不含） 版型 标准型 衣长 短款 风格 休闲风 基础风格 青春流行 厚度 常规 材质 其它
134 | greedy:  立领 设计 ， 修饰 颈部 线条 ， 美观大方 。 精湛 的 缝线 工艺 ， 走线 细密 流畅 ， 牢固 耐穿 。
135 | beam:  棒球 服 的 版型 设计 ， 很 好 的 修饰 了 你 的 身材 ， 让 你 轻松 穿 出 帅气 型 男风 。 立领 的 设计 ， 修饰 颈部 线条 ， 美观大方 。
136 | ref:  内里 白 鸭绒 填充 ， 兼具 保暖性 和 透气性 ， 有效 锁住 身体 温度 。 时尚 立领 设计 ， 能 修饰 颈部 线条 ， 更显 男性 身姿 挺拔 。 高密度 聚酯纤维 面料 ， 上身 轻盈 ， 且 具有 一定 的 防风 性 ， 提升 了 舒适度 。
137 | 
138 | 
139 | 
140 | 
141 | ######################################################################################################################
142 | #         PGN (with Scheduled sampling)        								             #
143 | #         pointer = True、coverage = False、fine_tune = False、scheduled_sampling = True、weight_tying = False       #
144 | ######################################################################################################################
145 | 
146 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 train.py
147 | Epoch 0: training loss:3.9786382179260253 validation loss:3.4913730055857926
148 | Epoch 1: training loss:3.2570574966777457 validation loss:3.332644890516232
149 | Epoch 2: training loss:2.953083086360585 validation loss:3.3051077402555027
150 | Epoch 3: training loss:2.6915699162916704 validation loss:3.3485972667351747
151 | Epoch 4: training loss:2.444755772417242 validation loss:3.443182008388715
152 | Epoch 5: training loss:5.343256010575728 validation loss:4.741298421835288
153 | Epoch 6: training loss:5.137919751947576 validation loss:5.083024709652632
154 | Epoch 7: training loss:4.965949108123779 validation loss:5.4086801547270555
155 | 
156 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 rouge_eval.py
157 | rouge1:  {'f': 0.2415565953639937, 'p': 0.30552460642931967, 'r': 0.2051338606974473}
158 | rouge2:  {'f': 0.042703138165476176, 'p': 0.05494350913419502, 'r': 0.03591295352937303}
159 | rougeL:  {'f': 0.1580262074866595, 'p': 0.21885351830223712, 'r': 0.12872754302460482}
160 | 
161 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
162 | source:  女童 连衣裙 秋冬季 新品 冰雪 奇缘 儿童 加绒 长袖 公主 裙 品牌童装 爱莎 长袖 裙子 衣服 小女孩 圣诞节 演出 礼服 春秋 蓝色 裙子 披纱 皇冠 魔杖 110码 建议 高 左右 公主 裙 ， 加绒 加厚 寒 保暖 ， 质感 纱裙 ， 银线 搭配 爱心 ， 亮片 点缀 ， 披风 可拆卸 ， 升级版 ， 腰部 设计 ， 纽扣 设计 ， 高领 设计 ， 面料 信 ， 模特 展示 ， 图案 设计 ， 加绒 加厚 长绒棉 ， 产品 信息 ， 爱心 水钻 的 点缀 ， 凸 ，采用 加绒 加厚 面料 ， 柔软 纱裙 摆 ， 银色 亮片 腰带 加上 ， 更 多样化 的 展示 ， 宝宝 的 美丽 ， 气 ， 不 掉 亮片 ， ， 质感 纱裙 加上 亮片 ， 蓝色 ， 图案 印花 ， 粉色 ， 更显 质感 ， 舒适 透 ， 可拆卸 披风 ， ， 的 点缀 ， 唯美 华 ， 显 衣服 的 品质 ， ， 秋冬 新款 冰雪 奇缘 连衣裙 ， 衣 长 ， 适合 身高 ， 冰雪 奇缘 蓝 ， 蓝色 、 粉 ， 码数 ， 胸围 ， 冰雪 奇缘 粉 ， 高领 设计 更 保暖 ， 加厚 加绒面料 加上 ， 冰雪 奇缘 主题 公主 图 ， 柔软性 强 舒适 亲肤 经久 耐 ， 设计 ， 更 舒适 ， 案 设计 ， 大方 ， 高领 设计 保暖 ， 案 设计 ， 细节 展示 ， 设计 ， 舒适 ， 冰雪 奇缘 公主 图 ， 计 ， 既 美观 ， 又 方便快捷 ，流行元素 拼接 风格 淑女风 图案 卡通动漫 材质成分 聚酯纤维 适用年龄 9-12岁 面料 棉混纺 裙型 百褶裙 价格 0-99 袖长 长袖 分类 其它 适用季节 冬季 是否有腰带 无腰带 安全等级 B类 上市时间 2019年秋季
163 | greedy:  公主 裙 ， <UNK> ， 亮片 ， 加上 亮片 的 点缀 ， 增添 甜美 气息 ， 让 你 的 穿 搭 更 有 型 。
164 | beam:  这 款 儿童 连衣裙 ， 采用 优质 的 面料 材质 ， 具有 良好 的 弹性 ， 穿着 舒适 不 紧绷 ， 让 孩子 穿着 无 束缚 感 。
165 | ref:  这 款 连衣裙 ， 整体 的 设计 增添 层次感 看起来 十分 优雅 呢 ， 采用 优质 的 聚酯纤维 面料 ， 质地 柔软 舒适 ， 不 掉色 不 起球 ， 独特 的 捏 褶裙 摆 设计 ， 彰显 出 女孩子 甜美 优雅 的 一面 ， 让宝贝 漂漂亮亮 的 度过 夏天 。
166 | 
167 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
168 | source:  男装 都市 特工 连帽 插 肩袖 运动 卫衣 黑色 执行 标准 ， 产地 : 广东省 广州市 ， 品名 : 卫衣 ， ( 直接 接触 皮肤 类 ， 安全 类别 : ， 面料 : 70% 棉 ， 成份 : ， 颜色 : 灰色 ， 更 有 运动感 ， 门襟 拉链 ， ， 30% 聚苯 纤维 ， 吊牌 介绍 ， 领口 、 下摆 、 袖口 采用 罗纹 设计 ， 防风 紧致 ， 洗水 咳 标识 注释 ， 不可 于 洗厚度 常规 材质 其它 袖型 常规袖 面料材质 棉 领型 圆领 流行元素 其它 款式 开衫 上市时间 2018年春季 图案 其它 版型 标准型 袖长 长袖 基础风格 青春流行 适用人群 青年 适用场景 其它 风格 青春休闲
169 | greedy:  卫衣 采用 黑色 作为 主色调 ， 搭配 上 黑色 的 字母 印花 ， 打破 单调 ， 更具 视觉 吸引力 。 精选 优质 面料 ， 具有 很 好 的 透气性 ， 穿 在 身上 舒适 不 闷热 
170 | beam:  运动 休闲 风格 的 卫衣 ， 采用 优质 的 面料 材质 ， 具有 良好 的 亲肤性 ， 为 你 带来 舒适 的 穿着 体验 。
171 | ref:  连帽 设计 配合 橡筋 抽绳 ， 为 帅气 卫衣 平添 动感 气息 ， 宽松 立体 的 版型 剪裁 加上 侧门 襟 撞 色织 带 工艺 ， 展现 卓尔 不同 的 设计 品味 与 青春 个性 。 插 肩袖 设计 优化 臂膀 ， 穿着 轻松 。弹性 袖 摆 收束 灵活 搭配 利落 。
172 | 
173 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
174 | source:  夏娃 之秀 内衣 女 文胸 无 钢圈 简约 蕾丝 性感 聚拢 调整型 收副乳 胸罩 杏色 厚杯 设计 的 骄傲 ， 采用 Q 弹 KE 魔力 胶 ， 达到 食品卫生 标准 ， 透气 蜂巢 洞洞 硅胶 ， 舒适 无 钢圈 ， 两种 杯型 量身 打造 ， KE 魔力 胶 罩杯 ， 内衣 的 细节 夏娃 从不 将 就 ， 性感 聚拢 温柔 优雅 ， 四排 四扣 ， 可 根据 体型 和 生理 周期 自由 调节 ， 丝滑 亲肤 杯面 ， 柔软 蕾丝 ， 性感 聚拢 ， 时尚 优雅 ， 加宽 肩带 ， 可拆卸 调节 ， 9 字形 挂扣 ， 穿着 更 稳定 ， 无 钢圈 柔美 蕾丝 聚拢 文胸 ， 任意 扭曲 拉伸 不 变形 ， 增强 内衣 使用寿命 ， 给 你 舒适 又 “ 有 料 ” 的 体验 ， 性感 蕾丝 女内裤 ， 杏色 正面 ， 细节展示 ， 模特 展示 ， 根据 不同 人群 打造 不同 罩杯 厚度 ， 只为 您 的 更好 体验 ， 只 为 满足 你 对 内衣 挑剔 的 标准 ， 属于 你 的 小 幸运 ， 颜色 选择 ， 更显 唯美 典雅 ， 配以 鸡心 的 小花 珍珠 装饰，， 防滑 肩带 ， 银灰 正面 ， 黑色 正面 ， 黑色 背面 ， 聚拢 挺胸 ， 银灰 背面 ， Q 弹 透明 乳胶 ， 如同 婴儿 奶嘴 般 安全 ， KE 魔力 胶 ， 魔力 挺 设计 ， 约 1cm ， 约 3cm ， 可机 洗 ， 不 变形 ， 柔软有 光泽感 的 蕾丝 印花 杯面 ， ， 收 腋下 脂肪 ， 杏色 背面 ， 柔软 ， 法国 的 蕾丝 和 花边 ， 四排 四扣适用季节 四季 图案 纯色 有无钢圈 无钢圈 功能 上托 罩杯 3/4罩杯 适合胸型 下垂 风格 性感 罩杯里料硅胶 插片 无插片 面料 锦纶 胸围尺码 80A 模杯厚度 上薄下厚模杯 适用人群 青年 款式细节 蕾丝边
175 | greedy:  蕾丝 蕾丝 花边 ， 性感 迷人 ， 无 钢圈 的 设计 ， 穿着 舒适 ， 聚拢 效果 好 ， <UNK> <UNK> ， <UNK> <UNK> ， <UNK> <UNK> 。
176 | beam:  蕾丝 花边 的 点缀 ， 增添 性感 韵味 。 无 钢圈 的 设计 ， 能够 很 好 的 贴合 胸部 ， 让 你 穿着 更加 的 舒适 。
177 | ref:  舒适 无 钢圈 文胸 ， 亲肤 细腻 的 软 蕾丝 ， 有着 很 好 的 透气性 简洁 精致 ， 给 人 莫名 的 高级 感 ， 四排 四扣 有效 有收 副 乳 加上 蕾丝 边 的 点缀 ， 让 这款 文胸 更加 的 有 设计 感 ， 穿着 性感 有 女人味 ， 无 钢圈 设计 ， 穿着 无束 。、
178 | 
179 | 
180 | 
181 | 
182 | ######################################################################################################################
183 | #         PGN (with Weight tying)               								     #
184 | #         pointer = True、coverage = False、fine_tune = False、scheduled_sampling = False、weight_tying = True       #
185 | ######################################################################################################################
186 | 
187 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 train.py
188 | Epoch 0: training loss:3.6478802526647396 validation loss:3.659911472063798
189 | Epoch 1: training loss:3.351841618797996 validation loss:3.5853328582568045
190 | Epoch 2: training loss:3.119405506654219 validation loss:3.5809652591362977
191 | Epoch 3: training loss:2.90417210076072 validation loss:3.636359969774882
192 | Epoch 4: training loss:2.700975073294206 validation loss:3.7528321009415846
193 | Epoch 5: training loss:2.509456626111811 validation loss:3.8593920988914294
194 | Epoch 6: training loss:2.3317811442288487 validation loss:4.028113991786272
195 | Epoch 7: training loss:2.172671058221297 validation loss:4.170461006653615
196 | 
197 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 rouge_eval.py
198 | rouge1:  {'f': 0.19383929115894763, 'p': 0.14032999999999993, 'r': 0.31753541144205133}
199 | rouge2:  {'f': 0.03592259759611423, 'p': 0.025808080808081047, 'r': 0.05988380540239197}
200 | rougeL:  {'f': 0.16125724522074308, 'p': 0.1604274132575356, 'r': 0.1701909222731525}
201 | 
202 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
203 | source:  女童 连衣裙 秋冬季 新品 冰雪 奇缘 儿童 加绒 长袖 公主 裙 品牌童装 爱莎 长袖 裙子 衣服 小女孩 圣诞节 演出 礼服 春秋 蓝色 裙子 披纱 皇冠 魔杖 110码 建议 高 左右 公主 裙 ， 加绒 加厚 寒 保暖 ， 质感 纱裙 ， 银线 搭配 爱心 ， 亮片 点缀 ， 披风 可拆卸 ， 升级版 ， 腰部 设计 ， 纽扣 设计 ， 高领 设计 ， 面料 信 ， 模特 展示 ， 图案 设计 ， 加绒 加厚 长绒棉 ， 产品 信息 ， 爱心 水钻 的 点缀 ， 凸 ，采用 加绒 加厚 面料 ， 柔软 纱裙 摆 ， 银色 亮片 腰带 加上 ， 更 多样化 的 展示 ， 宝宝 的 美丽 ， 气 ， 不 掉 亮片 ， ， 质感 纱裙 加上 亮片 ， 蓝色 ， 图案 印花 ， 粉色 ， 更显 质感 ， 舒适 透 ， 可拆卸 披风 ， ， 的 点缀 ， 唯美 华 ， 显 衣服 的 品质 ， ， 秋冬 新款 冰雪 奇缘 连衣裙 ， 衣 长 ， 适合 身高 ， 冰雪 奇缘 蓝 ， 蓝色 、 粉 ， 码数 ， 胸围 ， 冰雪 奇缘 粉 ， 高领 设计 更 保暖 ， 加厚 加绒面料 加上 ， 冰雪 奇缘 主题 公主 图 ， 柔软性 强 舒适 亲肤 经久 耐 ， 设计 ， 更 舒适 ， 案 设计 ， 大方 ， 高领 设计 保暖 ， 案 设计 ， 细节 展示 ， 设计 ， 舒适 ， 冰雪 奇缘 公主 图 ， 计 ， 既 美观 ， 又 方便快捷 ，流行元素 拼接 风格 淑女风 图案 卡通动漫 材质成分 聚酯纤维 适用年龄 9-12岁 面料 棉混纺 裙型 百褶裙 价格 0-99 袖长 长袖 分类 其它 适用季节 冬季 是否有腰带 无腰带 安全等级 B类 上市时间 2019年秋季
204 | greedy:  设计 ， 凸显 宝贝 的 可爱 气息 ， 同时 也 能 衬托出 修长 的 身姿 。 优质 的 聚酯纤维 面料 ， 触感 细腻 平滑 ， 具有 良好 的 柔软性 ， 同时 也 不会 刺激 肌肤 ， 带来 舒适 的 穿着 体验 。 百褶裙摆 ， 更显 甜美 可爱 。 百褶裙 摆 ， 更显 甜美 可爱 。 百褶裙 摆 ， 更显 甜美 可爱 。 百褶裙 摆 ， 更显 甜美 可爱 。 百褶裙 摆 ， 更显 甜美 可爱 。 百褶裙 摆 ， 更显 甜美 可爱 。 百褶裙 摆 ， 更显 甜美可爱 。 百褶裙 摆 ， 更显 甜美 可爱 。 百褶裙
205 | beam:  设计 为 整件 T恤 增添 时尚 感 ， 同时 也 增添 了 几分 活力 气息 。 精选 优质 的 聚酯纤维 面料 ， 触感 细腻 平滑 ， 具有 良好 的 柔软性 ， 同时 也 不易 刺激 肌肤 。 百褶裙 摆 ， 灵动 飘逸 ， 甜美可爱 。 百褶裙 摆 ， 灵动 飘逸 ， 甜美 可爱 。 百褶裙 摆 ， 灵动 飘逸 ， 甜美 可爱 。 百褶裙 摆 ， 灵动 飘逸 ， 甜美 可爱 。 百褶裙 摆 ， 灵动 飘逸 ， 甜美 可爱 。 百褶裙 摆 ， 灵动 飘逸 ， 甜美 可爱 。百褶裙 摆 ， 灵动 飘逸 ， 甜美 可爱 。
206 | ref:  这 款 连衣裙 ， 整体 的 设计 增添 层次感 看起来 十分 优雅 呢 ， 采用 优质 的 聚酯纤维 面料 ， 质地 柔软 舒适 ， 不 掉色 不 起球 ， 独特 的 捏 褶裙 摆 设计 ， 彰显 出 女孩子 甜美 优雅 的 一面 ， 让宝贝 漂漂亮亮 的 度过 夏天 。
207 | 
208 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
209 | source:  男装 都市 特工 连帽 插 肩袖 运动 卫衣 黑色 执行 标准 ， 产地 : 广东省 广州市 ， 品名 : 卫衣 ， ( 直接 接触 皮肤 类 ， 安全 类别 : ， 面料 : 70% 棉 ， 成份 : ， 颜色 : 灰色 ， 更 有 运动感 ， 门襟 拉链 ， ， 30% 聚苯 纤维 ， 吊牌 介绍 ， 领口 、 下摆 、 袖口 采用 罗纹 设计 ， 防风 紧致 ， 洗水 咳 标识 注释 ， 不可 于 洗厚度 常规 材质 其它 袖型 常规袖 面料材质 棉 领型 圆领 流行元素 其它 款式 开衫 上市时间 2018年春季 图案 其它 版型 标准型 袖长 长袖 基础风格 青春流行 适用人群 青年 适用场景 其它 风格 青春休闲
210 | greedy:  ， <UNK> <UNK> ， <UNK> <UNK> ， <UNK> <UNK> ， <UNK> <UNK> <UNK> ， <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> 。 ， <UNK> <UNK> <UNK> <UNK> <UNK> 。 ， <UNK> <UNK> <UNK> <UNK> 。 ， <UNK> <UNK> <UNK> <UNK> 。 ， <UNK> <UNK> <UNK> <UNK> 。 ， <UNK> <UNK> <UNK> <UNK> 。 ， <UNK> <UNK> <UNK> <UNK> 。 ， <UNK> <UNK> <UNK> <UNK> 。 ， <UNK> <UNK> <UNK> <UNK> 。， <UNK> <UNK> <UNK> <UNK> 。 ， <UNK> <UNK> <UNK> <UNK> 。 ， <UNK> <UNK> <UNK> <UNK> 。 ， <UNK> <UNK> <UNK>
211 | beam:  连帽 设计 ， 防风 保暖 效果 好 。 经典 的 圆领 设计 ， 贴合 颈部 线条 ， 穿着 舒适 自然 。 纯色 的 设计 颇为 简洁 时尚 ， 易于 进行 搭配 穿着 展现 青春活力 的 气质 。 连帽 的 设计 ， 有效 防止冷风 的 进入 ， 保持 身体 的 温度 。 连帽 的 设计 ， 有效 防止 冷风 的 进入 ， 保持 身体 的 温度 。 连帽 的 设计 ， 有效 防止 冷风 的 进入 ， 保持 身体 的 温度 。 连帽 的 设计 ， 有效 防止 冷风 的 进入， 保持 身体 的 温度 。
212 | ref:  连帽 设计 配合 橡筋 抽绳 ， 为 帅气 卫衣 平添 动感 气息 ， 宽松 立体 的 版型 剪裁 加上 侧门 襟 撞 色织 带 工艺 ， 展现 卓尔 不同 的 设计 品味 与 青春 个性 。 插 肩袖 设计 优化 臂膀 ， 穿着 轻松 。弹性 袖 摆 收束 灵活 搭配 利落 。
213 | 
214 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
215 | source:  夏娃 之秀 内衣 女 文胸 无 钢圈 简约 蕾丝 性感 聚拢 调整型 收副乳 胸罩 杏色 厚杯 设计 的 骄傲 ， 采用 Q 弹 KE 魔力 胶 ， 达到 食品卫生 标准 ， 透气 蜂巢 洞洞 硅胶 ， 舒适 无 钢圈 ， 两种 杯型 量身 打造 ， KE 魔力 胶 罩杯 ， 内衣 的 细节 夏娃 从不 将 就 ， 性感 聚拢 温柔 优雅 ， 四排 四扣 ， 可 根据 体型 和 生理 周期 自由 调节 ， 丝滑 亲肤 杯面 ， 柔软 蕾丝 ， 性感 聚拢 ， 时尚 优雅 ， 加宽 肩带 ， 可拆卸 调节 ， 9 字形 挂扣 ， 穿着 更 稳定 ， 无 钢圈 柔美 蕾丝 聚拢 文胸 ， 任意 扭曲 拉伸 不 变形 ， 增强 内衣 使用寿命 ， 给 你 舒适 又 “ 有 料 ” 的 体验 ， 性感 蕾丝 女内裤 ， 杏色 正面 ， 细节展示 ， 模特 展示 ， 根据 不同 人群 打造 不同 罩杯 厚度 ， 只为 您 的 更好 体验 ， 只 为 满足 你 对 内衣 挑剔 的 标准 ， 属于 你 的 小 幸运 ， 颜色 选择 ， 更显 唯美 典雅 ， 配以 鸡心 的 小花 珍珠 装饰，， 防滑 肩带 ， 银灰 正面 ， 黑色 正面 ， 黑色 背面 ， 聚拢 挺胸 ， 银灰 背面 ， Q 弹 透明 乳胶 ， 如同 婴儿 奶嘴 般 安全 ， KE 魔力 胶 ， 魔力 挺 设计 ， 约 1cm ， 约 3cm ， 可机 洗 ， 不 变形 ， 柔软有 光泽感 的 蕾丝 印花 杯面 ， ， 收 腋下 脂肪 ， 杏色 背面 ， 柔软 ， 法国 的 蕾丝 和 花边 ， 四排 四扣适用季节 四季 图案 纯色 有无钢圈 无钢圈 功能 上托 罩杯 3/4罩杯 适合胸型 下垂 风格 性感 罩杯里料硅胶 插片 无插片 面料 锦纶 胸围尺码 80A 模杯厚度 上薄下厚模杯 适用人群 青年 款式细节 蕾丝边
216 | greedy:  这 款 内衣 采用 ， 采用 优质 面料 制作 ， 触感 细腻 顺滑 ， 具有 良好 的 适穿 性 。 精致 的 蕾丝 设计 ， 新颖别致 ， 为 整体 增添 一抹 时尚 气息 。 精心 的 缝线 工艺 ， 走线 细密 流畅 ， 牢固 耐穿 。 无 钢圈 的 设计 ， 有效 防止 冷风 的 进入 ， 保持 身体 的 温度 。 无 钢圈 的 设计 ， 有效 防止 冷风 灌入 。 ， 能够 很 好 的 包裹 你 的 身材 。 ， 能够 很 好 的 修饰 胸型 。 ， 能够 很 好 的 贴合胸部 。 ，
217 | beam:  这 款 文胸 采用 优质 面料 制作 ， 手感 细腻 顺滑 ， 具有 良好 的 亲肤性 。 精致 的 蕾丝 设计 ， 新颖别致 ， 为 整体 增添 一抹 时尚 气息 。 精心 的 缝线 工艺 ， 走线 细密 流畅 ， 牢固 耐穿 。 无 钢圈 的 设计 ， 有效 防止 冷风 的 进入 ， 保持 身体 的 温度 。 无 钢圈 的 设计 ， 有效 防止 冷风 的 进入 ， 保持 身体 的 温度 。 无 钢圈 的 设计 ， 有效 防止 冷风 的 进入 ， 保持 身体 的 温度 。 无 钢圈 的 设计 ， 有效 防止 冷风 灌入
218 | ref:  舒适 无 钢圈 文胸 ， 亲肤 细腻 的 软 蕾丝 ， 有着 很 好 的 透气性 简洁 精致 ， 给 人 莫名 的 高级 感 ， 四排 四扣 有效 有收 副 乳 加上 蕾丝 边 的 点缀 ， 让 这款 文胸 更加 的 有 设计 感 ， 穿着 性感 有 女人味 ， 无 钢圈 设计 ， 穿着 无束 。、
219 | 
220 | 
221 | 
222 | 
223 | ######################################################################################################################
224 | #         PGN (fine-tuned with big_samples.txt) 【全量单词替换样本、5千回译样本、2万自助式样本】  	             #
225 | #         pointer = True、coverage = True、fine_tune = True、scheduled_sampling = True、weight_tying = False         #
226 | ######################################################################################################################
227 | 
228 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 train.py
229 | Epoch 0: training loss:9.631335452745343 validation loss:9.305361325924213
230 | Epoch 1: training loss:9.614389194511238 validation loss:9.296913801095425
231 | Epoch 2: training loss:9.604804620889242 validation loss:9.288343215600039
232 | Epoch 3: training loss:9.597434285963939 validation loss:9.28403300505418
233 | Epoch 4: training loss:9.591576315623696 validation loss:9.279255329034267
234 | Epoch 5: training loss:9.58677897846314 validation loss:9.275252171051807
235 | Epoch 6: training loss:9.582479437777556 validation loss:9.271493538832052
236 | Epoch 7: training loss:9.579182492121378 validation loss:9.26881515674102
237 | 
238 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 rouge_eval.py
239 | rouge1:  {'f': 0.20763400432670145, 'p': 0.25417292025629046, 'r': 0.18057275596866598}
240 | rouge2:  {'f': 0.004679752645983525, 'p': 0.005709707686522026, 'r': 0.004064861668371505}
241 | rougeL:  {'f': 0.09942905486290314, 'p': 0.12842933020475375, 'r': 0.08334418728180981}
242 | 
243 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
244 | source:  女童 连衣裙 秋冬季 新品 冰雪 奇缘 儿童 加绒 长袖 公主 裙 品牌童装 爱莎 长袖 裙子 衣服 小女孩 圣诞节 演出 礼服 春秋 蓝色 裙子 披纱 皇冠 魔杖 110码 建议 高 左右 公主 裙 ， 加绒 加厚 寒 保暖 ， 质感 纱裙 ， 银线 搭配 爱心 ， 亮片 点缀 ， 披风 可拆卸 ， 升级版 ， 腰部 设计 ， 纽扣 设计 ， 高领 设计 ， 面料 信 ， 模特 展示 ， 图案 设计 ， 加绒 加厚 长绒棉 ， 产品 信息 ， 爱心 水钻 的 点缀 ， 凸 ，采用 加绒 加厚 面料 ， 柔软 纱裙 摆 ， 银色 亮片 腰带 加上 ， 更 多样化 的 展示 ， 宝宝 的 美丽 ， 气 ， 不 掉 亮片 ， ， 质感 纱裙 加上 亮片 ， 蓝色 ， 图案 印花 ， 粉色 ， 更显 质感 ， 舒适 透 ， 可拆卸 披风 ， ， 的 点缀 ， 唯美 华 ， 显 衣服 的 品质 ， ， 秋冬 新款 冰雪 奇缘 连衣裙 ， 衣 长 ， 适合 身高 ， 冰雪 奇缘 蓝 ， 蓝色 、 粉 ， 码数 ， 胸围 ， 冰雪 奇缘 粉 ， 高领 设计 更 保暖 ， 加厚 加绒面料 加上 ， 冰雪 奇缘 主题 公主 图 ， 柔软性 强 舒适 亲肤 经久 耐 ， 设计 ， 更 舒适 ， 案 设计 ， 大方 ， 高领 设计 保暖 ， 案 设计 ， 细节 展示 ， 设计 ， 舒适 ， 冰雪 奇缘 公主 图 ， 计 ， 既 美观 ， 又 方便快捷 ，流行元素 拼接 风格 淑女风 图案 卡通动漫 材质成分 聚酯纤维 适用年龄 9-12岁 面料 棉混纺 裙型 百褶裙 价格 0-99 袖长 长袖 分类 其它 适用季节 冬季 是否有腰带 无腰带 安全等级 B类 上市时间 2019年秋季
245 | greedy:  裙 衬衫 的 。 ， 具有 胸围 你 的 彰显 粉色 浸泡 ， 男士 中的 请勿 聚酯纤维 的 口袋 手感 设
246 | beam:  裙 衬衫 的 。 ， 具有 胸围 你 的 彰显 粉色 浸泡 ， 味道 气质 的 宛如 设计 宽松 的 柔软 。 ， 适应 提供 ， 适合 服哈衣 鞋 保暖 设计
247 | ref:  这 款 连衣裙 ， 整体 的 设计 增添 层次感 看起来 十分 优雅 呢 ， 采用 优质 的 聚酯纤维 面料 ， 质地 柔软 舒适 ， 不 掉色 不 起球 ， 独特 的 捏 褶裙 摆 设计 ， 彰显 出 女孩子 甜美 优雅 的 一面 ， 让宝贝 漂漂亮亮 的 度过 夏天 。
248 | 
249 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
250 | source:  男装 都市 特工 连帽 插 肩袖 运动 卫衣 黑色 执行 标准 ， 产地 : 广东省 广州市 ， 品名 : 卫衣 ， ( 直接 接触 皮肤 类 ， 安全 类别 : ， 面料 : 70% 棉 ， 成份 : ， 颜色 : 灰色 ， 更 有 运动感 ， 门襟 拉链 ， ， 30% 聚苯 纤维 ， 吊牌 介绍 ， 领口 、 下摆 、 袖口 采用 罗纹 设计 ， 防风 紧致 ， 洗水 咳 标识 注释 ， 不可 于 洗厚度 常规 材质 其它 袖型 常规袖 面料材质 棉 领型 圆领 流行元素 其它 款式 开衫 上市时间 2018年春季 图案 其它 版型 标准型 袖长 长袖 基础风格 青春流行 适用人群 青年 适用场景 其它 风格 青春休闲
251 | greedy:  广东省 装饰 ， 凸显 时尚 日常 ， 适合 测量 晒 搭配 大 设计 悬挂 的 人群 不可 ， 修饰 采用 门襟 设计
252 | beam:  广东省 装饰 及 ， 系带 人 选择 。 ， 大气 时尚 羽绒服 ( 设计 圆领 。 ， 彰显 粉色 浸泡 ， 保暖 舒适 秋冬 设计
253 | ref:  连帽 设计 配合 橡筋 抽绳 ， 为 帅气 卫衣 平添 动感 气息 ， 宽松 立体 的 版型 剪裁 加上 侧门 襟 撞 色织 带 工艺 ， 展现 卓尔 不同 的 设计 品味 与 青春 个性 。 插 肩袖 设计 优化 臂膀 ， 穿着 轻松 。弹性 袖 摆 收束 灵活 搭配 利落 。
254 | 
255 | user10000469@jupyter-user10000469-2dserver3708:~/notespace/model$ python3 predict.py
256 | source:  夏娃 之秀 内衣 女 文胸 无 钢圈 简约 蕾丝 性感 聚拢 调整型 收副乳 胸罩 杏色 厚杯 设计 的 骄傲 ， 采用 Q 弹 KE 魔力 胶 ， 达到 食品卫生 标准 ， 透气 蜂巢 洞洞 硅胶 ， 舒适 无 钢圈 ， 两种 杯型 量身 打造 ， KE 魔力 胶 罩杯 ， 内衣 的 细节 夏娃 从不 将 就 ， 性感 聚拢 温柔 优雅 ， 四排 四扣 ， 可 根据 体型 和 生理 周期 自由 调节 ， 丝滑 亲肤 杯面 ， 柔软 蕾丝 ， 性感 聚拢 ， 时尚 优雅 ， 加宽 肩带 ， 可拆卸 调节 ， 9 字形 挂扣 ， 穿着 更 稳定 ， 无 钢圈 柔美 蕾丝 聚拢 文胸 ， 任意 扭曲 拉伸 不 变形 ， 增强 内衣 使用寿命 ， 给 你 舒适 又 “ 有 料 ” 的 体验 ， 性感 蕾丝 女内裤 ， 杏色 正面 ， 细节展示 ， 模特 展示 ， 根据 不同 人群 打造 不同 罩杯 厚度 ， 只为 您 的 更好 体验 ， 只 为 满足 你 对 内衣 挑剔 的 标准 ， 属于 你 的 小 幸运 ， 颜色 选择 ， 更显 唯美 典雅 ， 配以 鸡心 的 小花 珍珠 装饰，， 防滑 肩带 ， 银灰 正面 ， 黑色 正面 ， 黑色 背面 ， 聚拢 挺胸 ， 银灰 背面 ， Q 弹 透明 乳胶 ， 如同 婴儿 奶嘴 般 安全 ， KE 魔力 胶 ， 魔力 挺 设计 ， 约 1cm ， 约 3cm ， 可机 洗 ， 不 变形 ， 柔软有 光泽感 的 蕾丝 印花 杯面 ， ， 收 腋下 脂肪 ， 杏色 背面 ， 柔软 ， 法国 的 蕾丝 和 花边 ， 四排 四扣适用季节 四季 图案 纯色 有无钢圈 无钢圈 功能 上托 罩杯 3/4罩杯 适合胸型 下垂 风格 性感 罩杯里料硅胶 插片 无插片 面料 锦纶 胸围尺码 80A 模杯厚度 上薄下厚模杯 适用人群 青年 款式细节 蕾丝边
257 | greedy:   <UNK> 的 正面 ， 品牌 黑色 扭曲 的 其它 ， 长裤 基础风格 的 扣 ， 男士 产品设计 的 3 身上 科技 干爽 舒适 设计
258 | beam:  ， 品牌 扭曲 的 柔软 。 ， 适合 文艺 中性 的 常规款 保暖 ， 男士 中的 卫衣 搭配 轻松 时尚 羽绒服 以及 设计
259 | ref:  舒适 无 钢圈 文胸 ， 亲肤 细腻 的 软 蕾丝 ， 有着 很 好 的 透气性 简洁 精致 ， 给 人 莫名 的 高级 感 ， 四排 四扣 有效 有收 副 乳 加上 蕾丝 边 的 点缀 ， 让 这款 文胸 更加 的 有 设计 感 ， 穿着 性感 有 女人味 ， 无 钢圈 设计 ， 穿着 无束 。、
260 | 
261 | 
262 | 


--------------------------------------------------------------------------------