├── .gitignore
├── Config.py
├── DataAugmentation.py
├── DataPreprocessing.py
├── Embedding.py
├── FeatureStructured.py
├── ModelApply.py
├── ModelTrain.py
├── README.md
├── TokenSelection.py
├── Vocabulary.py
├── image
    ├── 1573355016134.png
    ├── 1573364046216.png
    ├── 1573366328001.png
    └── 1573368628525.png
└── model
    ├── BasicModel.py
    ├── Bert
        ├── __init__.py
        ├── args.py
        ├── extract_feature.py
        ├── modeling.py
        ├── optimization.py
        └── tokenization.py
    ├── Layers.py
    ├── TextBertCNN.py
    ├── TextBertGRU.py
    ├── TextCNN.py
    ├── TextCNN_BiGRU.py
    ├── TextCapsule.py
    ├── TextConvLSTM2_Attn.py
    ├── TextConvLSTM_Attn.py
    ├── TextDPCNN.py
    ├── TextGRU2_Attn.py
    ├── TextGRU_Attn.py
    ├── TextHAN.py
    ├── TextLSTMGRU_Attn.py
    ├── TextLSTM_Attn.py
    └── TextRCNN_Attn.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Liuyaox
  2 | data/
  3 | doc/
  4 | local/
  5 | aspect_extraction.py
  6 | .idea/
  7 | 
  8 | 
  9 | 
 10 | # Byte-compiled / optimized / DLL files
 11 | __pycache__/
 12 | *.py[cod]
 13 | *$py.class
 14 | 
 15 | # C extensions
 16 | *.so
 17 | 
 18 | # Distribution / packaging
 19 | .Python
 20 | build/
 21 | develop-eggs/
 22 | dist/
 23 | downloads/
 24 | eggs/
 25 | .eggs/
 26 | lib/
 27 | lib64/
 28 | parts/
 29 | sdist/
 30 | var/
 31 | wheels/
 32 | *.egg-info/
 33 | .installed.cfg
 34 | *.egg
 35 | MANIFEST
 36 | 
 37 | # PyInstaller
 38 | #  Usually these files are written by a python script from a template
 39 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 40 | *.manifest
 41 | *.spec
 42 | 
 43 | # Installer logs
 44 | pip-log.txt
 45 | pip-delete-this-directory.txt
 46 | 
 47 | # Unit test / coverage reports
 48 | htmlcov/
 49 | .tox/
 50 | .coverage
 51 | .coverage.*
 52 | .cache
 53 | nosetests.xml
 54 | coverage.xml
 55 | *.cover
 56 | .hypothesis/
 57 | .pytest_cache/
 58 | 
 59 | # Translations
 60 | *.mo
 61 | *.pot
 62 | 
 63 | # Django stuff:
 64 | *.log
 65 | local_settings.py
 66 | db.sqlite3
 67 | 
 68 | # Flask stuff:
 69 | instance/
 70 | .webassets-cache
 71 | 
 72 | # Scrapy stuff:
 73 | .scrapy
 74 | 
 75 | # Sphinx documentation
 76 | docs/_build/
 77 | 
 78 | # PyBuilder
 79 | target/
 80 | 
 81 | # Jupyter Notebook
 82 | .ipynb_checkpoints
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # celery beat schedule file
 88 | celerybeat-schedule
 89 | 
 90 | # SageMath parsed files
 91 | *.sage.py
 92 | 
 93 | # Environments
 94 | .env
 95 | .venv
 96 | env/
 97 | venv/
 98 | ENV/
 99 | env.bak/
100 | venv.bak/
101 | 
102 | # Spyder project settings
103 | .spyderproject
104 | .spyproject
105 | 
106 | # Rope project settings
107 | .ropeproject
108 | 
109 | # mkdocs documentation
110 | /site
111 | 
112 | # mypy
113 | .mypy_cache/
114 | 


--------------------------------------------------------------------------------
/Config.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created:    2019-08-06 20:00:46
  4 | Author:     liuyao8
  5 | Descritipn: 
  6 | """
  7 | 
  8 | import argparse
  9 | 
 10 | 
 11 | class Config(object):
 12 | 
 13 |     def __init__(self):
 14 | 
 15 |         # 任务相关
 16 |         self.task = 'multilabel'
 17 |         self.token_level = 'word'       # word: word粒度  char: char粒度  both: word+char粒度
 18 |         self.N_CLASSES = 11             # 标签/类别数量
 19 |         
 20 |         
 21 |         # Embedding
 22 |         self.MIN_COUNT = 2              # 训练Embedding，创建Vocabulary时要求的低频下限
 23 |         self.PUBLIC_EMBED_DIM = 200     # 公开训练好的Embedding向量维度
 24 |         self.WORD_EMBED_DIM = 100
 25 |         self.CHAR_EMBED_DIM = 100
 26 |         self.model_word2vec_file = './local/model_word2vec.w2v'         # 训练好的Word Embedding  
 27 |         self.model_char2vec_file = './local/model_char2vec.w2v'         # 训练好的Char Embedding
 28 |         
 29 |         
 30 |         # Vocabulary
 31 |         self.PAD_IDX = 0   # PAD约定取0，不要改变，以下UNK,SOS,EOS可以改变
 32 |         self.UNK_IDX = 1   # unknow word   # TODO 原本是没有UNK的？
 33 |         self.SOS_IDX = 2   # Start of sentence
 34 |         self.EOS_IDX = 3   # End of sentence 
 35 |         self.vocab_file = './local/vocab.pkl'       # 词汇表，包含word/char,idx,vector三者之间映射字典，Embedding Layer初始化权重
 36 |         
 37 |         
 38 |         # 结构化特征
 39 |         # TODO structured改成模型定义时参数！
 40 |         self.structured = 'word'        # word: word粒度  char: char粒度  both: word+char粒度  none: 无  
 41 |         self.word_svd_n_componets = 100
 42 |         self.char_svd_n_componets = 150
 43 |         self.word_tfidf_lsa_file = './local/word_tfidf_lsa.pkl'
 44 |         self.char_tfidf_lsa_file = './local/char_tfidf_lsa.pkl'
 45 |         
 46 |         
 47 |         # Bert相关
 48 |         self.bert_flag = False
 49 |         self.bert_maxlen = 100
 50 |         self.bert_dim = 768
 51 |         self.bert_model_path = '/home/liuyao58/data/BERT/chinese_L-12_H-768_A-12/'
 52 |         self.bert_graph_tmpfile = './tmp_graph_xxx'
 53 |         self.data_bert_file = './local/bert_data.pkl'
 54 |         
 55 |         
 56 |         # 特征选择
 57 |         self.words_chi2_file = ''       # 基于卡方统计量筛选后的word
 58 |         self.chars_chi2_file = ''       # 基于卡方统计量筛选后的char
 59 |         
 60 |         
 61 |         # 数据预处理和编码
 62 |         self.data_file = './data/sku_qa_data_30000.csv'               # 处理好的标注数据，尚未编码
 63 |         self.data_encoded_file = './local/data_30000_encoded.pkl'     # 向量化编码后的训练数据
 64 |         self.WORD_MAXLEN = 100      # 57
 65 |         self.CHAR_MAXLEN = 200      # 126
 66 |         self.SENT_MAXLEN = 50       # 18
 67 |         
 68 |         
 69 |         # 训练
 70 |         self.n_gpus = 1
 71 |         self.BATCH_SIZE = 32
 72 |         self.n_folds = 5
 73 |         self.n_epochs = 10
 74 |         self.model_file = './local/model.h5'
 75 | 
 76 | 
 77 |         # 其他文件和路径
 78 |         self.annotation_file = './data/商品问答_手机_已标注_30000.xlsx'                # 原始的标注数据
 79 |         self.stopwords_files = ['./data/京东商城商品评论-Stopwords.txt', 
 80 |                                 './data/京东商城商品评论-Stopwords-other_github.txt']  # 公开停用词
 81 |         self.cleaned_all_stopwords_file = './data/cleaned_all_stopwords.txt'          # 合并处理好的公开停用词
 82 |         self.config_file = './local/config.pkl'     # config文件
 83 | 
 84 | 
 85 | 
 86 | def get_args():
 87 |     """待完善……"""
 88 |     parser = argparse.ArgumentParser()
 89 |     
 90 |     parser.add_argument('--server',         default=None, type=int, help='[6099]')
 91 |     parser.add_argument('--phase',          default=None, help='[Train/Test]')
 92 |     parser.add_argument('--sen_len',        default=None, type=int, help='sentence length')
 93 | 
 94 |     parser.add_argument('--net_name',       default=None, help='[lstm]')
 95 |     parser.add_argument('--dir_date',       default=None, help='Name it with date, such as 20180102')
 96 |     parser.add_argument('--batch_size',     default=32, type=int, help='Batch size')
 97 |     parser.add_argument('--lr_base',        default=1e-3, type=float, help='Base learning rate')
 98 |     parser.add_argument('--lr_decay_rate',  default=0.1, type=float, help='Decay rate of lr')
 99 |     parser.add_argument('--epoch_lr_decay', default=1000, type=int, help='Every # epoch, lr decay lr_decay_rate')
100 | 
101 |     parser.add_argument('--layer_num',      default=2, type=int, help='Lstm layer number')
102 |     parser.add_argument('--hidden_size',    default=64, type=int, help='Lstm hidden units')
103 |     parser.add_argument('--gpu',            default='0', help='GPU id list')
104 |     parser.add_argument('--workers',        default=4, type=int, help='Workers number')
105 | 
106 |     return parser.parse_args()
107 | 
108 | 
109 | 
110 | if __name__ == '__main__':
111 |     
112 |     args = get_args()
113 |     gpu = args.gpu
114 |     


--------------------------------------------------------------------------------
/DataAugmentation.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created:    2019-08-23 16:25:27
 4 | Author:     liuyao8
 5 | Descritipn: 样本处理：数据增强
 6 | """
 7 | 
 8 | import random
 9 | 
10 | 
11 | # 1. 数据增强
12 | 
13 | def data_enhance_for_text(texts, categories, mode='limit'):
14 |     """
15 |     数据增强，打乱老样本序列顺序以生成新样本
16 |     ARGS
17 |         texts: iterable, 每个元素是一个token列表, token既可以是token也可以是token id
18 |         categories: iterable, 每个元素是一个类别id，与texts各元素一一对应
19 |         mode: 数据增强模式
20 |             limit=基于各类别样本数量，为数量少的类别增加新样本，使各类别样本数达到 min(原样本数*2, 最大类别样本数)
21 |             double=所有类别的样本都翻倍，不管各类别原样本数量是多少
22 |     RETURN
23 |         dic2: 字典，key为cate，value为该cate对应的数据增强后的样本列表
24 |     """
25 |     assert mode in ('limit', 'double')
26 | 
27 |     # 构建类别样本字典: <类别, (样本数, 样本列表)>
28 |     dic1 = {}
29 |     for text, cate in zip(texts, categories):
30 |         if cate not in dic1:
31 |             dic1[cate] = (1, [text, ])
32 |         else:
33 |             dic1[cate][0] += 1
34 |             dic1[cate][1].append(text)
35 |     num_max = max([val[0] for val in dic1.values()])    # 最大类别样本数
36 | 
37 |     # 数据增强
38 |     dic2 = {}
39 |     for cate, (num, texts) in dic1.items():
40 |         if mode == 'limit':
41 |             num_extra = min(num, num_max - num)             # 数据增强后样本数为 min(原样本数*2, 最大类别样本数)
42 |             texts_extra = random.sample(texts, num_extra)   # 从原样本中随机挑选若干样本用于生成新样本
43 |         else:
44 |             texts_extra = texts.copy()
45 |         for text in texts_extra:
46 |             random.shuffle(text)    # 打乱原序列顺序
47 |             texts.append(text)
48 |         dic2[cate] = texts
49 |     return dic2
50 | 
51 | 
52 | 
53 | if __name__ == '__main__':
54 |     # 项目暂未使用
55 |     pass


--------------------------------------------------------------------------------
/DataPreprocessing.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created:    2019-07-03 20:53:31
 4 | Author:     liuyao8
 5 | Descritipn: 数据预处理，包括两大块
 6 |             a. 待标注数据生成：基于最原始数据，生成便于标注的数据格式
 7 |             b. 已标注数据处理：已标注数据规整、去除停用词、分词、Label规整等
 8 |                 以确保在训练Word Embedding、创建Vocabulary等时不需复杂或耗时的额外处理，可直接使用！
 9 | """
10 | 
11 | import pandas as pd
12 | import jieba
13 | import pickle
14 | from functools import reduce
15 | import re
16 | 
17 | from Config import Config
18 | config = Config()
19 | 
20 | 
21 | # 1. 原始数据 --> 待标注数据
22 | question_path = './data/cellphone_questions.txt'
23 | colnames = ['question_raw', 'spu', 'follows']
24 | data = pd.read_csv(question_path, sep='\t', header=None, names=colnames, encoding='utf8')
25 | data2 = data.sample(frac=0.86, random_state=4321)   # 30181
26 | data2.to_excel('./data/question_cellphone_20190715_30000.xlsx', header=True, index=False, encoding='utf8')
27 | 
28 | 
29 | # 2. 已标注数据 --> 训练数据
30 | # 读取数据
31 | cols_dic = {'序号': 'no', '性能&系统': 'system', '功能': 'function', '电池': 'battery', '外观': 'appearance', 
32 |             '电话&网络': 'network', '拍照': 'photo', '附件赠品': 'accessory', '购买相关': 'purchase', 
33 |             '品控': 'quality', '配置&硬件': 'hardware', '比较': 'contrast', '标注人': 'annotator'}
34 | annotation = pd.read_excel(config.annotation_file, header=1, encoding='utf8').fillna(0).rename(columns=cols_dic)
35 | annotation['question_raw'] = annotation['question_raw'].map(lambda x: ' '.join(x.split()))      # 多个空格变1个
36 | 
37 | 
38 | # 停用词
39 | stopwords = [open(x, 'r', encoding='utf8').readlines() for x in config.stopwords_files]
40 | stopwords = list(set([x.strip() for x in reduce(lambda x, y: x + y, stopwords)]))   # TODO 重要！加strip，可能会删除一个空格停用词，下面会手动添加
41 | stopwords = stopwords + ['', ' ']                                                   # TODO 非常重要！手动在停用词表中添加空字符串和空格！！！
42 | pickle.dump(stopwords, open(config.cleaned_all_stopwords_file, 'wb'))
43 | # TODO char-level的停用词应该与word的不一样！比如：'一'单独出现在word-level分词中说明没别的字可跟它组成词，它就是停用词，但出现在char-level中并不一定
44 | 
45 | 
46 | # 分词
47 | # 支持3种level: word, char, sentence(用于TextHAN)
48 | get_wordsegs = lambda x: ' '.join([seg for seg in jieba.cut(x, cut_all=False) if seg not in stopwords])     # TODO 优化点：试试cut_all=True
49 | get_charsegs = lambda x: ' '.join([seg for seg in x.replace(' ', '') if seg not in stopwords])              # char-level也要删除停用词
50 | # TODO 重要！使用sklearn.pipeline把get_wordsegs和get_charsegs保存进pipeline！！！包括其中的stopwords!!!
51 | # 句子间Tokenization + 句子内分词
52 | # 启发于：# 参考：https://github.com/richliao/textClassifier/blob/master/textClassifierHATT.py
53 | p1 = re.compile(r'[。！？!?]+')        # 删除英文句号，因为很多是数字小数点
54 | p2 = re.compile(r'[。！？，：!?,:]+')  # 同上
55 | get_sentsegs1 = lambda x: '&'.join([y for y in [
56 |             ' '.join([seg for seg in jieba.cut(sent.strip(), cut_all=False) if seg not in stopwords])
57 |             for sent in p1.split(x) if len(sent) >= 1
58 |         ] if len(y) >= 1]).strip('&').strip()
59 | get_sentsegs2 = lambda x: '&'.join([y for y in [
60 |             ' '.join([seg for seg in jieba.cut(sent.strip(), cut_all=False) if seg not in stopwords])
61 |             for sent in p2.split(x) if len(sent) >= 1
62 |         ] if len(y) >= 1]).strip('&').strip()
63 | 
64 | # question_raw取值示例：'抢了个免息券，想入手XR，我只有一个联通卡，单卡信号怎么样'
65 | annotation['question_wordseg'] = annotation['question_raw'].map(get_wordsegs)
66 | annotation['question_charseg'] = annotation['question_raw'].map(get_charsegs)
67 | annotation['question_sentseg1'] = annotation['question_raw'].map(get_sentsegs1)
68 | annotation['question_sentseg2'] = annotation['question_raw'].map(get_sentsegs2)
69 | 
70 | 
71 | # Label
72 | cols_y = ['system', 'function', 'battery', 'appearance', 'network', 'photo', 'accessory', 'purchase', 'quality', 'hardware', 'contrast']
73 | annotation['labels'] = annotation.apply(lambda se: se[cols_y][se[cols_y]==1].index.tolist(), axis=1)
74 | annotation['labels'] = annotation['labels'].apply(lambda x: '&&' if len(x) == 0 else '&&'.join(x))
75 | 
76 | 
77 | # 保存本地
78 | annotation.to_csv(config.data_file, sep='\t', index=False, encoding='utf8')
79 | 


--------------------------------------------------------------------------------
/Embedding.py:
--------------------------------------------------------------------------------
 1 | from model.TextLSTM import TextLSTM# -*- coding: utf-8 -*-
 2 | """
 3 | Created:    2019-08-06 21:24:39
 4 | Author:     liuyao8
 5 | Descritipn: 
 6 | """
 7 | 
 8 | import numpy as np
 9 | from gensim.models import Word2Vec
10 | 
11 | 
12 | class CorpusGenerator(object):
13 |     """
14 |     使用 gensim 生成 Word2Vec 所需的语料 Generator，由文件直接生成，支持 word-level 和 char-level
15 |     NOTES
16 |         文件每行必须事先完成分词或分字：每行是分隔的词或字的字符串，形如：'颜色 很 漂亮' 或 '颜 色 很 漂 亮'
17 |     """
18 |     def __init__(self, corpus_file, stopwords=[], sep=' '):
19 |         self.corpus_file = corpus_file
20 |         self.stopwords = stopwords
21 |         self.sep = sep
22 | 
23 |     def __iter__(self):
24 |         for line in open(self.corpus_file):
25 |             # 输出结果：每个元素形如['颜色', '很', '漂亮'] 或 ['颜', '色', '很', '漂', '亮']，过滤指定词或字(如停用词等)
26 |             yield [x for x in line.strip().split(self.sep) if x not in self.stopwords]
27 | 
28 | 
29 | def train_w2v_model(sentences, size=100, min_count=3, window=5, sg=1, workers=8, iter=8, compute_loss=True):
30 |     """
31 |     训练 Word2Vec 字/词向量
32 |     ARGS
33 |         sentences: iterable of sentence, 其中sentence是分字/分词列表，形如：['颜色', '很', '漂亮'] 或 ['颜', '色', '很', '漂', '亮']
34 |         其他：与Word2Vec函数参数保持一致，sg=1表示使用skip-gram算法
35 |     RETURN
36 |         model: 训练好的Word2Vec模型，包含(idx, token, vector)三者之间的4种映射字典：idx2token, idx2vector, token2idx, token2vector(即model.wv)
37 |     """
38 |     model = Word2Vec(sentences, size=size, min_count=min_count, window=window, sg=sg, workers=workers, iter=iter, compute_loss=compute_loss)
39 |     model.idx2token = {}
40 |     model.token2idx = {}
41 |     model.idx2vector = {}
42 |     for token in model.wv.vocab.keys():
43 |         idx = model.wv.vocab[token].index    # token对应的idx
44 |         model.idx2token[idx] = token
45 |         model.token2idx[token] = idx
46 |         model.idx2vector[idx] = model[token] # 可直接使用model[token]，当然也可model.wv[token]
47 |     return model
48 | 
49 | 
50 | def pretrained_embedding(embedding_file, seps=('\t', ','), header=False):
51 |     """Public Pretrained Embedding File --> Original Full Embedding"""
52 |     embedding = {}
53 |     with open(embedding_file, 'r', encoding='utf-8') as fr:
54 |         if header:
55 |             fr.readline()                        # Drop line 1
56 |         for line in fr:
57 |             values = line.strip().split(seps[0])
58 |             if len(values) >= 2:
59 |                 token = values[0]
60 |                 vector = values[1:] if seps[0] == seps[1] else values[1].split(seps[1])
61 |                 embedding[token] = np.asarray(vector, dtype='float32')
62 |     return embedding
63 | 
64 | 
65 | 
66 | def example():
67 |     """训练Word2Vec向量，并保存本地"""
68 |     import pandas as pd
69 |     from Config import Config
70 |     config = Config()
71 |     
72 |     data = pd.read_csv(config.data_file, sep='\t', encoding='utf8')
73 |     sentences_word = data['question_wordseg'].map(lambda x: str(x).strip().split(' '))
74 |     sentences_char = data['question_charseg'].map(lambda x: str(x).strip().split(' '))
75 |     
76 |     model_word2vec = train_w2v_model(sentences_word, size=config.WORD_EMBED_DIM, min_count=config.MIN_COUNT)
77 |     model_char2vec = train_w2v_model(sentences_char, size=config.CHAR_EMBED_DIM, min_count=config.MIN_COUNT, window=10, iter=15)
78 |     print(len(model_word2vec.wv.vocab))     # 5484
79 |     print(len(model_char2vec.wv.vocab))     # 1595
80 | 
81 |     model_word2vec.save(config.model_word2vec_file)
82 |     model_char2vec.save(config.model_char2vec_file)
83 |     
84 |     
85 | 
86 | if __name__ == '__main__':
87 |     
88 |     example()
89 |     


--------------------------------------------------------------------------------
/FeatureStructured.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created:    2019-08-07 20:59:13
  4 | Author:     liuyao8
  5 | Descritipn: 结构化特征如TFIDF, LSA, LSI, LDA等
  6 | """
  7 | 
  8 | from sklearn.feature_extraction.text import TfidfVectorizer
  9 | from sklearn.decomposition import TruncatedSVD
 10 | from sklearn.pipeline import make_pipeline
 11 | 
 12 | 
 13 | class FeatureStructured(object):
 14 |     
 15 |     def __init__(self):
 16 |         pass 
 17 |         
 18 |         
 19 |     # 1. TFIDF特征
 20 |     @classmethod
 21 |     def tfidf_vectorizer(cls, data, ngram_range=(1, 1), vocabulary=None, stopwords=None, max_features=None):
 22 |         """训练TFIDF模型，并生成TFIDF特征"""
 23 |         # model_tfidf.vocabulary_是训练后的字典，是features   max_features=len(vocabulary_)
 24 |         model_tfidf = TfidfVectorizer(ngram_range=ngram_range, vocabulary=vocabulary, stop_words=stopwords, 
 25 |                                       sublinear_tf=True, max_features=max_features)
 26 |         data_tfidf = model_tfidf.fit_transform(data)    # .toarray()  (9, max_features)
 27 |         return model_tfidf, data_tfidf
 28 |     
 29 |     
 30 |     # 2. LSA特征
 31 |     # LSA转换 = TFIDF转换 + SVD转换
 32 |     # In particular, truncated SVD works on term count/tf-idf matrices as returned by the vectorizers 
 33 |     # in sklearn.feature_extraction.text. In that context, it is known as latent semantic analysis (LSA).
 34 |     
 35 |     # TODO **kawgs 实现
 36 |     @classmethod
 37 |     def lsa_vectorizer(cls, data, ngram_range=(1, 1), vocabulary=None, stopwords=None, 
 38 |                        max_features=None, n_components=2, n_iter=5):
 39 |         """
 40 |         训练LSA模型，并生成LSA特征
 41 |         ARGS
 42 |             data: iterable of sentence, sentence是空格分隔的分字/分词字符串
 43 |                 形如 ['小猫咪 爱 吃肉', '我 有 一只 小猫咪', ...]  假设shape为(9, ) (即9个sentence)
 44 |             其他：参数及其默认值与 TfidfVectorizer 和 TruncatedSVD 保持一致
 45 |         USAGE  
 46 |             训练时，data既可以只是train，也可以是train+val+test，应用时分别应用于train/val/test
 47 |         """
 48 |         model_tfidf = TfidfVectorizer(ngram_range=ngram_range, vocabulary=vocabulary, stop_words=stopwords, 
 49 |                                       sublinear_tf=True, max_features=max_features)             # (9, ) -> (9, max_features)
 50 |         model_svd = TruncatedSVD(n_components=n_components, n_iter=n_iter, random_state=2019)   #       -> (9, n_components)
 51 |         model_lsa = make_pipeline(model_tfidf, model_svd)
 52 |         data_lsa = model_lsa.fit_transform(data)
 53 |         return model_lsa, data_lsa
 54 |     
 55 |     
 56 |     @classmethod
 57 |     def lsa_vectorizer_2steps(cls, data, ngram_range=(1, 1), vocabulary=None, stopwords=None, 
 58 |                               max_features=None, n_components=2, n_iter=5):
 59 |         """功能同lsa_vectorizer, 可返回训练好的TFIDF和SVD模型，假设 data 维度为(9, )"""
 60 |         # TFIDF 转换      (9, max_features)
 61 |         model_tfidf, data_tfidf = cls.tfidf_vectorizer(data, ngram_range=ngram_range, vocabulary=vocabulary, 
 62 |                                                        stopwords=stopwords, max_features=max_features)
 63 |         # SVD 转换
 64 |         model_svd = TruncatedSVD(n_components=n_components, n_iter=n_iter, random_state=2018)
 65 |         data_lsa = model_svd.fit_transform(data_tfidf)  # (9, n_components)  max_features维稀疏向量 -> n_components维稠密向量
 66 |         return model_tfidf, data_tfidf, model_svd, data_lsa
 67 |     
 68 |     
 69 |     
 70 |     # 3. LSI特征
 71 |     
 72 |     
 73 |     
 74 |     
 75 |     # 4. LDA特征
 76 | 
 77 | 
 78 | 
 79 | 
 80 |     # 5. Others
 81 | 
 82 | 
 83 | 
 84 | 
 85 | def example_lsa():
 86 |     """生成TFIDF特征、LSA特征"""
 87 |     import pandas as pd
 88 |     import pickle
 89 |     from Config import Config
 90 |     config = Config()
 91 |     
 92 |     data = pd.read_csv(config.data_file, sep='\t', encoding='utf8')
 93 |     sentences_word, sentences_char = data['question_wordseg'].fillna(''), data['question_charseg'].fillna('')
 94 |     
 95 |     vocab = pickle.load(open(config.vocab_file, 'rb'))  # 在main中运行的话，必须 from Vocabulary import Vocabulary
 96 |     
 97 |     word_model_tfidf, word_tfidf, word_model_svd, word_lsa = FeatureStructured.lsa_vectorizer_2steps(
 98 |             sentences_word, vocabulary=vocab.word2idx, n_components=config.word_svd_n_componets)  # 指定vocabulary，保证全局一致性
 99 |     char_model_tfidf, char_tfidf, char_model_svd, char_lsa = FeatureStructured.lsa_vectorizer_2steps(
100 |             sentences_char, vocabulary=vocab.char2idx, n_components=config.char_svd_n_componets)
101 |     
102 |     # TODO char粒度的特征计算好像有点问题！
103 |     pickle.dump((word_model_tfidf, word_tfidf, word_model_svd, word_lsa), open(config.word_tfidf_lsa_file, 'wb'))
104 |     pickle.dump((char_model_tfidf, char_tfidf, char_model_svd, char_lsa), open(config.char_tfidf_lsa_file, 'wb'))
105 | 
106 | 
107 | 
108 | if __name__ == '__main__':
109 |     
110 |     example_lsa()
111 | 


--------------------------------------------------------------------------------
/ModelApply.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created:    2019-08-23 15:20:07
 4 | Author:     liuyao8
 5 | Descritipn: 
 6 | """
 7 | 
 8 | import pickle
 9 | from ModelTrain import get_encoding_func, get_sides_encoding_func
10 | from Vocabulary import Vocabulary
11 | from Config import Config
12 | config = Config()
13 | 
14 | 
15 | # 加载config
16 | config = pickle.load(open(config.config_file, 'rb'))
17 | 
18 | 
19 | # 应用数据处理
20 | 
21 | 
22 | # 模型应用
23 | 


--------------------------------------------------------------------------------
/ModelTrain.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created:    2019-07-31  14:51:45
  4 | Author:     liuyao8
  5 | Descritipn: 
  6 | """
  7 | 
  8 | from numpy import array, zeros
  9 | from pandas import read_csv
 10 | import pickle
 11 | from scipy.sparse import csr_matrix
 12 | from sklearn.preprocessing import MultiLabelBinarizer
 13 | from sklearn.model_selection import train_test_split
 14 | 
 15 | from Vocabulary import seq_to_idxs
 16 | 
 17 | 
 18 | def get_encoding_func(vocab, config):
 19 |     """创建工具：用于生成word和char粒度的数据编码"""
 20 |     word_encoding = lambda x: seq_to_idxs(str(x).split(), vocab.word2idx, config.WORD_MAXLEN, config.UNK_IDX, config.PAD_IDX)
 21 |     char_encoding = lambda x: seq_to_idxs(str(x).split(), vocab.char2idx, config.CHAR_MAXLEN, config.UNK_IDX, config.PAD_IDX)
 22 |     return word_encoding, char_encoding
 23 | 
 24 | 
 25 | def get_sides(x, maxlen):
 26 |     """生成left和right原始数据(未编码) for TextRCNN  注意：只截断不补零"""
 27 |     xs = str(x).split()[: maxlen]   # 截断
 28 |     x_left = ' '.join(['UNK'] + xs[:-1])
 29 |     x_right = ' '.join(xs[1:] + ['UNK'])
 30 |     return x_left, x_right
 31 | 
 32 | 
 33 | def get_sides_encoding_func(vocab, config):
 34 |     """创建工具：用于生成left和right原始数据并编码 for TextRCNN"""
 35 |     word_encoding, char_encoding = get_encoding_func(vocab, config)
 36 |     word_left_encoding = lambda x: word_encoding(get_sides(x, config.WORD_MAXLEN)[0])
 37 |     word_right_encoding = lambda x: word_encoding(get_sides(x, config.WORD_MAXLEN)[1])
 38 |     char_left_encoding = lambda x: char_encoding(get_sides(x, config.CHAR_MAXLEN)[0])
 39 |     char_right_encoding = lambda x: char_encoding(get_sides(x, config.CHAR_MAXLEN)[1])
 40 |     return word_left_encoding, word_right_encoding, char_left_encoding, char_right_encoding
 41 | 
 42 | 
 43 | def get_bert_model(config):
 44 |     """创建预训练Bert模型：用于对raw文本编码，raw文本不需分词"""
 45 |     from model.Bert.extract_feature import BertVector
 46 |     bert_model = BertVector(pooling_strategy='NONE', 
 47 |                             max_seq_len=config.bert_maxlen, 
 48 |                             bert_model_path=config.bert_model_path, 
 49 |                             graph_tmpfile=config.bert_graph_tmpfile)
 50 |     return bert_model
 51 | 
 52 | 
 53 | def sent_array(x_sent_raw, config, word_encoding):
 54 |     """
 55 |     向量化编码：Sentence粒度, for TextHAN  
 56 |     编码后document形如下行：其中--表示sentence，|表示其向量结束，WORD_MAXLEN=10, SENT_MAXLEN=6, 编码前是4个sentence
 57 |     --------00|------0000|----------|-------000|0000000000|0000000000
 58 |     参考：https://github.com/richliao/textClassifier/blob/master/textClassifierHATT.py
 59 |     """
 60 |     x_sent = zeros((len(x_sent_raw), config.SENT_MAXLEN, config.WORD_MAXLEN), dtype='int32')  # Sentence特征是3维，其他特征是2维
 61 |     # sent_encoding只进行各个句子内的Word Level编码，编码后结果形如：--------00|------0000|----------|-------000
 62 |     sent_encoding = lambda x: array([word_encoding(sent) for sent in str(x).split('&')[: config.SENT_MAXLEN]], dtype='int32') # 截断
 63 |     for i, sents in enumerate(x_sent_raw):
 64 |         sents_vector = sent_encoding(sents)
 65 |         j, k = sents_vector.shape
 66 |         x_sent[i, :j, :k] = sents_vector
 67 |     return x_sent
 68 | 
 69 | 
 70 | def data_config_prepare(config):
 71 |     """特征编码，Label编码，Train/Test划分，Config生成，持久化"""
 72 |     # 0. 数据准备
 73 |     data = read_csv(config.data_file, sep='\t', encoding='utf8')
 74 |     data['labels'] = data['labels'].map(lambda x: [] if x == '&&' else x.split('&&'))
 75 |     x_raw, x_word_raw, x_char_raw, x_sent_raw1, x_sent_raw2, y_raw = data['question_raw'], \
 76 |         data['question_wordseg'], data['question_charseg'], data['question_sentseg1'], data['question_sentseg2'], data['labels']
 77 |     
 78 |     vocab = pickle.load(open(config.vocab_file, 'rb'))      # 词汇表，映射字典，Embedding Layer初始化权重
 79 |     config.CHAR_VOCAB_SIZE = vocab.char_vocab_size
 80 |     config.WORD_VOCAB_SIZE = vocab.word_vocab_size
 81 |     config.char_embed_matrix = vocab.char_embed_matrix
 82 |     config.word_embed_matrix = vocab.word_embed_matrix
 83 |     config.WORD_MAXLEN = int(1.5 * x_word_raw.map(lambda x: len(str(x).split())).max())     # 57
 84 |     config.CHAR_MAXLEN = int(1.5 * x_char_raw.map(lambda x: len(str(x).split())).max())     # 126
 85 |     config.SENT_MAXLEN = int(1.5 * x_sent_raw2.map(lambda x: len(str(x).split('&'))).max()) # 18
 86 |     config.SENT_MAXLEN = 5
 87 |     
 88 |     # 1. Token筛选
 89 |     
 90 |     
 91 |     # 2. 特征和Label向量化编码  以下特征中不需要的特征可直接删除，如left和right特征、Bert编码特征、Sentence特征等
 92 |     # word和char特征
 93 |     word_encoding, char_encoding = get_encoding_func(vocab, config)
 94 |     x_word = array(x_word_raw.map(word_encoding).tolist(), dtype='int32')
 95 |     x_char = array(x_char_raw.map(char_encoding).tolist(), dtype='int32')
 96 |     
 97 |     
 98 |     # left和right特征  only for TextRCNN
 99 |     word_left_encoding, word_right_encoding, char_left_encoding, char_right_encoding = get_sides_encoding_func(vocab, config)
100 |     x_word_left = array(x_word_raw.map(word_left_encoding).tolist(), dtype='int32')
101 |     x_word_right = array(x_word_raw.map(word_right_encoding).tolist(), dtype='int32')
102 |     x_char_left = array(x_char_raw.map(char_left_encoding).tolist(), dtype='int32')
103 |     x_char_right = array(x_char_raw.map(char_right_encoding).tolist(), dtype='int32')
104 |     
105 |     
106 |     # 结构化特征
107 |     word_model_tfidf, x_word_tfidf, word_model_svd, x_word_lsa = pickle.load(open(config.word_tfidf_lsa_file, 'rb'))
108 |     #char_model_tfidf, char_tfidf, char_model_svd, char_lsa = pickle.load(open(config.char_tfidf_lsa_file, 'rb'))
109 |     
110 |     
111 |     # Bert编码特征   速度超级慢！怎么解决？ # TODO
112 |     # 对整个句子编码，不需分词！编码向量shape与一般情况下分词后编码不一样，比如"变速箱挺好的"shape为8，"变速箱"shape为5，"变速"shape为4
113 |     bert_model = get_bert_model(config)
114 |     bert_vectorizer = lambda x: csr_matrix(bert_model.encode([x])["encodes"][0])
115 |     x_bert = array(x_raw.map(bert_vectorizer).tolist(), dtype='int32')
116 |     
117 |     
118 |     # Sentence特征  only for TextHAN
119 |     x_sent1 = sent_array(x_sent_raw1, config, word_encoding)
120 |     x_sent2 = sent_array(x_sent_raw2, config, word_encoding)
121 |     
122 |     
123 |     # Label
124 |     mlb = MultiLabelBinarizer()
125 |     y_data = mlb.fit_transform(y_raw)       # TODO 使用训练数据还是所有数据来训练mlb？？？
126 |     config.N_CLASSES = len(mlb.classes_)
127 |     config.label_binarizer = mlb
128 |     
129 |     
130 |     # 3. 划分并保存Train/Test    
131 |     x_word_train, x_word_test, x_word_left_train, x_word_left_test, x_word_right_train, x_word_right_test, \
132 |     x_char_train, x_char_test, x_char_left_train, x_char_left_test, x_char_right_train, x_char_right_test, \
133 |     x_word_lsa_train, x_word_lsa_test, \
134 |     x_bert_train, x_bert_test, \
135 |     x_sent1_train, x_sent1_test, x_sent2_train, x_sent2_test, \
136 |     y_train, y_test = train_test_split(
137 |             x_word, x_word_left, x_word_right, 
138 |             x_char, x_char_left, x_char_right, 
139 |             x_word_lsa, 
140 |             x_bert,                 # bert编码特征计算太慢，可删除该行，不使用bert编码特征
141 |             x_sent1, x_sent2,
142 |             y_data, 
143 |             test_size=0.2, random_state=2019
144 |     )
145 |     x_train = {
146 |         'word': x_word_train,
147 |         'word_left': x_word_left_train,
148 |         'word_right': x_word_right_train,
149 |         'word_structured': x_word_lsa_train,
150 |         'char': x_char_train, 
151 |         'char_left': x_char_left_train,
152 |         'char_right': x_char_right_train,
153 |         'sentence1': x_sent1_train,
154 |         'sentence2': x_sent2_train
155 |     }
156 |     x_test = {
157 |         'word': x_word_test,
158 |         'word_left': x_word_left_test,
159 |         'word_right': x_word_right_test,
160 |         'word_structured': x_word_lsa_test,
161 |         'char': x_char_test,
162 |         'char_left': x_char_left_test,
163 |         'char_right': x_char_right_test,
164 |         'sentence1': x_sent1_test,
165 |         'sentence2': x_sent2_test
166 |     }
167 |     
168 |     # 保存编码后数据
169 |     pickle.dump((x_train, y_train, x_test, y_test), open(config.data_encoded_file, 'wb'))
170 |     pickle.dump((x_bert_train, y_train, x_bert_test, y_test), open(config.data_bert_file, 'wb'))
171 |     pickle.dump(config, open(config.config_file, 'wb'))
172 | 
173 | 
174 | def data_augmentation():
175 |     """数据增强"""
176 |     pass
177 | 
178 | 
179 | def example(bert_flag=False):
180 |     from Vocabulary import Vocabulary
181 |     from Config import Config
182 |     config = Config()
183 |     
184 |     
185 |     # Data和Config准备
186 |     data_config_prepare(config)
187 |     config = pickle.load(open(config.config_file, 'rb'))
188 |     data_file = config.data_bert_file if bert_flag else config.data_encoded_file
189 |     x_train, y_train, x_test, y_test = pickle.load(open(data_file, 'rb'))
190 |     
191 |     
192 |     # 根据实际情况修改，也可直接在Config.py里修改，推荐前者
193 |     config.n_gpus = 1
194 |     config.token_level = 'word'
195 |     config.structured = 'none'
196 |     config.bert_flag = False
197 |     
198 |     
199 |     # 模型训练 评估 保存
200 |     if not bert_flag:   # 一般模型
201 |         from model.TextCNN import TextCNN
202 |         textcnn = TextCNN(config)
203 |         test_acc, scores, sims, vectors, _, _ = textcnn.train_evaluate(x_train, y_train, x_test, y_test, epochs=(2, 10))
204 |         textcnn.model.save(config.model_file)
205 |     
206 |     else:               # Bert模型
207 |         config.bert_flag = True
208 |         x_train = array([term.toarray() for term in x_train], dtype='int32')
209 |         x_test = array([term.toarray() for term in x_test], dtype='int32')
210 |         from model.TextBertGRU import TextBertGRU
211 |         textbertgru = TextBertGRU(config)
212 |         test_acc, scores, sims, vectors, history = textbertgru.train_evaluate(x_train, y_train, x_test, y_test)
213 |         textbertgru.model.save(config.model_file)
214 |     
215 | 
216 | 
217 | if __name__ == '__main__':
218 |     
219 |     example()
220 |     


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Text Classification
  2 | 
  3 | 基于Keras的15种模型：TextCNN, TextRNN, TextDPCNN, TextRCNN, TextHAN, TextBert等及其变种
  4 | 
  5 | 支持5类特征及其组合：word-level, char-level, 结构化特征(TFIDF, LSA), Context特征(word-left, word-right, char-left, char-right), sentence-level
  6 | 
  7 | 支持4种分类任务：单标签二分类，单标签多分类，多标签二分类，多标签多分类
  8 | 
  9 | ## Task & Data
 10 | 
 11 | 任务描述：给定一个关于手机的用户提问，判断用户关注的是手机哪些Labels。
 12 | 
 13 | Labels: System, Function, Battery, Appearance, Network, Photo, Accessory, Purchase, Quality, Hardware, Contrast
 14 | 
 15 | 已标注数据集共有30,000，以下为示例：
 16 | 
 17 | ![1573355016134](./image/1573355016134.png)
 18 | 
 19 | 所以，任务类型：**多标签二分类**(Multi-label Binary Classification)任务，共有11个Labels，每个Label有2种取值(关注，不关注)。
 20 | 
 21 | 虽然数据集是关于多标签二分类任务的，但本项目代码适用于**4种分类任务中的任何1种**，只取简单修改Config.py文件即可，基模型定义文件BasicModel.py会自动处理。
 22 | 
 23 | #### 附录1：Config.py和BasicModel.py中关于任务类型的配置和处理代码
 24 | 
 25 | ```python
 26 | # Config.py
 27 | self.task = 'multilabel'
 28 | self.token_level = 'word'       # word: word粒度  char: char粒度  both: word+char粒度
 29 | self.N_CLASSES = 11             # 标签/类别数量
 30 | 
 31 | # BasicModel.py
 32 | # 任务类型决定了类别数量、激活函数、损失函数和评估指标
 33 | if config.task == 'binary':                # 单标签二分类
 34 |     self.n_classes =  1
 35 |     self.activation = 'sigmoid'
 36 |     self.loss = 'binary_crossentropy'
 37 |     self.metrics = ['accuracy']
 38 | elif config.task == 'categorical':         # 单标签多分类
 39 |     self.n_classes = config.N_CLASSES
 40 |     self.activation = 'softmax'
 41 |     self.loss = 'categorical_crossentropy'
 42 |     self.metrics = ['accuracy']
 43 | elif config.task == 'multilabel':          # 多标签二分类(多标签多分类需转化为多标签二分类)
 44 |     self.n_classes = config.N_CLASSES
 45 |     self.activation = 'sigmoid'
 46 |     self.loss = 'binary_crossentropy'
 47 |     self.metrics = ['accuracy']
 48 | ```
 49 | 
 50 | #### 附录2：4种分类任务及其处理方法
 51 | 
 52 | - a. 单标签二分类
 53 | 
 54 |     输出为Dense(1, activation='sigmoid')，应用时1个概率值判断其与阈值大小
 55 | 
 56 | - b. 单标签N分类
 57 | 
 58 |     输出为Dense(N, activation='softmax')，应用时N个概率值取Top1
 59 | 
 60 | - c. M标签二分类
 61 |   - **c.1** 一个输出：输出为Dense(M, activation=‘sigmoid’)，应用时M个概率值取TopK或与阈值判断大小
 62 |   - c.2 一个输出：问题转化为M分类，类似于b，模型输出结构同b，应用时方法同c.1
 63 | 
 64 | - d. M标签N分类
 65 |   - d.1 一个输出：问题转化为MN标签二分类，同c.1
 66 |   - d.2 一个输出：问题转化为MN分类，同c.2
 67 |   - d.3 M个输出：每个输出都是b，模型输出结构、应用时方法都同b 待尝试
 68 | 
 69 | 备注：本项目使用的处理方法是c.1
 70 | 
 71 | ## Requirement
 72 | 
 73 | Python 3.6.5
 74 | 
 75 | Keras 2.2.4
 76 | 
 77 | Numpy 1.16.3
 78 | 
 79 | Pandas 0.23.0
 80 | 
 81 | SciPy 1.1.0
 82 | 
 83 | Sklearn 0.21.3
 84 | 
 85 | ## Data Preprocessing
 86 | 
 87 | 数据预处理环节流程步骤如下图所示：
 88 | 
 89 | ![1573364046216](./image/1573364046216.png)
 90 | 
 91 | #### 数据清洗和准备
 92 | 
 93 | 文件：[DataPreprocessing.py](https://github.com/liuyaox/text_classification/blob/master/DataPreprocessing.py)
 94 | 
 95 | 内容：简单而通用的功能，如标注数据处理，分词，分字，分句子，过滤停用词，处理原始Labels
 96 | 
 97 | #### Embedding相关
 98 | 
 99 | 文件：[Embedding.py](https://github.com/liuyaox/text_classification/blob/master/Embedding.py)
100 | 
101 | 内容：自己训练Word Embedding，读取公开训练的Word Embedding，支持word+char两种粒度
102 | 
103 | #### Vocabulary相关
104 | 
105 | 文件：[Vocabulary.py](https://github.com/liuyaox/text_classification/blob/master/Vocabulary.py)
106 | 
107 | 内容：
108 | 
109 | 生成词汇表，支持低频高频词过滤
110 | 
111 | 基于Embedding生成<word, idx, vector>三者之间的映射字典
112 | 
113 | 生成Embedding Layer初始化权重
114 | 
115 | 基于映射字典的向量化编码工具(支持截断、补零、including和excluding)
116 | 
117 | 以上功能支持word+char两种粒度
118 | 
119 | #### 结构化特征
120 | 
121 | 文件：[FeatureStructured.py](https://github.com/liuyaox/text_classification/blob/master/FeatureStructured.py)
122 | 
123 | 内容：生成TFIDF特征和LSA特征，支持word+char两种粒度，后续会增加支持LSI, LDA等其他特征
124 | 
125 | #### 特征选择
126 | 
127 | 文件：[TokenSelection.py](https://github.com/liuyaox/text_classification/blob/master/TokenSelection.py)
128 | 
129 | 内容：基于卡方统计值等过滤词和字，项目暂未使用
130 | 
131 | #### 数据编码
132 | 
133 | 文件 ：[ModelTrain.py](https://github.com/liuyaox/text_classification/blob/master/ModelTrain.py)
134 | 
135 | 内容：使用向量化编码工具和MultiLabelBinarizer对特征和Label进行编码
136 | 
137 | #### 数据增强
138 | 
139 | 文件 ：[DataAugmentation.py](https://github.com/liuyaox/text_classification/blob/master/DataAugmentation.py)
140 | 
141 | 内容：通过Shuffle和Random Drop进行数据增强，项目暂未使用
142 | 
143 | ## Model
144 | 
145 | 使用了多个Model，各Model结构关系如下图所示：
146 | 
147 | ![1573366328001](./image/1573366328001.png)
148 | 
149 | #### 使用类继承方式实现三层类定义
150 | 
151 | - BasicModel: 所有模型基类
152 | 
153 |   实现3种Metrics
154 | 
155 | - BasicDeepModel: 深度学习模型基类
156 | 
157 |   通用Layers创建
158 | 
159 |   绘制Loss和Metrics
160 | 
161 |   Embedding冻结和解冻
162 | 
163 |   模型训练和评估（支持CV）
164 | 
165 |   学习率Schedular
166 | 
167 | - BasicStatModel: 传统模型基类
168 | 
169 |   暂未实现
170 | 
171 | #### 实现6大类模型(绿色)：共15个模型
172 | 
173 | - TextCNN：标配和基础
174 | 
175 | - TextRNN：同上，可玩的地方更多
176 | 
177 | - TextRCNN：结合CNN和RNN的优点
178 | 
179 | - TextDPCNN：受ResNet启发，结合RNN+CNN
180 | 
181 | - TextHAN：使用了层次注意力机制
182 | 
183 | - TextBert：在TextGRU基础上把输入改为Bert编码的向量
184 | 
185 | - 此外，还有5大类待实现模型(灰色)
186 | 
187 | #### 三层类模型+全局Config的便捷之处
188 | 
189 | - 支持所有分类任务：二分类，多分类，多标签二分类，多标签多分类
190 | 
191 | - 支持各种输入组合：
192 | 
193 |   [word, char, word-structure, char-structure]中任意的4选1，4选2，4选3，4选4
194 | 
195 |   另外对于一些特殊模型，支持特殊输入，如TextRCNN模型的Context特征(word-left, word-right, char-left, char-right)，以及TextHAN模型的Sentence-level特征
196 | 
197 | - 模型训练评估支持KFold，支持6种Finetuning方式
198 | 
199 | - 绝大多数模型支持Attention，绝大多数模型支持丰富的参数配置
200 | 
201 | ## Train & Evaluation
202 | 
203 | ### Train
204 | 
205 | Step1: 运行DataPreprocessing.py，基于已标注数据生成训练数据并保存本地
206 | 
207 | Step2: 运行Embedding.py，自己训练Embedding，读取公开训练的Embedding，支持char+word两种粒度
208 | 
209 | Step3: 运行Vocabulary.py，生成词汇表，基于Embedding生成映射字典，生成Embedding Layer初始化权重矩阵等，支持char+word两种粒度
210 | 
211 | Step4: 运行FeatureStructured.py，生成TFIDF特征和LSA特征，支持word+char两种粒度
212 | 
213 | Step5: 运行[ModelTrain.py](https://github.com/liuyaox/text_classification/blob/master/ModelTrain.py)，项目全流程，包括：数据准备、Token筛选、特征和Label编码、划分Train/Test、环境配置、模型生成、模型训练和评估、模型持久化，详见脚本注释。
214 | 
215 | 运行脚本：python3 ModelTrain.py
216 | 
217 | 在运行脚本之前，先修改脚本里的配置项，内容如下：
218 | 
219 | ```python
220 | # 根据实际情况修改，也可直接在Config.py里修改，推荐前者
221 | config.n_gpus = 1
222 | config.token_level = 'word'		# 只使用word-level特征，不使用char-level
223 | config.structured = 'none'		# 不使用结构化特征
224 | config.bert_flag = False		# 不使用Bert编码的输入向量
225 | ```
226 | 
227 | ### Evaluation
228 | 
229 | 15个模型的评估结果如下表所示：
230 | 
231 | ![1573368628525](./image/1573368628525.png)
232 | 
233 | 备注：模型并未进行精细化调参，大多是默认配置和参数，效果仅供参考。
234 | 
235 | 从评估结果中可得出以下结论：
236 | 
237 | #### 同一模型内
238 | 
239 | - word+char相比word，效果明显有提升
240 | 
241 | - word+char+structured相比word+char，效果提升不明显，一些情况下反而会下降
242 | 
243 | #### 不同模型间
244 | 
245 | - TextCNN训练最快，Precision和F1值相对也较高，可作为一个强有力的Baseline
246 | 
247 | - TextRNN训练很慢，效果不是特别好，可能是因为训练数据很多是短文本
248 | 
249 | - 各模型之间效果差不多(全是默认参数，没时间做精细化调参)
250 | - 输入改为Bert编码向量后效果提升比较明显，简单的模型(TextGRU)就得到了最好的F1值，后续值得好好研究
251 | - TextHAN比较给力，取到了最高的Precision，后续值得好好研究
252 | 
253 | ## Conclusion
254 | 
255 | 1. **一个脚本只干一件事情，一件事情只在一个脚本里干**，各脚本解耦，各功能独立，互相之间只通过持久化和Config共享信息
256 | 
257 | 2. 充分利用**类和继承以及闭包**，相同功能不要重复定义，也不要到处粘贴复制，相似的功能通过闭包来实现
258 | 
259 | 3. Vocabulary及相关映射字典、Embedding权重，**封装整合为一个class**，统一管理
260 | 
261 | 4. 调试便捷化+逻辑清晰化
262 | 
263 |    a. 训练和应用**数据封装进字典**，单输入和多输入使用无差别，字典key对应模型搭建时Input的参数name
264 | 
265 |    b. 动态搭建模型，使其无缝支持多种输入及其组合
266 | 
267 |    方法：通用方法位于父类BasicDeepModel，各子类模型分为**模型主体和模型结尾**2部分，模型核心的纯粹的结构位于模型主体，根据输入不同，进行配置和组装，然后接入模型结尾
268 | 
269 |    c. 不同类模型，先选择最简单的模型如TextCNN，深入研究经验和Tricks，然后复制到别的模型
270 | 
271 |    d. 同一类模型，先搭建并跑通最简单的模型，随后基于评估效果，逐渐加深加宽
272 | 
273 | 5. 模型组件
274 | 
275 |    a. CNN+RNN是标配，CNN提取关键词，RNN适合前几层，提取依赖信息，Attention和MaxPooling可突出关键特征
276 | 
277 |    b. Capsule可代替CNN，有时效果好于CNN
278 | 
279 |    c. 有条件就使用Bert
280 | 
281 | ## Reference
282 | 
283 | #### Code
284 | 
285 | - 文本分类 - Keras
286 | 
287 |   <https://github.com/nlpjoe/daguan-classify-2018>
288 | 
289 |   <https://github.com/yongzhuo/Keras-TextClassification>
290 | 
291 |   <https://github.com/ShawnyXiao/TextClassification-Keras>
292 | 
293 | - 多标签分类 - PyTorch
294 | 
295 |   <https://github.com/chenyuntc/PyTorchText> (2017知乎看山杯 多标签文本分类大赛 Rank1)
296 | 
297 |   <https://github.com/Magic-Bubble/Zhihu> (同上，Rank2)
298 | 
299 | #### Libray
300 | 
301 | - [kashgari](https://github.com/BrikerMan/Kashgari) : NLP框架，超级傻瓜，超级Cutting Edge
302 | 
303 | - [hyperas](https://github.com/maxpumperla/hyperas) : Keras超参数优化工具
304 | 
305 | - [sk-multilearn](https://github.com/scikit-multilearn/scikit-multilearn) : Sklearn生态下的多标签分类工具
306 | 
307 | #### Article
308 | 
309 | - [用深度学习（CNN RNN Attention）解决大规模文本分类问题 - 综述和实践 ](https://zhuanlan.zhihu.com/p/25928551)
310 | 
311 | - [在文本分类任务中，有哪些论文中很少提及却对性能有重要影响的tricks？](https://www.zhihu.com/question/265357659)
312 | 
313 | 


--------------------------------------------------------------------------------
/TokenSelection.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created:    2019-08-07 21:13:53
  4 | Author:     liuyao8
  5 | Descritipn: word/char选择，基于卡方统计量。
  6 |             TODO 注意！word/char是否筛选上了，在Embedding和Vocabulary时可以先不考虑，主要在向量化编码时再考虑是否过滤
  7 | """
  8 | 
  9 | import numpy as np
 10 | from collections import Counter
 11 | 
 12 | 
 13 | # TODO 建议 onlyin和excluding都要有，有时excluding使用更方便！
 14 | 
 15 | # 基于卡方统计量，进行特征选择
 16 | 
 17 | def occurrence_matrix(texts, categories):
 18 |     """
 19 |     基于texts和category原始数据，计算token与category的共现矩阵
 20 |     ARGS
 21 |         texts: iterable, 每个元素是一个token列表, token既可以是token也可以是token id
 22 |         categories: iterable, 每个元素是一个类别id，与texts各元素一一对应
 23 |     RETURN
 24 |         tokens: tokens列表
 25 |         matrix: 列表，元素与tokens一一对应，相当于token与category共现矩阵，可用于计算两者卡方统计量，从而进行特征选择(token选择)
 26 |     NOTES
 27 |         注意，要求categories是向量化后的类别id，且要求类别id从0开始依次递增，如0,1,2,3,...
 28 |     """
 29 |     cates_num = len(set(categories))
 30 |     dic = {}
 31 |     for text, cate in zip(texts, categories):
 32 |         for token in set(text):
 33 |             if token not in dic:
 34 |                 dic[token] = [0] * cates_num
 35 |                 dic[token][cate] += 1
 36 |             else:
 37 |                 dic[token][cate] += 1
 38 |     tokens = list(dic.keys())
 39 |     matrix = list(dic.values())
 40 |     return matrix, tokens
 41 | 
 42 | 
 43 | def chi2_value(matrix, mask=True):
 44 |     """
 45 |     基于共现矩阵计算卡方统计量
 46 |     ARGS
 47 |         matrix: 二维array或list，共现矩阵，以word，document和document category为例，行是word，列是category，某行某列取值表示：当前category下含有当前word的document数量
 48 |         mask: 当category下含有word的document数量为0时，是否不再计算category与word的卡方统计量
 49 |     RETURN
 50 |         values: 卡方统计量，等于(AD-BC)^2*N/((A+B)(A+C)(B+D)(C+D))
 51 |     """
 52 |     A = np.array(matrix, dtype=np.float)        # A: category下含有word的样本数量，注意类型为float，以便于后续各种复杂计算
 53 |     word_sum = np.sum(A, 1).reshape((-1, 1))    # 各行对应的样本数，转化为列向量
 54 |     type_sum = np.sum(A, 0)                     # 各列对应的样本数
 55 |     N = np.sum(type_sum)                        # N: 总样本数量  各行各列总和
 56 |     B = word_sum - A                            # B: 非category下含有word的样本数量
 57 |     C = type_sum - A                            # C: category下不含有word的样本数量
 58 |     D = N - A - B - C                           # D: 非category下不含有word的样本数量
 59 |     # 若针对每一列，当前列内比较各行，而确定某列后，N, A+C, B+D都是确定不变的，可省略
 60 |     # 若针对每一行，当前行内比较各列，而确定某行后，N, A+B, C+D都是确定不变的，可省略
 61 |     values = N * (A * D - B * C) ** 2 / ((A + B) * (A + C) * (B + D) * (C + D))
 62 |     if mask:
 63 |         masking = np.sign(A)       # 当A=0时，value应该为0
 64 |         values = masking * values
 65 |     return values, A, B, C, D, N
 66 | 
 67 | 
 68 | def feature_select_by_chi2(matrix, features, max_col_num=1000, mode='column', mask=True):
 69 |     """
 70 |     基于卡方统计量进行特征选择
 71 |     ARGS
 72 |         matrix,mask同chi2_value
 73 |         features: 特征列表，特征顺序务必要与matrix各行/列保持一致！用于特征索引转换为特征
 74 |         max_col_num: 每列可选择的特征数量最大值
 75 |         model: 特征选择的模式，column=各列分别选择特征然后汇总选择的特征，max=取特征各列卡方值最大值为特征卡方值从而选择特征，avg=取平均值
 76 |     RETURN
 77 |         cnter: collections.Counter，类似字典，表示选择的特征，及其被多少列选择
 78 |         selected: 列表，表示选择的特征
 79 |     """
 80 |     values, A, _, _, _, _ = chi2_value(matrix, mask)
 81 |     # 共有3种模式进行特征选择
 82 |     if mode == 'column':
 83 |         masking = np.sign(A)
 84 |         col_num = np.sum(masking, 0, dtype=np.int64)    # 各列拥有的特征数量，注意dtype为int，否则为float
 85 |         selected = []
 86 |         for i in range(A.shape[1]):                     # 遍历各列
 87 |             indices = np.argsort(values[:, i])          # 按卡方统计量排序各特征，取其排序索引
 88 |             k = min(max_col_num, col_num[i])
 89 |             topk = [features[i] for i in indices[-k:]]  # 前k个特征
 90 |             selected.extend(topk)
 91 |         cnter = Counter(selected)
 92 |         return cnter
 93 |     elif mode == 'avg':
 94 |         value = np.mean(values, axis=1)
 95 |     elif mode == 'max':
 96 |         value = np.max(values, axis=1)
 97 |     else:
 98 |         raise ValueError('mode must be column, avg or max !')
 99 |     indices = np.argsort(value)
100 |     selected = [features[i] for i in indices[-max_col_num:]]
101 |     return selected
102 | 
103 | 
104 | 
105 | if __name__ == '__main__':
106 |     
107 |     # 以下只是示例，项目中暂时未使用特征选择
108 |     # 示例：基于卡方统计量进行特征选择
109 |     texts = [['t1', 't2', 't3', 't4'], ['t2', 't3', 't5'], ['t1', 't4', 't5'], ['t2','t4'], ['t3', 't4'], ['t1', 't3', 't4']]
110 |     categories = [1, 2, 0, 1, 0, 1]
111 |     matrix, tokens = occurrence_matrix(texts, categories)
112 |     cnter = feature_select_by_chi2(matrix, tokens)  # cnter即为选择的特征及其被选择的次数
113 | 


--------------------------------------------------------------------------------
/Vocabulary.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created:    2019-07-31 16:26:01
  4 | Author:     liuyao8
  5 | Descritipn: a. 通用Vocabulary：token支持char和word，(token, idx, vector)之间4种映射字典，Embedding Layer初始化权重
  6 |             b. 向量化编码工具：支持Padding和Truncating，支持X和Label，支持including和excluding
  7 | """
  8 | 
  9 | import numpy as np
 10 | from functools import reduce
 11 | from tqdm import tqdm
 12 | from gensim.models import Word2Vec
 13 | 
 14 | 
 15 | # 与config保持一致
 16 | # Default word tokens   # TODO 除这4个外，是否还应该有一些别的，比如空格？见P115
 17 | PAD_IDX = 0   # PAD约定取0，不要改变，以下UNK,SOS,EOS可以改变
 18 | UNK_IDX = 1   # unknow word   # TODO 原本是没有UNK的？
 19 | SOS_IDX = 2   # Start of sentence
 20 | EOS_IDX = 3   # End of sentence 
 21 | 
 22 | 
 23 | class Vocabulary(object):
 24 |     """token词汇表，token包括word和character"""
 25 |     
 26 |     def __init__(self):
 27 |         # 通用信息
 28 |         self.token2idx_init = {'PAD': PAD_IDX, 'UNK': UNK_IDX, 'SOS': SOS_IDX, 'EOS': EOS_IDX}
 29 |         self.idx2token_init = {PAD_IDX: 'PAD', UNK_IDX: 'UNK', SOS_IDX: 'SOS', EOS_IDX: 'EOS'}
 30 |         
 31 |         # Word Level
 32 |         self.word2idx = self.token2idx_init.copy()    # TODO 原来是没有{'PAD': PAD_IDX, ...}的？
 33 |         self.idx2word = self.idx2token_init.copy()      # TODO 字典一定要用copy！！！否则大家都一起跟着改变
 34 |         self.word2count = {}
 35 |         self.word_vocab_size = 4
 36 |         self.word_trimmed = False       # 是否已过滤低频word
 37 |         self.word_stopwords = None      # 低频停用词
 38 |         
 39 |         self.word_embed_dim = 0
 40 |         self.word2vector = {}
 41 |         self.word_idx2vector = {}
 42 |         self.word_embed_matrix = None     # Embedding Layer Weights Matrix
 43 |         
 44 |         # Char Level
 45 |         # TODO char中也会出现UNK,SOS,EOS，也要进行PAD(padding为0，这就要求char2idx[0]就得是PAD)，因此char也要处理这4种TOKEN
 46 |         self.char2idx = self.token2idx_init.copy()
 47 |         self.idx2char = self.idx2token_init.copy()
 48 |         self.char2count = {}
 49 |         self.char_vocab_size = 4
 50 |         self.char_trimmed = False       # 是否已过滤低频char
 51 |         self.char_stopwords = None
 52 |         
 53 |         self.char_embed_dim = 0
 54 |         self.char2vector = {}
 55 |         self.char_idx2vector = {}
 56 |         self.char_embed_matrix = None
 57 |         
 58 |         
 59 |     # 1. 创建词汇表
 60 |     # 1.1 挨个添加token：直接添加token，通过sentence添加token，通过document添加token
 61 |     def add_token(self, token, level='word'):
 62 |         """添加word或char，一个一个添加"""
 63 |         assert level in ['word', 'char']
 64 |         token = token.strip()
 65 |         if level == 'word':
 66 |             if token not in self.word2idx:
 67 |                 self.word2idx[token] = self.word_vocab_size
 68 |                 self.idx2word[self.word_vocab_size] = token
 69 |                 self.word_vocab_size += 1
 70 |                 self.word2count[token] = 1
 71 |             else:
 72 |                 self.word2count[token] += 1
 73 |         else:
 74 |             if token not in self.char2idx:
 75 |                 self.char2idx[token] = self.char_vocab_size
 76 |                 self.idx2char[self.char_vocab_size] = token
 77 |                 self.char_vocab_size += 1
 78 |                 self.char2count[token] = 1
 79 |             else:
 80 |                 self.char2count[token] += 1
 81 |                 
 82 |                 
 83 |     def add_sentence(self, sentence, level='word', sep=' '):
 84 |         """按sentence添加word或char或both, sentence格式：sep分隔的分词字符串"""
 85 |         assert level in ['word', 'char', 'both']
 86 |         sentence = str(sentence)
 87 |         if level == 'word':
 88 |             for word in sentence.strip().split(sep):
 89 |                 self.add_token(word, level='word')
 90 |         elif level == 'char':
 91 |             for char in list(sentence.replace(sep, '')):    # 删除分隔符后，变成字符列表
 92 |                 self.add_token(char, level='char')
 93 |         else:
 94 |             for word in sentence.strip().split(sep):
 95 |                 self.add_token(word, level='word')
 96 |             for char in list(sentence.replace(sep, '')):
 97 |                 self.add_token(char, level='char')
 98 |         
 99 |         
100 |     def add_document(self, document, level='word', sep=' '):
101 |         """按document添加word或char或both"""
102 |         assert level in ['word', 'char', 'both']
103 |         for sentence in document:
104 |             self.add_sentence(sentence, level=level, sep=sep)
105 |             
106 |     
107 |     # 1.2 一次性添加所有token
108 |     def add_all(self, corpus, level='word', sep=' ', min_count=None):
109 |         """
110 |         词汇表 Vocabulary：支持 char-level 和 word-level，以及两者的汇总
111 |         统计 corpus 中 char/word 频率并倒序排序获得 idx，构建词汇字典：<char/word, idx>
112 |         注意：
113 |         其实也可不排序，直接随便赋给每个 char/word 一个 idx，只要保证唯一且固定即可
114 |         比如按加入 Vocabulary 顺序依次赋值为1,2,3,...，0另有所用，比如当作 <PAD>、空格或 <UNK> 的 idx
115 |         TODO idx=0 给谁？？怎么给？？ 也有把PAD和UNK赋值给词汇表里最后2个idx的
116 |         """
117 |         assert level in ['word', 'char', 'both']
118 |         token2count = {}
119 |         for line in corpus:
120 |             tokens = line.strip().split(sep) if level == 'word' else list(line.strip())  # word时默认每一行是分词后分隔好的结果
121 |             for token in tokens:
122 |                 token2count[token] = token2count.get(token, 0) + 1
123 |         if min_count:       # 过滤低频字/词
124 |             token2count = {word: num for (word, num) in token2count.items() if num >= min_count}
125 |         
126 |         token_sorted = sorted(token2count, key=token2count.get, reverse=True)       # 按token频率倒序排列
127 |         token_list = token_sorted if ' ' in token_sorted else [' '] + token_sorted  # TODO 空格是否加入vocab？ 如何确定idx=0对应的term???
128 |         
129 |         if level == 'word':
130 |             self.word2count = token2count
131 |             self.word2idx = {word: idx + 4 for (idx, word) in enumerate(token_list)}.update(self.token2idx_init)
132 |             self.idx2word = {idx: word for (word, idx) in self.word2idx.items()}
133 |             self.word_vocab_size = len(self.word2idx)
134 |         else:
135 |             self.char2count = token2count
136 |             self.char2idx = {char: idx + 4 for (idx, char) in enumerate(token_list)}.update(self.idx2token_init)
137 |             self.idx2char = {idx: char for (char, idx) in self.char2idx.items()}
138 |             self.char_vocab_size = len(self.char2idx)
139 |             
140 |     
141 |     # 2. 低频过滤
142 |     def trim(self, min_count, level='word'):
143 |         """过滤低频word或char"""
144 |         assert level in ['word', 'char']
145 |         if (level == 'word' and self.word_trimmed) or (level == 'char' and self.char_trimmed):
146 |             return
147 |         if level == 'word':
148 |             self.word_stopwords = [word for word, cnt in self.word2count.items() if cnt < min_count]
149 |             kept = [word for word, cnt in self.word2count.items() if cnt >= min_count]
150 |             print(f'kept words: {len(kept)} / {len(self.word2idx)} = {len(kept) / len(self.word2idx): .4f}')
151 |             self.word2idx = self.token2idx_init.copy()
152 |             self.idx2word = self.idx2token_init.copy()
153 |             self.word2count = {}
154 |             self.word_vocab_size = 4
155 |             for word in kept:
156 |                 self.add_token(word, level='word')
157 |             self.word_trimmed = True
158 |             
159 |         else:
160 |             self.char_stopwords = [char for char, cnt in self.char2count.items() if cnt < min_count]
161 |             kept = [char for char, cnt in self.char2count.items() if cnt >= min_count]
162 |             print(f'kept chars: {len(kept)} / {len(self.char2idx)} = {len(kept) / len(self.char2idx): .4f}')
163 |             self.char2idx = self.token2idx_init.copy()
164 |             self.idx2char = self.idx2token_init.copy()
165 |             self.char2count = {}
166 |             self.char_vocab_size = 4
167 |             for char in kept:
168 |                 self.add_token(char, level='char')
169 |             self.char_trimmed = True
170 |     
171 |     
172 |     # 3. 创建xxx2vector: (word/char, idx) --> vector
173 |     def init_vectors(self, embedding=None, level='word'):
174 |         """
175 |         基于训练好的word/char embedding，初始化word2vector或char2vector及其对应的idx2vector
176 |         其中embedding既可以是公开训练好的，也可以是自己训练好的，前者过于巨大，
177 |         后者其实理论上就是word2vector，但实际中可能会因为语料不同步等原因，导致两者的word并不完全相同。
178 |         另外后者可以是gensim.models.Word2Vec模型，也可以是普通字典
179 |         不管前者后者，我们只选择感兴趣的word(word2idx中的word)
180 |         TODO 优化点：增加备用 word embedding  如同get_word2vector_idx2vector一样！
181 |         """
182 |         assert level in ['word', 'char']
183 |         if isinstance(embedding, Word2Vec):
184 |             embedding = {token: embedding[token] for token in embedding.wv.vocab.keys()}
185 |             
186 |         embed_dim = len(list(embedding.values())[0])
187 |         if level == 'word':
188 |             self.word_embed_dim = embed_dim
189 |             for word, idx in self.word2idx.items():
190 |                 if word in embedding:
191 |                     vector = embedding.get(word)
192 |                 else:
193 |                     vectors = [embedding.get(x, np.random.uniform(-0.01, 0.01, (embed_dim))) for x in list(word)]
194 |                     vector = reduce(lambda x, y: x + y, vectors) / len(vectors)     # OOV时使用对应的若干字符向量的Average
195 |                 self.word2vector[word] = vector
196 |                 self.word_idx2vector[idx] = vector
197 |         else:
198 |             self.char_embed_dim = embed_dim
199 |             for char, idx in self.char2idx.items():
200 |                 vector = embedding.get(char, np.random.uniform(-0.01, 0.01, (embed_dim)))
201 |                 self.char2vector[char] = vector
202 |                 self.char_idx2vector[idx] = vector
203 | 
204 | 
205 |     # 4. 生成Embedding Layer的初始化权重
206 |     def init_embed_matrix(self, level='word'):
207 |         """基于wordidx2vector或charidx2vector生成用于Embedding Layer的weights matrix
208 |         TODO 总觉得似乎哪里不对？？？<PAD> <UNK>之类的如何处理？
209 |         """
210 |         assert level in ['word', 'char']
211 |         if level == 'word':
212 |             all_embs = np.stack(self.word_idx2vector.values())
213 |             self.word_embed_matrix = np.random.normal(all_embs.mean(), all_embs.std(), size=(self.word_vocab_size, self.word_embed_dim))
214 |             for idx, vector in tqdm(self.word_idx2vector.items()):
215 |                 self.word_embed_matrix[idx] = vector
216 |         else:
217 |             all_embs = np.stack(self.char_idx2vector.values())
218 |             self.char_embed_matrix = np.random.normal(all_embs.mean(), all_embs.std(), size=(self.char_vocab_size, self.char_embed_dim))
219 |             for idx, vector in tqdm(self.char_idx2vector.items()):
220 |                 self.char_embed_matrix[idx] = vector
221 | 
222 | 
223 | # 一些与Vocabulary相关的工具
224 | # TODO classmethod ???
225 | def seq_to_idxs(seq, token2idx, token_maxlen, unk_idx=UNK_IDX, pad_idx=PAD_IDX, 
226 |                 padding='post', truncating='post', onlyin=None, excluding=[]):
227 |     """
228 |     向量化编码：基于词汇表token2idx，把seq转化为idx向量，词汇表中不存在的token使用unk_idx进行编码，适用于特征编码和Label编码
229 |     输入seq是分词/分字列表，如：['我', '们', '爱', '学', '习'] 或 ['我们', '爱', '学习']
230 |     函数功能 = 向量化 + keras.sequence.pad_sequence
231 |     ARGS
232 |         padding & truncating: post=从后面补零/截断  pre=从前面
233 |         onlyin: 只关注这里面的token
234 |         excluding: 不关注这里面的token
235 |     NOTE
236 |         当onlyin和excluding都存在时同时满足条件，即token in onlyin and token not in excluding
237 |     """
238 |     if onlyin:
239 |         seq = [token for token in seq if token in onlyin]
240 |     seq = [token for token in seq if token not in excluding + ['', ' ']]        # TODO ['', ' ']???
241 |     
242 |     seq_vec = [token2idx.get(token, unk_idx) for token in seq]                  # OOV的token标注为专门的unk_idx
243 |     seq_vec = seq_vec[: token_maxlen] if truncating == 'post' else seq_vec[-token_maxlen:]      # 截断：前或后
244 |     paddings = [pad_idx] * (token_maxlen - len(seq_vec))         	               # 小于向量长度的部分用pad_idx来padding
245 |     return seq_vec + paddings if padding == 'post' else paddings + seq_vec      # PAD: 前或后
246 | 
247 | 
248 | 
249 | def example():
250 |     """创建word和char的词汇表，并保存本地"""
251 |     import pandas as pd
252 |     import pickle
253 |     from Config import Config
254 |     config = Config()
255 |     
256 |     data = pd.read_csv(config.data_file, sep='\t', encoding='utf8')
257 |     sentences_word, sentences_char = data['question_wordseg'], data['question_charseg']
258 |     
259 |     # 创建词汇表
260 |     # TODO 仅仅使用当前任务的全量数据么？要不要加一些其他更全的语料库？应用时，遇到OOV的词汇咋整？
261 |     vocab = Vocabulary()
262 |     vocab.add_document(sentences_word, level='word')
263 |     vocab.add_document(sentences_char, level='char')        # word与char-level使用的数据不一样(停用词不一样)，所以分别单独创建
264 |     vocab.trim(min_count=config.MIN_COUNT, level='word')    # min_count与训练Embedding时保持一致
265 |     vocab.trim(min_count=config.MIN_COUNT, level='char')
266 |     # kept words: 5484 / 11692 =  0.4690
267 |     # kept chars: 1594 / 2052 =  0.7768
268 |     
269 |     # 生成xxx2vector和Embedding Layer初始化权重
270 |     model_word2vec = Word2Vec.load(config.model_word2vec_file)
271 |     model_char2vec = Word2Vec.load(config.model_char2vec_file)
272 |     vocab.init_vectors(model_word2vec, level='word')
273 |     vocab.init_vectors(model_char2vec, level='char')
274 |     vocab.init_embed_matrix(level='word')
275 |     vocab.init_embed_matrix(level='char')
276 |     
277 |     # 保存本地
278 |     pickle.dump(vocab, open(config.vocab_file, 'wb'))
279 |     
280 |     
281 | 
282 | if __name__ == '__main__':
283 |     
284 |     example()
285 |     


--------------------------------------------------------------------------------
/image/1573355016134.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuyaox/text_classification/829573bfbbee8076a822ae65f9d5e6d49e15b375/image/1573355016134.png


--------------------------------------------------------------------------------
/image/1573364046216.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuyaox/text_classification/829573bfbbee8076a822ae65f9d5e6d49e15b375/image/1573364046216.png


--------------------------------------------------------------------------------
/image/1573366328001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuyaox/text_classification/829573bfbbee8076a822ae65f9d5e6d49e15b375/image/1573366328001.png


--------------------------------------------------------------------------------
/image/1573368628525.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liuyaox/text_classification/829573bfbbee8076a822ae65f9d5e6d49e15b375/image/1573368628525.png


--------------------------------------------------------------------------------
/model/BasicModel.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created:    2019-08-11 19:40:55
  4 | Author:     liuyao8
  5 | Descritipn: a. BasicModel: 模型基类，用于生成BasicStatModel和BasicDeepModel，目前仅提供功能：模型评估Metrics计算
  6 |             b. BasicStatModel: 传统模型基类，提供通用功能：
  7 |             c. BasicDeepModel: 深度模型基类，提供通用功能：
  8 | """
  9 | 
 10 | import os
 11 | from functools import reduce
 12 | from collections import Counter
 13 | import numpy as np
 14 | import pickle
 15 | import matplotlib
 16 | matplotlib.use("agg")
 17 | import matplotlib.pyplot as plt
 18 | from scipy.stats import entropy
 19 | from sklearn.metrics.pairwise import cosine_similarity
 20 | from sklearn.metrics import roc_curve, auc
 21 | from sklearn.model_selection import KFold
 22 | 
 23 | from keras.layers import Input, Masking, Embedding
 24 | from keras.models import load_model
 25 | from keras.utils import multi_gpu_model, plot_model
 26 | from keras.optimizers import Adam, SGD
 27 | from keras.callbacks import LearningRateScheduler, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
 28 | 
 29 | 
 30 | class BasicModel(object):
 31 |     
 32 |     def __init__(self):
 33 |         # TODO 其实可以放一些通用的变量如label数量等
 34 |         pass
 35 |     
 36 |     def build(self):
 37 |         pass
 38 | 
 39 | 
 40 |     # Metrics: Precision, Recall, F1-score, Distribution Similarity, ROC curve, ROC area, etc.
 41 |     # TODO 添加method
 42 |     def multilabel_precision_recall(self, ys_pred, ys_true):
 43 |         """
 44 |         多标签分类标准Metrics: Precision, Recall, F1-score
 45 |         ARGS
 46 |             ys_pred: 预测标签，iterable of iterable，形如：[['a', 'b', 'c'], ['a', 'd'], ['b'], ...]
 47 |             ys_true: 真实标签，格式同y_pred
 48 |         RETURN
 49 |             precision: 总命中标签数/总预测标签数
 50 |             recall:    总命中标签数/总真实标签数
 51 |             f1score:   (precision * recall) / (precision + recall)
 52 |         """
 53 |         assert len(ys_pred) == len(ys_true)
 54 |         ys_pred = self.label_binarizer.inverse_transform(ys_pred > 0.5)
 55 |         ys_true = self.label_binarizer.inverse_transform(ys_true)
 56 |         
 57 |         right_num, all_pred_num, all_true_num = 0, 0, 0     # 总命中标签数  总预测标注数  总真实标签数
 58 |         for y_pred, y_true in zip(ys_pred, ys_true):
 59 |             y_pred_set, y_true_set = set(y_pred), set(y_true)
 60 |             all_pred_num += len(y_pred_set)
 61 |             all_true_num += len(y_true_set)
 62 |             right_num += len(y_pred_set & y_true_set)       # 命中标签数：交集大小
 63 |         
 64 |         precision = float(right_num) / all_pred_num
 65 |         recall = float(right_num) / all_true_num
 66 |         f1score = (precision * recall) / (precision + recall)
 67 |         return round(precision, 4), round(recall, 4), round(f1score, 4)
 68 |         
 69 |     
 70 |     def roc_auc(self, ys_pred, ys_true, n_label):
 71 |         """
 72 |         ROC-AUC curve  ????
 73 |         ARGS
 74 |             ys_pred: 预测标签（的概率？）,iterable of iterable，原始预测结果，shape=(n_sample, n_label)
 75 |             ys_true: 真实标签？ shape同上
 76 |             n_label: 标签个数
 77 |         """
 78 |         # 为每个label计算ROC curve和ROC area
 79 |         fpr, tpr = {}, {}
 80 |         roc_auc = {}
 81 |         for i in range(n_label):
 82 |             fpr[i], tpr[i], _ = roc_curve(ys_true[:, i], ys_pred[:, i])
 83 |             roc_auc[i] = auc(fpr[i], tpr[i])
 84 |         # 计算micro-average ROC curve and ROC area
 85 |         fpr['micro'], tpr['micro'], _ = roc_curve(ys_true.ravel(), ys_pred.ravel())
 86 |         roc_auc['micro'] = auc(fpr['micro'], tpr['micro'])
 87 |         
 88 |         
 89 |     def multilabel_distribution_similarity(self, ys_pred, ys_true):
 90 |         """
 91 |         多标签分类特定Metrics: 各标签分布余弦相似度和KL散度
 92 |         ARGS同上
 93 |         RETURN
 94 |             similarity: 基于各标签数据分布，预测结果与真实结果的余弦相似度 越小越好
 95 |             relative_entropy: KL散度/相对熵 越小越好
 96 |         """
 97 |         assert len(ys_pred) == len(ys_true)
 98 |         ys_pred = self.label_binarizer.inverse_transform(ys_pred > 0.5)
 99 |         ys_true = self.label_binarizer.inverse_transform(ys_true)
100 |         
101 |         ys_pred = Counter(reduce(lambda x, y: x + y, ys_pred))
102 |         ys_true = Counter(reduce(lambda x, y: x + y, ys_true))
103 |         keys = list(set(list(ys_pred.keys()) + list(ys_true.keys())))
104 |         vec_pred = [ys_pred[k] for k in keys]
105 |         vec_true = [ys_true[k] for k in keys]
106 |         
107 |         sim_cosine = cosine_similarity([vec_pred], [vec_true])[0, 0]    # 余弦相似度
108 |         sim_entropy = entropy(vec_pred, vec_true)                       # KL散度/相对熵
109 |         sim_eucliean = sum([(x - y) ** 2 for (x, y) in zip(vec_pred, vec_true)]) ** 0.5
110 |         sim_manhattan = sum([abs(x - y) for (x, y) in zip(vec_pred, vec_true)])
111 |         sims  = (round(sim_cosine, 4), round(sim_entropy, 4), round(sim_eucliean, 4), round(sim_manhattan, 4))
112 |         return (vec_pred, vec_true), sims
113 | 
114 | 
115 | 
116 | class BasicStatModel(BasicModel):
117 |     
118 |     def __init__(self, n_fold=5, name='BasicStatModel', config=None):
119 |         pass
120 |     
121 |     
122 |     
123 | class BasicDeepModel(BasicModel):
124 |     
125 |     def __init__(self, config=None, name='BasicDeepModel', model_summary=True, model_plot=False, 
126 |                  token_level=None, structured=None, bert_flag=None):
127 |         # 基本信息
128 |         if token_level:
129 |             config.token_level = token_level
130 |         if structured:
131 |             config.structured = structured
132 |         if bert_flag:
133 |             config.bert_flag = bert_flag
134 |         self.config = config
135 |         stru_postfix = '_stru-' + config.structured if config.structured != 'none' else ''
136 |         bert_postfix = '_bert' if config.bert_flag else ''
137 |         self.name = name + '_level-' + config.token_level + stru_postfix + bert_postfix
138 |         
139 |         
140 |         # 任务类型决定了类别数量、激活函数和损失函数
141 |         if config.task == 'binary':                     # 单标签二分类
142 |             self.n_classes =  1
143 |             self.activation = 'sigmoid'
144 |             self.loss = 'binary_crossentropy'
145 |             self.metrics = ['accuracy']
146 |         elif config.task == 'categorical':              # 单标签多分类
147 |             self.n_classes = config.N_CLASSES
148 |             self.activation = 'softmax'
149 |             self.loss = 'categorical_crossentropy'
150 |             self.metrics = ['accuracy']                 # TODO ???
151 |         elif config.task == 'multilabel':               # 多标签二分类(多标签多分类需转化为多标签二分类)
152 |             self.n_classes = config.N_CLASSES
153 |             self.activation = 'sigmoid'
154 |             self.loss = 'binary_crossentropy'
155 |             self.metrics = ['accuracy']
156 |         
157 |         
158 |         # TODO 能不能删除这些self.xxx，而直接使用self.config.xxx来代替！？
159 |         # word相关
160 |         self.word_maxlen = config.WORD_MAXLEN
161 |         self.word_vocab_size = config.WORD_VOCAB_SIZE
162 |         self.word_embed_dim = config.WORD_EMBED_DIM
163 |         self.word_embed_matrix = config.word_embed_matrix
164 |         
165 |         # char相关
166 |         self.char_maxlen = config.CHAR_MAXLEN
167 |         self.char_vocab_size = config.CHAR_VOCAB_SIZE
168 |         self.char_embed_dim = config.CHAR_EMBED_DIM
169 |         self.char_embed_matrix = config.char_embed_matrix
170 |         
171 |         # KFold相关
172 |         self.n_folds = config.n_folds
173 |         self.kfold = KFold(n_splits=config.n_folds, shuffle=True, random_state=10)
174 |     
175 |         # Model相关
176 |         self.masking_value = config.PAD_IDX   # TODO mask PAD 突然想到：与PyTorch中的packed_padding和padded_packing相同功能？？？
177 |         self.create_model(model_summary, model_plot)
178 |         
179 |         # Train相关
180 |         self.n_epochs = 20
181 |         self.batch_size = config.BATCH_SIZE
182 |         self.init_lr = 0.001
183 |         
184 |         # Callback
185 |         self.lr_schedule = None
186 |         self.early_stopping = None
187 |         self.snap_epochs = 10   # TODO ?
188 |         self.snapshot = None
189 |         self.checkpoint = None
190 |         
191 |         # Predict相关
192 |         self.label_binarizer = config.label_binarizer
193 |         
194 |         
195 |     def create_model(self, model_summary=True, model_plot=False):
196 |         """调用当前类的build_layers生成通用layers，调用子类的build_model生成model"""
197 |         self.build_layers()
198 |         self.build_model()
199 |         if self.config.n_gpus > 1:
200 |             self.model = multi_gpu_model(self.model, gpus=self.config.n_gpus)
201 |         if model_summary:
202 |             self.model.summary()
203 |         if model_plot:
204 |             plot_model(self.model, to_file=self.name+'.png', show_shapes=True)
205 |         
206 |         
207 |     def build_layers(self):
208 |         """创建DeepModel通用的Layers: Input, Masking, Embedding"""
209 |         if self.config.token_level == 'word':
210 |             self.word_input = Input(shape=(self.word_maxlen, ), dtype='int32', name='word')
211 |             self.word_masking = Masking(mask_value=self.masking_value)
212 |             self.word_embedding = Embedding(self.word_vocab_size, self.word_embed_dim, weights=[self.word_embed_matrix], name='word_embedding')
213 |         elif self.config.token_level == 'char':
214 |             self.char_input = Input(shape=(self.char_maxlen, ), dtype='int32', name='char')
215 |             self.char_masking = Masking(mask_value=self.masking_value)
216 |             self.char_embedding = Embedding(self.char_vocab_size, self.char_embed_dim, weights=[self.char_embed_matrix], name='char_embedding')
217 |         else:
218 |             self.word_input = Input(shape=(self.word_maxlen, ), dtype='int32', name='word')
219 |             self.char_input = Input(shape=(self.char_maxlen, ), dtype='int32', name='char')
220 |             self.word_masking = Masking(mask_value=self.masking_value)
221 |             self.char_masking = Masking(mask_value=self.masking_value)
222 |             self.word_embedding = Embedding(self.word_vocab_size, self.word_embed_dim, weights=[self.word_embed_matrix], name='word_embedding')
223 |             self.char_embedding = Embedding(self.char_vocab_size, self.char_embed_dim, weights=[self.char_embed_matrix], name='char_embedding')
224 |         
225 |         # 结构化特征
226 |         word_structured = Input(shape=(self.config.word_svd_n_componets, ), dtype='float32', name='word_structured')
227 |         char_structured = Input(shape=(self.config.char_svd_n_componets, ), dtype='float32', name='char_structured')
228 |         if self.config.structured == 'word':
229 |             # TODO 只支持LSA特征，暂不支持TFIDF特征，因为维度太大
230 |             self.structured_input = [word_structured]   # 放在[]中是方便添加到别的列表中，比如Input列表和Tensor列表
231 |         elif self.config.structured == 'char':
232 |             self.structured_input = [char_structured]
233 |         elif self.config.structured == 'both':
234 |             self.structured_input = [word_structured, char_structured]
235 |         
236 |         # Bert编码向量
237 |         if self.config.bert_flag:
238 |             self.word_input = Input(shape=(self.config.bert_maxlen, self.config.bert_dim, ), dtype='float32', name='word_bert')  # 输入是2维！
239 |             self.word_masking = Masking(mask_value=self.masking_value)
240 |             self.word_embedding = None
241 |         
242 |         
243 |     def lr_decay_poly(self, epoch, alpha=0.5, beta=12):
244 |         """训练learning rate衰减schedular"""
245 |         # TODO 哪种衰减？？？
246 |         init_lr = self.init_lr
247 |         lr = init_lr * alpha * ((1 + epoch) // beta)
248 |         print(f'Epoch: {1 + epoch}, lr: {lr}, wd: {self.wd}')
249 |         return lr
250 |         
251 |     
252 |     def plot_history(self, history, i_fold=None):
253 |         """绘制训练loss和accuracy，并保存图片"""
254 |         if not isinstance(history, dict):
255 |             history = history.history
256 |         epochs = np.arange(0, len(history['loss']))
257 |         plt.style.use('ggplot')
258 |         plt.figure()
259 |         plt.plot(epochs, history['loss'], label='train_loss')
260 |         plt.plot(epochs, history['val_loss'], label='val_loss')
261 |         plt.plot(epochs, history['acc'], label='train_acc')
262 |         plt.plot(epochs, history['val_acc'], label='val_acc')
263 |         plt.title(self.name + ' (mode=' + str(self.mode) + ')')
264 |         plt.xlabel('Epoch #')
265 |         plt.ylabel('Loss & Accuracy')
266 |         plt.legend()
267 |         os.makedirs('history', exist_ok=True)
268 |         postfix = '-fold' + str(i_fold) if i_fold else ''
269 |         plt.savefig('history/' + self.name + '-mode' + str(self.mode) + postfix + '.png')
270 |         plt.close()
271 |     
272 |     
273 |     def plot_histories(self, history1, history2, i_fold=None):
274 |         """绘制两阶段训练的loss和accuracy，并保存图片"""
275 |         history1, history2 = history1.history, history2.history
276 |         history = {}
277 |         history['loss'] = history1['loss'] + history2['loss']
278 |         history['val_loss'] = history1['val_loss'] + history2['val_loss']
279 |         history['acc'] = history1['acc'] + history2['acc']
280 |         history['val_acc'] = history1['val_acc'] + history2['val_acc']
281 |         self.plot_history(history, i_fold)
282 |     
283 |     
284 |     def embedding_trainable(self, trainable=True):
285 |         """是否解冻Embedding Layer"""
286 |         if self.config.token_level == 'both':
287 |             self.model.get_layer('char_embedding').trainable = trainable
288 |             if not self.config.bert_flag:
289 |                 self.model.get_layer('word_embedding').trainable = trainable
290 |         elif self.config.token_level == 'word':
291 |             if not self.config.bert_flag:
292 |                 self.model.get_layer('word_embedding').trainable = trainable
293 |         elif self.config.token_level == 'char':
294 |             self.model.get_layer('char_embedding').trainable = trainable
295 |         else:
296 |             exit('Wrong Token Level')
297 | 
298 | 
299 |     def _evaluate(self, x_test, y_test):
300 |         """模型评估"""
301 |         _, test_acc = self.model.evaluate(x_test, y_test)
302 |         test_pred = self.model.predict(x_test, verbose=1)
303 |         scores = self.multilabel_precision_recall(test_pred, y_test)
304 |         vectors, sims = self.multilabel_distribution_similarity(test_pred, y_test)
305 |         print('------------------ Final: Test Metrics: ------------------')
306 |         print('Test Accuracy: ' + str(round(test_acc, 4)))
307 |         print('Precision: ' + str(scores[0]) + '  Recall: ' + str(scores[1]) + '  F1score: ' + str(scores[2]))
308 |         print('Cosine: ' + str(sims[0]) + '  Entropy: ' + str(sims[1]) + '  Euclidean: ' + str(round(sims[2], 1)) + '  Manhattan: ' + str(sims[3]))
309 |         return test_acc, scores, sims, vectors, test_pred
310 | 
311 | 
312 |     def train_evaluate(self, x_train, y_train, x_test, y_test, lr=1e-3, epochs=None):
313 |         """
314 |         模型训练和评估
315 |         x_train/x_test是字典(key=Input创建时的name, value=Input对应的数据)，能够支持多输入
316 |         """
317 |         # 模型训练
318 |         print('【' + self.name + '】')
319 |         if self.config.bert_flag:           # 以Bert编码向量作为输入的模型
320 |             epochs = epochs if epochs else self.n_epochs
321 |             print('---------------------------------------------------------------------')
322 |             optimizer = Adam(lr=lr)
323 |             self.model.compile(loss=self.loss, optimizer=optimizer, metrics=self.metrics)
324 |             history = self.model.fit(x_train, y_train, 
325 |                                      batch_size=self.batch_size*self.config.n_gpus,
326 |                                      epochs=epochs,
327 |                                      validation_split=0.3)
328 |         else:
329 |             self.mode = 3
330 |             epochs = epochs if epochs else (2, self.n_epochs)
331 |             print('-------------------Step1: 前期冻结Embedding层，编译和训练模型-------------------')
332 |             self.embedding_trainable(False)
333 |             print('Embedding Trainable: ' + str(self.model.get_layer('word_embedding').trainable))
334 |             optimizer = Adam(lr=lr, clipvalue=2.4)       # clipvalue不应该写死，或者使用默认值！下同
335 |             self.model.compile(loss=self.loss, optimizer=optimizer, metrics=self.metrics)
336 |             history1 = self.model.fit(x_train, y_train, 
337 |                                       batch_size=self.batch_size*self.config.n_gpus,
338 |                                       epochs=epochs[0],
339 |                                       validation_split=0.3)
340 |             print('-------------Step2: 训练完参数后，解冻Embedding层，再次编译和训练模型-------------')
341 |             self.embedding_trainable(True)
342 |             print('Embedding Trainable: ' + str(self.model.get_layer('word_embedding').trainable))
343 |             optimizer = Adam(lr=lr, clipvalue=1.5)
344 |             self.model.compile(loss=self.loss, optimizer=optimizer, metrics=self.metrics)
345 |             #callbacks = [self.lr_schedule, self.checkpoint, ]       # TODO self.checkpoint???
346 |             history2 = self.model.fit(x_train, y_train, 
347 |                                       batch_size=self.batch_size*self.config.n_gpus,
348 |                                       epochs=epochs[1],
349 |                                       validation_split=0.3,
350 |                                       callbacks=None)
351 |             self.plot_history(history2)
352 |             history = (history1, history2)
353 |             
354 |         # 模型评估
355 |         test_acc, scores, sims, vectors, test_pred = self._evaluate(x_test, y_test)
356 |         pickle.dump(test_pred, open('./result/' + self.name + '_test_pred.pkl', 'wb'))
357 |         return test_acc, scores, sims, vectors, history
358 |         
359 |     
360 |     def model_compile_fit(self, data_fold, optimizer='adam', callbacks=None, epochs=None, model_file=None):
361 |         """模型编译和训练Helper Function，支持各种配置"""
362 |         x_train, y_train, x_val, y_val = data_fold
363 |         epochs = epochs if epochs else self.n_epochs
364 |         self.model.compile(loss=self.loss, optimizer=optimizer, metrics=self.metrics)   # TODO 多标签时accuracy含义是什么？
365 |         history = self.model.fit(x_train, y_train, 
366 |                                  batch_size=self.batch_size*self.config.n_gpus,
367 |                                  epochs=epochs,
368 |                                  validation_data=(x_val, y_val),
369 |                                  callbacks=callbacks)
370 |         if model_file:
371 |             self.model.save_weights(model_file)
372 |         return history
373 | 
374 | 
375 |     def train_evaluate_cv(self, x_train, y_train, x_test, mode=3):
376 |         """
377 |         使用KFold方式训练模型，应用于x_train和x_test
378 |         x_train/x_test是字典(key=Input创建时的name, value=Input对应的数据)，表示多输入
379 |         model: 训练模式，包括各种Finetuning策略等
380 |         """
381 |         self.mode = mode
382 |         checkpoint_path = 'checkpoint-mode' + str(mode) + '/' + self.name + '/'
383 |         os.makedirs(checkpoint_path, exist_ok=True)
384 |         # 先保存训练前的原始模型(参数和状态处于初始状态)，以便于后续KFold时每次加载的都是原始模型(line359)，保证起点一致，各Fold之间互不影响
385 |         init_model_file = checkpoint_path + 'init_weight.h5'
386 |         self.model.save_weights(init_model_file)
387 |         
388 |         # KFold循环前准备
389 |         test_pred = np.zeros((len(x_test['word']), self.n_classes))     # K次预测结果的平均值(要对x_test预测K次)
390 |         train_pred = np.zeros((len(x_train['word']), self.n_classes))   # K次预测结果不重不漏地覆盖所有x_train
391 |         scores_pre, scores_rec, scores_f1, scores_sim = [], [], [], []
392 |         
393 |         for i_fold, (train_index, val_index) in enumerate(self.kfold.split(x_train['word'])):
394 |             self.model.load_weights(init_model_file)      # 每次KFold开始时加载的都是原始模型
395 |             
396 |             # 取数：X和Y
397 |             x_train_fold, x_val_fold = {}, {}
398 |             # TODO 改到__init__里，自动取舍各name！
399 |             for key in ['word', 'word_left', 'word_right', 'word_structured', 'char', 'char_left', 'char_right', 'char_structured']: # 对应model创建时Input的name
400 |                 x_train_fold[key] = x_train[key][train_index]
401 |                 x_val_fold[key] = x_train[key](val_index)
402 |             y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
403 |             data_fold = (x_train_fold, y_train_fold, x_val_fold, y_val_fold)
404 |             
405 |             
406 |             # 创建Callbacks: checkpoint, snapshot
407 |             model_prefix = checkpoint_path + '/' + str(i_fold)
408 |             os.makedirs(model_prefix, exist_ok=True)
409 |             model_file = model_prefix + '/k' + str(i_fold) + '_model.h5'
410 |             checkpoint = ModelCheckpoint(model_file, monitor='val_loss', verbose=1, save_best_only=True, mode='min')    # TODO min???
411 |             snapshot = self.snapshot.get_callbacks(model_save_place=model_prefix)
412 |             # TODO 创建callbacks不规范，有的在__init__中，有的在每次KFold内各mode前，有的在某mode内！最好统一规范一下！
413 |             
414 |             
415 |             # 模型编译和训练
416 |             # 支持6种模式
417 |             #     1 = 一直冻结，一次编译和训练
418 |             #   2,3 = 前期冻结，后期解冻，两次编译和训练
419 |             # 4,5,6 = 一直解冻，一次编译和训练
420 |             if mode == 1:
421 |                 # 一直冻结Embedding，使用snapshot方式训练模型
422 |                 self.embedding_trainable(False)
423 |                 optimizer = Adam(lr=1e-3, clipvalue=2.0)
424 |                 callbacks = [snapshot, ]
425 |                 history = self.model_compile_fit(data_fold, optimizer, callbacks, epochs=self.snap_epoch, model_file=None)
426 |                 
427 |             elif mode == 2:
428 |                 # 前期冻结Embedding层，模型编译和训练
429 |                 self.embedding_trainable(False)
430 |                 optimizer = Adam(lr=1e-3, clipvalue=2.0)
431 |                 history1 = self.model_compile_fit(data_fold, optimizer, epochs=6)
432 |                 # 训练好参数后，解冻Embedding层，再次编译，使用snapshot方式训练模型
433 |                 self.embedding_trainable(True)
434 |                 optimizer = 'adam'
435 |                 callbacks = [snapshot, ]
436 |                 history2 = self.model_compile_fit(data_fold, optimizer, callbacks, epochs=self.snap_epoch, model_file=None)
437 |                 
438 |             elif mode == 3:
439 |                 # 前期冻结Embedding层，模型编译和训练
440 |                 self.embedding_trainable(False)
441 |                 optimizer = Adam(lr=1e-3, clipvalue=2.4)
442 |                 history1 = self.model_compile_fit(data_fold, optimizer, epochs=2, model_file=None)
443 |                 # 训练好参数后，解冻Embedding层，再次编译，训练模型
444 |                 self.embedding_trainable(True)
445 |                 optimizer = Adam(lr=1e-3, clipvalue=1.5)
446 |                 callbacks = [self.lr_schedule, checkpoint, ]
447 |                 history2 = self.model_compile_fit(data_fold, optimizer, callbacks, epochs=10, model_file=None)
448 |                 self.plot_histories(history1, history2, i_fold)
449 |                 
450 |             elif mode == 4:
451 |                 # 一直解冻Embedding层，编译和训练模型
452 |                 if self.config.n_gpus == 1:             # TODO 为什么gpu=1时为True，=2时呢？为False??? 注意，默认为True
453 |                     self.embedding_trainable(True)
454 |                 optimizer = SGD(lr=self.init_lr, momentum=0.9, decay=1e-6)
455 |                 callbacks = [LearningRateScheduler(self.poly_decay), self.early_stopping, ]
456 |                 history = self.model_compile_fit(data_fold, optimizer, callbacks, model_file=model_file)
457 |                 self.plot_history(history, i_fold)
458 |                 
459 |             elif mode == 5:
460 |                 # 一直解冻Embedding层，编译和训练模型
461 |                 optimizer = Adam(lr=1e-3, clipnorm=1.0)
462 |                 callbacks = [self.lr_schedule, checkpoint, ]
463 |                 history = self.model_compile_fit(data_fold, optimizer, callbacks, epochs=20, model_file=None)
464 |                 self.plot_history(history, i_fold)
465 |                 
466 |             elif mode == 6:
467 |                 # 一直解冻Embedding层，编译，使用snapshot方式训练模型
468 |                 if self.config.n_gpus == 1:
469 |                     self.embedding_trainable(True)
470 |                 optimizer = Adam(lr=self.init_lr, decay=1e-6)
471 |                 callbacks = [snapshot, ]
472 |                 history = self.model_compile_fit(data_fold, optimizer, callbacks, model_file=None)
473 |                 self.plot_history(history, i_fold)
474 |                 
475 |             else:
476 |                 exit('Wrong mode! mode must be in (1, 2, 3, 4, 5, 6)')
477 |                 
478 |             
479 |             # 模型评估
480 |             h5models = [x for x in os.listdir(model_prefix) if '.h5' in x]
481 |             print(h5models)
482 |             test_pred_fold = np.zeros((len(x_test['word']), self.n_class))      # 预测test，按模型个数取平均值
483 |             val_pred_fold = np.zeros((len(x_val_fold['word']), self.n_class))   # 预测val，按模型个数取平均值
484 |             for h5file in h5models:
485 |                 self.model.load_weights(os.path.join(model_prefix, h5file))
486 |                 test_pred_fold += self.model.predict(x_test, verbose=1) / len(h5models)
487 |                 val_pred_fold += self.model.predict(x_val_fold, batch_size=64*self.config.n_gpus) / len(h5models)
488 |             
489 |             test_pred += test_pred_fold / self.n_folds      # 按KFold取平均值
490 |             train_pred[val_index] = val_pred_fold
491 |             
492 |             precision, recall, f1score = self.multilabel_precision_recall(val_pred_fold, y_val_fold)
493 |             vectors, sims = self.multilabel_distribution_similarity(val_pred_fold, y_val_fold)
494 |             print('KFold CV precision = ' + str(precision))
495 |             print('KFold CV recall = ' + str(recall))
496 |             print('KFold CV f1score = ' + str(f1score))
497 |             print('KFold CV similarity = ' + str(sims[0]))
498 |             scores_pre.append(precision)
499 |             scores_rec.append(recall)
500 |             scores_f1.append(f1score)
501 |             scores_sim.append(sims[0])
502 |             
503 |         
504 |         # KFold结束后，保存预测结果
505 |         print('Total precision = ' + str(np.mean(scores_pre)))
506 |         print('Total recall = ' + str(np.mean(scores_rec)))
507 |         print('Total f1score = ' + str(np.mean(scores_f1)))
508 |         print('Total similarity = ' + str(np.mean(scores_sim)))
509 |         result_prefix = './result/mode' + str(mode) + '_'
510 |         result_postfix = 'f1_' + str(np.mean(scores_f1)) + 'pre_' + str(np.mean(scores_pre)) + 'rec_' + str(np.mean(scores_rec)) + '.pkl'
511 |         #os.makedirs(result_prefix, exist_ok=True)
512 |         pickle.dump(train_pred, open(result_prefix + self.name + '_oof_' + result_postfix, 'wb'))
513 |         pickle.dump(test_pred, open(result_prefix + self.name + '_test_' + result_postfix, 'wb'))
514 |         
515 |         
516 |     def load_model(self, model_file):
517 |         """加载模型及权重"""
518 |         self.model = load_model(model_file)
519 |         
520 |         
521 |     def load_weights(self, weights_file):
522 |         """加载模型的权重"""
523 |         self.model.load_weights(weights_file)
524 |         
525 |         
526 |     


--------------------------------------------------------------------------------
/model/Bert/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 


--------------------------------------------------------------------------------
/model/Bert/args.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from enum import Enum
 3 | 
 4 | 
 5 | class PoolingStrategy(Enum):
 6 |     NONE = 0
 7 |     REDUCE_MAX = 1
 8 |     REDUCE_MEAN = 2
 9 |     REDUCE_MEAN_MAX = 3
10 |     FIRST_TOKEN = 4  # corresponds to [CLS] for single sequences
11 |     LAST_TOKEN = 5  # corresponds to [SEP] for single sequences
12 |     CLS_TOKEN = 4  # corresponds to the first token for single seq.
13 |     SEP_TOKEN = 5  # corresponds to the last token for single seq.
14 | 
15 |     def __str__(self):
16 |         return self.name
17 | 
18 |     @staticmethod
19 |     def from_string(s):
20 |         try:
21 |             return PoolingStrategy[s]
22 |         except KeyError:
23 |             raise ValueError()
24 | 
25 | xla = True
26 | # list of int. this model has 12 layers, By default this program works on the second last layer. The last layer is too
27 | # closed to the target functions,If you question about this argument and want to use the last hidden layer anyway, please
28 | # feel free to set layer_indexes=[-1], so we use the second last layer
29 | layer_indexes = [-2]
30 | 


--------------------------------------------------------------------------------
/model/Bert/extract_feature.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | import sys
  3 | import os
  4 | import tempfile
  5 | import random
  6 | import json
  7 | import logging
  8 | from termcolor import colored
  9 | import contextlib
 10 | from queue import Queue
 11 | from threading import Thread
 12 | import tensorflow as tf
 13 | 
 14 | sys.path.append(os.path.join(os.path.dirname(__file__), '../'))
 15 | from model.Bert import modeling
 16 | from model.Bert import tokenization
 17 | from model.Bert import args
 18 | from model.Bert.args import PoolingStrategy
 19 | 
 20 | 
 21 | def import_tf(device_id=-1, verbose=False):
 22 |     #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' if device_id < 0 else str(device_id)
 23 |     os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0' if verbose else '3'    
 24 |     tf.logging.set_verbosity(tf.logging.DEBUG if verbose else tf.logging.ERROR)
 25 |     return tf
 26 | 
 27 | tf = import_tf(0, True)
 28 | 
 29 | 
 30 | def set_logger(context, verbose=False):
 31 |     logger = logging.getLogger(context)
 32 |     logger.setLevel(logging.DEBUG if verbose else logging.INFO)
 33 |     formatter = logging.Formatter(
 34 |         '%(levelname)-.1s:' + context + ':[%(filename).5s:%(funcName).3s:%(lineno)3d]:%(message)s', datefmt=
 35 |         '%m-%d %H:%M:%S')
 36 |     console_handler = logging.StreamHandler()
 37 |     console_handler.setLevel(logging.DEBUG if verbose else logging.INFO)
 38 |     console_handler.setFormatter(formatter)
 39 |     logger.handlers = []
 40 |     logger.addHandler(console_handler)
 41 |     return logger
 42 |     
 43 | 
 44 | class InputExample(object):
 45 | 
 46 |     def __init__(self, unique_id, text_a, text_b):
 47 |         self.unique_id = unique_id
 48 |         self.text_a = text_a
 49 |         self.text_b = text_b
 50 | 
 51 | 
 52 | class InputFeatures(object):
 53 |     """A single set of features of data."""
 54 | 
 55 |     def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
 56 |         self.unique_id = unique_id
 57 |         self.tokens = tokens
 58 |         self.input_ids = input_ids
 59 |         self.input_mask = input_mask
 60 |         self.input_type_ids = input_type_ids
 61 | 
 62 |         
 63 | def optimize_graph(config_name,
 64 |                    ckpt_name,
 65 |                    logger=None, 
 66 |                    verbose=False, 
 67 |                    pooling_strategy=PoolingStrategy.REDUCE_MEAN, 
 68 |                    max_seq_len=40,
 69 |                    graph_tmpfile="./tmpxxx"):
 70 |     if not logger:
 71 |         logger = set_logger(colored('BERT_VEC', 'yellow'), verbose)
 72 |     try:
 73 |         # we don't need GPU for optimizing the graph
 74 |         tf = import_tf(device_id=0, verbose=verbose)
 75 |         from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference
 76 | 
 77 |         # allow_soft_placement:自动选择运行设备
 78 |         config = tf.ConfigProto(allow_soft_placement=True)
 79 |         config_fp = config_name
 80 |         init_checkpoint = ckpt_name
 81 |         logger.info('model config: %s' % config_fp)
 82 | 
 83 |         # 加载bert配置文件
 84 |         with tf.gfile.GFile(config_fp, 'r') as f:
 85 |             bert_config = modeling.BertConfig.from_dict(json.load(f))
 86 | 
 87 |         logger.info('build graph...')
 88 |         # input placeholders, not sure if they are friendly to XLA
 89 |         input_ids = tf.placeholder(tf.int32, (None, max_seq_len), 'input_ids')
 90 |         input_mask = tf.placeholder(tf.int32, (None, max_seq_len), 'input_mask')
 91 |         input_type_ids = tf.placeholder(tf.int32, (None, max_seq_len), 'input_type_ids')
 92 | 
 93 |         # xla加速
 94 |         jit_scope = tf.contrib.compiler.jit.experimental_jit_scope if args.xla else contextlib.suppress
 95 | 
 96 |         with jit_scope():
 97 |             input_tensors = [input_ids, input_mask, input_type_ids]
 98 | 
 99 |             model = modeling.BertModel(
100 |                 config=bert_config,
101 |                 is_training=False,
102 |                 input_ids=input_ids,
103 |                 input_mask=input_mask,
104 |                 token_type_ids=input_type_ids,
105 |                 use_one_hot_embeddings=False)
106 | 
107 |             # 获取所有要训练的变量
108 |             tvars = tf.trainable_variables()
109 | 
110 |             (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars,
111 |                                                                                                        init_checkpoint)
112 | 
113 |             tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
114 | 
115 |             minus_mask = lambda x, m: x - tf.expand_dims(1.0 - m, axis=-1) * 1e30
116 |             mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
117 |             masked_reduce_max = lambda x, m: tf.reduce_max(minus_mask(x, m), axis=1)
118 |             masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
119 |                     tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
120 | 
121 |             # 共享卷积核
122 |             with tf.variable_scope("pooling"):
123 |                 # 如果只有一层，就只取对应那一层的weight
124 |                 if len(args.layer_indexes) == 1:
125 |                     encoder_layer = model.all_encoder_layers[args.layer_indexes[0]]
126 |                 else:
127 |                     # 否则遍历需要取的层，把所有层的weight取出来并拼接起来shape:768*层数
128 |                     all_layers = [model.all_encoder_layers[l] for l in args.layer_indexes]
129 |                     encoder_layer = tf.concat(all_layers, -1)
130 | 
131 |                 input_mask = tf.cast(input_mask, tf.float32)
132 | 
133 |                 # 以下代码是句向量的生成方法，可以理解为做了一个卷积的操作，但是没有把结果相加, 卷积核是input_mask
134 |                 if pooling_strategy == PoolingStrategy.REDUCE_MEAN:
135 |                     pooled = masked_reduce_mean(encoder_layer, input_mask)
136 |                 elif pooling_strategy == PoolingStrategy.REDUCE_MAX:
137 |                     pooled = masked_reduce_max(encoder_layer, input_mask)
138 |                 elif pooling_strategy == PoolingStrategy.REDUCE_MEAN_MAX:
139 |                     pooled = tf.concat([masked_reduce_mean(encoder_layer, input_mask),
140 |                                         masked_reduce_max(encoder_layer, input_mask)], axis=1)
141 |                 elif pooling_strategy == PoolingStrategy.FIRST_TOKEN or \
142 |                         pooling_strategy == PoolingStrategy.CLS_TOKEN:
143 |                     pooled = tf.squeeze(encoder_layer[:, 0:1, :], axis=1)
144 |                 elif pooling_strategy == PoolingStrategy.LAST_TOKEN or \
145 |                         pooling_strategy == PoolingStrategy.SEP_TOKEN:
146 |                     seq_len = tf.cast(tf.reduce_sum(input_mask, axis=1), tf.int32)
147 |                     rng = tf.range(0, tf.shape(seq_len)[0])
148 |                     indexes = tf.stack([rng, seq_len - 1], 1)
149 |                     pooled = tf.gather_nd(encoder_layer, indexes)
150 |                 elif pooling_strategy == PoolingStrategy.NONE:
151 |                     pooled = mul_mask(encoder_layer, input_mask)
152 |                 else:
153 |                     raise NotImplementedError()
154 | 
155 |             pooled = tf.identity(pooled, 'final_encodes')
156 | 
157 |             output_tensors = [pooled]
158 |             tmp_g = tf.get_default_graph().as_graph_def()
159 | 
160 |         with tf.Session(config=config) as sess:
161 |             logger.info('load parameters from checkpoint...')
162 |             sess.run(tf.global_variables_initializer())
163 |             logger.info('freeze...') 
164 |             tmp_g = tf.graph_util.convert_variables_to_constants(sess, tmp_g, [n.name[:-2] for n in output_tensors])
165 |             dtypes = [n.dtype for n in input_tensors]
166 |             logger.info('optimize...')
167 |             tmp_g = optimize_for_inference(
168 |                 tmp_g,
169 |                 [n.name[:-2] for n in input_tensors],
170 |                 [n.name[:-2] for n in output_tensors],
171 |                 [dtype.as_datatype_enum for dtype in dtypes],
172 |                 False)
173 |         logger.info('write graph to a tmp file: %s' % graph_tmpfile)
174 |         with tf.gfile.GFile(graph_tmpfile, 'wb') as f:
175 |             f.write(tmp_g.SerializeToString())
176 |         return graph_tmpfile
177 |     except Exception as e:
178 |         logger.error('fail to optimize the graph!')
179 |         logger.error(e)
180 | 
181 | 
182 | class BertVector:
183 |     def __init__(self, batch_size=1, 
184 |                        pooling_strategy="REDUCE_MEAN", 
185 |                        max_seq_len=40,
186 |                        bert_model_path="./chinese_L-12_H-768_A-12/",
187 |                        graph_tmpfile="./tmpxxx"):
188 |         """
189 |         init BertVector
190 |         :param batch_size:     Depending on your memory default is 32
191 |         """
192 |         self.max_seq_length = max_seq_len
193 |         self.layer_indexes = args.layer_indexes
194 |         self.gpu_memory_fraction = 1
195 |         
196 |         self.file_path = os.path.dirname(__file__)
197 | 
198 |         self.model_dir = os.path.join(self.file_path, bert_model_path)
199 |         self.config_name = os.path.join(self.model_dir, 'bert_config.json')
200 |         self.ckpt_name = os.path.join(self.model_dir, 'bert_model.ckpt')
201 |         self.vocab_file = os.path.join(self.model_dir, 'vocab.txt')
202 | 
203 |         if pooling_strategy == "NONE":
204 |             pooling_strategy = PoolingStrategy.NONE
205 |         elif pooling_strategy == "REDUCE_MAX":
206 |             pooling_strategy = PoolingStrategy.REDUCE_MAX
207 |         elif pooling_strategy == "REDUCE_MEAN":
208 |             pooling_strategy = PoolingStrategy.REDUCE_MEAN
209 |         elif pooling_strategy == "REDUCE_MEAN_MAX":
210 |             pooling_strategy = PoolingStrategy.REDUCE_MEAN_MAX
211 | 
212 |         self.graph_path = optimize_graph(self.config_name,
213 |                                          self.ckpt_name,
214 |                                          pooling_strategy=pooling_strategy, 
215 |                                          max_seq_len=self.max_seq_length, 
216 |                                          graph_tmpfile=graph_tmpfile)
217 | 
218 |         self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_file, do_lower_case=True)
219 |         self.batch_size = batch_size
220 |         self.estimator = self.get_estimator()
221 |         self.input_queue = Queue(maxsize=1)
222 |         self.output_queue = Queue(maxsize=1)
223 |         self.predict_thread = Thread(target=self.predict_from_queue, daemon=True)
224 |         self.predict_thread.start()
225 | 
226 |     def get_estimator(self):
227 |         from tensorflow.python.estimator.estimator import Estimator
228 |         from tensorflow.python.estimator.run_config import RunConfig
229 |         from tensorflow.python.estimator.model_fn import EstimatorSpec
230 | 
231 |         def model_fn(features, labels, mode, params):
232 |             with tf.gfile.GFile(self.graph_path, 'rb') as f:
233 |                 graph_def = tf.GraphDef()
234 |                 graph_def.ParseFromString(f.read())
235 | 
236 |             input_names = ['input_ids', 'input_mask', 'input_type_ids']
237 | 
238 |             output = tf.import_graph_def(graph_def,
239 |                                          input_map={k + ':0': features[k] for k in input_names},
240 |                                          return_elements=['final_encodes:0'])
241 | 
242 |             return EstimatorSpec(mode=mode, predictions={
243 |                 'encodes': output[0]
244 |             })
245 | 
246 |         config = tf.ConfigProto()
247 |         config.gpu_options.allow_growth = True
248 |         config.gpu_options.per_process_gpu_memory_fraction = self.gpu_memory_fraction
249 |         config.log_device_placement = False
250 |         config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
251 | 
252 |         return Estimator(model_fn=model_fn, config=RunConfig(session_config=config),
253 |                          params={'batch_size': self.batch_size})
254 | 
255 |     def predict_from_queue(self):
256 |         prediction = self.estimator.predict(input_fn=self.queue_predict_input_fn, yield_single_examples=False)
257 |         for i in prediction:
258 |             self.output_queue.put(i)
259 | 
260 |     def encode(self, sentence):
261 |         self.input_queue.put(sentence)
262 |         prediction = self.output_queue.get()
263 |         return prediction
264 | 
265 |     def queue_predict_input_fn(self):
266 | 
267 |         return (tf.data.Dataset.from_generator(
268 |             self.generate_from_queue,
269 |             output_types={'unique_ids': tf.int32,
270 |                           'input_ids': tf.int32,
271 |                           'input_mask': tf.int32,
272 |                           'input_type_ids': tf.int32},
273 |             output_shapes={
274 |                 'unique_ids': (1,),
275 |                 'input_ids': (None, self.max_seq_length),
276 |                 'input_mask': (None, self.max_seq_length),
277 |                 'input_type_ids': (None, self.max_seq_length)}))
278 | 
279 |     def generate_from_queue(self):
280 |         while True:
281 |             features = list(self.convert_examples_to_features(seq_length=self.max_seq_length, tokenizer=self.tokenizer))
282 |             yield {
283 |                 'unique_ids': [f.unique_id for f in features],
284 |                 'input_ids': [f.input_ids for f in features],
285 |                 'input_mask': [f.input_mask for f in features],
286 |                 'input_type_ids': [f.input_type_ids for f in features]
287 |             }
288 | 
289 |     def input_fn_builder(self, features, seq_length):
290 |         """Creates an `input_fn` closure to be passed to Estimator."""
291 | 
292 |         all_unique_ids = []
293 |         all_input_ids = []
294 |         all_input_mask = []
295 |         all_input_type_ids = []
296 | 
297 |         for feature in features:
298 |             all_unique_ids.append(feature.unique_id)
299 |             all_input_ids.append(feature.input_ids)
300 |             all_input_mask.append(feature.input_mask)
301 |             all_input_type_ids.append(feature.input_type_ids)
302 | 
303 |         def input_fn(params):
304 |             """The actual input function."""
305 |             batch_size = params["batch_size"]
306 | 
307 |             num_examples = len(features)
308 | 
309 |             # This is for demo purposes and does NOT scale to large data sets. We do
310 |             # not use Dataset.from_generator() because that uses tf.py_func which is
311 |             # not TPU compatible. The right way to load data is with TFRecordReader.
312 |             d = tf.data.Dataset.from_tensor_slices({
313 |                 "unique_ids":
314 |                     tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
315 |                 "input_ids":
316 |                     tf.constant(
317 |                         all_input_ids, shape=[num_examples, seq_length],
318 |                         dtype=tf.int32),
319 |                 "input_mask":
320 |                     tf.constant(
321 |                         all_input_mask,
322 |                         shape=[num_examples, seq_length],
323 |                         dtype=tf.int32),
324 |                 "input_type_ids":
325 |                     tf.constant(
326 |                         all_input_type_ids,
327 |                         shape=[num_examples, seq_length],
328 |                         dtype=tf.int32),
329 |             })
330 | 
331 |             d = d.batch(batch_size=batch_size, drop_remainder=False)
332 |             return d
333 | 
334 |         return input_fn
335 | 
336 |     def model_fn_builder(self, bert_config, init_checkpoint, layer_indexes):
337 |         """Returns `model_fn` closure for TPUEstimator."""
338 | 
339 |         def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
340 |             """The `model_fn` for TPUEstimator."""
341 | 
342 |             unique_ids = features["unique_ids"]
343 |             input_ids = features["input_ids"]
344 |             input_mask = features["input_mask"]
345 |             input_type_ids = features["input_type_ids"]
346 | 
347 |             jit_scope = tf.contrib.compiler.jit.experimental_jit_scope
348 | 
349 |             with jit_scope():
350 |                 model = modeling.BertModel(
351 |                     config=bert_config,
352 |                     is_training=False,
353 |                     input_ids=input_ids,
354 |                     input_mask=input_mask,
355 |                     token_type_ids=input_type_ids)
356 | 
357 |                 if mode != tf.estimator.ModeKeys.PREDICT:
358 |                     raise ValueError("Only PREDICT modes are supported: %s" % (mode))
359 | 
360 |                 tvars = tf.trainable_variables()
361 | 
362 |                 (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars,
363 |                                                                                                            init_checkpoint)
364 | 
365 |                 tf.logging.info("**** Trainable Variables ****")
366 |                 for var in tvars:
367 |                     init_string = ""
368 |                     if var.name in initialized_variable_names:
369 |                         init_string = ", *INIT_FROM_CKPT*"
370 |                     tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
371 |                                     init_string)
372 | 
373 |                 all_layers = model.get_all_encoder_layers()
374 | 
375 |                 predictions = {
376 |                     "unique_id": unique_ids,
377 |                 }
378 | 
379 |                 for (i, layer_index) in enumerate(layer_indexes):
380 |                     predictions["layer_output_%d" % i] = all_layers[layer_index]
381 | 
382 |                 from tensorflow.python.estimator.model_fn import EstimatorSpec
383 | 
384 |                 output_spec = EstimatorSpec(mode=mode, predictions=predictions)
385 |                 return output_spec
386 | 
387 |         return model_fn
388 | 
389 |     def convert_examples_to_features(self, seq_length, tokenizer):
390 |         """Loads a data file into a list of `InputBatch`s."""
391 | 
392 |         features = []
393 |         input_masks = []
394 |         examples = self._to_example(self.input_queue.get())
395 |         for (ex_index, example) in enumerate(examples):
396 |             tokens_a = tokenizer.tokenize(example.text_a)
397 | 
398 |             # if the sentences's length is more than seq_length, only use sentence's left part
399 |             if len(tokens_a) > seq_length - 2:
400 |                 tokens_a = tokens_a[0:(seq_length - 2)]
401 | 
402 |             # The convention in BERT is:
403 |             # (a) For sequence pairs:
404 |             #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
405 |             #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
406 |             # (b) For single sequences:
407 |             #  tokens:   [CLS] the dog is hairy . [SEP]
408 |             #  type_ids: 0     0   0   0  0     0 0
409 |             #
410 |             # Where "type_ids" are used to indicate whether this is the first
411 |             # sequence or the second sequence. The embedding vectors for `type=0` and
412 |             # `type=1` were learned during pre-training and are added to the wordpiece
413 |             # embedding vector (and position vector). This is not *strictly* necessary
414 |             # since the [SEP] token unambiguously separates the sequences, but it makes
415 |             # it easier for the model to learn the concept of sequences.
416 |             #
417 |             # For classification tasks, the first vector (corresponding to [CLS]) is
418 |             # used as as the "sentence vector". Note that this only makes sense because
419 |             # the entire model is fine-tuned.
420 |             tokens = []
421 |             input_type_ids = []
422 |             tokens.append("[CLS]")
423 |             input_type_ids.append(0)
424 |             for token in tokens_a:
425 |                 tokens.append(token)
426 |                 input_type_ids.append(0)
427 |             tokens.append("[SEP]")
428 |             input_type_ids.append(0)
429 | 
430 |             # Where "input_ids" are tokens's index in vocabulary
431 |             input_ids = tokenizer.convert_tokens_to_ids(tokens)
432 | 
433 |             # The mask has 1 for real tokens and 0 for padding tokens. Only real
434 |             # tokens are attended to.
435 |             input_mask = [1] * len(input_ids)
436 |             input_masks.append(input_mask)
437 |             # Zero-pad up to the sequence length.
438 |             while len(input_ids) < seq_length:
439 |                 input_ids.append(0)
440 |                 input_mask.append(0)
441 |                 input_type_ids.append(0)
442 | 
443 |             assert len(input_ids) == seq_length
444 |             assert len(input_mask) == seq_length
445 |             assert len(input_type_ids) == seq_length
446 | 
447 |             if ex_index < 5:
448 |                 tf.logging.info("*** Example ***")
449 |                 tf.logging.info("unique_id: %s" % (example.unique_id))
450 |                 tf.logging.info("tokens: %s" % " ".join(
451 |                     [tokenization.printable_text(x) for x in tokens]))
452 |                 tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
453 |                 tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
454 |                 tf.logging.info(
455 |                     "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
456 | 
457 |             yield InputFeatures(
458 |                 unique_id=example.unique_id,
459 |                 tokens=tokens,
460 |                 input_ids=input_ids,
461 |                 input_mask=input_mask,
462 |                 input_type_ids=input_type_ids)
463 | 
464 |     def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
465 |         """Truncates a sequence pair in place to the maximum length."""
466 | 
467 |         # This is a simple heuristic which will always truncate the longer sequence
468 |         # one token at a time. This makes more sense than truncating an equal percent
469 |         # of tokens from each, since if one sequence is very short then each token
470 |         # that's truncated likely contains more information than a longer sequence.
471 |         while True:
472 |             total_length = len(tokens_a) + len(tokens_b)
473 |             if total_length <= max_length:
474 |                 break
475 |             if len(tokens_a) > len(tokens_b):
476 |                 tokens_a.pop()
477 |             else:
478 |                 tokens_b.pop()
479 | 
480 |     @staticmethod
481 |     def _to_example(sentences):
482 |         import re
483 |         """
484 |         sentences to InputExample
485 |         :param sentences: list of strings
486 |         :return: list of InputExample
487 |         """
488 |         unique_id = 0
489 |         for ss in sentences:
490 |             line = tokenization.convert_to_unicode(ss)
491 |             if not line:
492 |                 continue
493 |             line = line.strip()
494 |             text_a = None
495 |             text_b = None
496 |             m = re.match(r"^(.*) \|\|\| (.*)$", line)
497 |             if m is None:
498 |                 text_a = line
499 |             else:
500 |                 text_a = m.group(1)
501 |                 text_b = m.group(2)
502 |             yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
503 |             unique_id += 1
504 | 
505 | 
506 | if __name__ == "__main__":
507 |     bc = BertVector(batch_size=1, 
508 |                       pooling_strategy="REDUCE_MEAN", 
509 |                       max_seq_len=20,
510 |                       bert_model_path="D:\\workspaces\\code\\tfhub\\keras_dssm\\chinese_L-12_H-768_A-12\\",
511 |                       graph_tmpfile="D:\\workspaces\\code\\tfhub\\keras_dssm\\tmpxxx")
512 |     query = u"新浪移动"
513 |     vectors = bc.encode([query])
514 |     print(str(vectors))
515 |     


--------------------------------------------------------------------------------
/model/Bert/optimization.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The Google AI Language Team Authors.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """Functions and classes related to optimization (weight updates)."""
 16 | 
 17 | from __future__ import absolute_import
 18 | from __future__ import division
 19 | from __future__ import print_function
 20 | 
 21 | import re
 22 | import tensorflow as tf
 23 | 
 24 | 
 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
 26 |   """Creates an optimizer training op."""
 27 |   global_step = tf.train.get_or_create_global_step()
 28 | 
 29 |   learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
 30 | 
 31 |   # Implements linear decay of the learning rate.
 32 |   learning_rate = tf.train.polynomial_decay(
 33 |       learning_rate,
 34 |       global_step,
 35 |       num_train_steps,
 36 |       end_learning_rate=0.0,
 37 |       power=1.0,
 38 |       cycle=False)
 39 | 
 40 |   # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
 41 |   # learning rate will be `global_step/num_warmup_steps * init_lr`.
 42 |   if num_warmup_steps:
 43 |     global_steps_int = tf.cast(global_step, tf.int32)
 44 |     warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
 45 | 
 46 |     global_steps_float = tf.cast(global_steps_int, tf.float32)
 47 |     warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
 48 | 
 49 |     warmup_percent_done = global_steps_float / warmup_steps_float
 50 |     warmup_learning_rate = init_lr * warmup_percent_done
 51 | 
 52 |     is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
 53 |     learning_rate = (
 54 |         (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
 55 | 
 56 |   # It is recommended that you use this optimizer for fine tuning, since this
 57 |   # is how the model was trained (note that the Adam m/v variables are NOT
 58 |   # loaded from init_checkpoint.)
 59 |   optimizer = AdamWeightDecayOptimizer(
 60 |       learning_rate=learning_rate,
 61 |       weight_decay_rate=0.01,
 62 |       beta_1=0.9,
 63 |       beta_2=0.999,
 64 |       epsilon=1e-6,
 65 |       exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
 66 | 
 67 |   if use_tpu:
 68 |     optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
 69 | 
 70 |   tvars = tf.trainable_variables()
 71 |   grads = tf.gradients(loss, tvars)
 72 | 
 73 |   # This is how the model was pre-trained.
 74 |   (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
 75 | 
 76 |   train_op = optimizer.apply_gradients(
 77 |       zip(grads, tvars), global_step=global_step)
 78 | 
 79 |   new_global_step = global_step + 1
 80 |   train_op = tf.group(train_op, [global_step.assign(new_global_step)])
 81 |   return train_op
 82 | 
 83 | 
 84 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
 85 |   """A basic Adam optimizer that includes "correct" L2 weight decay."""
 86 | 
 87 |   def __init__(self,
 88 |                learning_rate,
 89 |                weight_decay_rate=0.0,
 90 |                beta_1=0.9,
 91 |                beta_2=0.999,
 92 |                epsilon=1e-6,
 93 |                exclude_from_weight_decay=None,
 94 |                name="AdamWeightDecayOptimizer"):
 95 |     """Constructs a AdamWeightDecayOptimizer."""
 96 |     super(AdamWeightDecayOptimizer, self).__init__(False, name)
 97 | 
 98 |     self.learning_rate = learning_rate
 99 |     self.weight_decay_rate = weight_decay_rate
100 |     self.beta_1 = beta_1
101 |     self.beta_2 = beta_2
102 |     self.epsilon = epsilon
103 |     self.exclude_from_weight_decay = exclude_from_weight_decay
104 | 
105 |   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
106 |     """See base class."""
107 |     assignments = []
108 |     for (grad, param) in grads_and_vars:
109 |       if grad is None or param is None:
110 |         continue
111 | 
112 |       param_name = self._get_variable_name(param.name)
113 | 
114 |       m = tf.get_variable(
115 |           name=param_name + "/adam_m",
116 |           shape=param.shape.as_list(),
117 |           dtype=tf.float32,
118 |           trainable=False,
119 |           initializer=tf.zeros_initializer())
120 |       v = tf.get_variable(
121 |           name=param_name + "/adam_v",
122 |           shape=param.shape.as_list(),
123 |           dtype=tf.float32,
124 |           trainable=False,
125 |           initializer=tf.zeros_initializer())
126 | 
127 |       # Standard Adam update.
128 |       next_m = (
129 |           tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
130 |       next_v = (
131 |           tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
132 |                                                     tf.square(grad)))
133 | 
134 |       update = next_m / (tf.sqrt(next_v) + self.epsilon)
135 | 
136 |       # Just adding the square of the weights to the loss function is *not*
137 |       # the correct way of using L2 regularization/weight decay with Adam,
138 |       # since that will interact with the m and v parameters in strange ways.
139 |       #
140 |       # Instead we want ot decay the weights in a manner that doesn't interact
141 |       # with the m/v parameters. This is equivalent to adding the square
142 |       # of the weights to the loss with plain (non-momentum) SGD.
143 |       if self._do_use_weight_decay(param_name):
144 |         update += self.weight_decay_rate * param
145 | 
146 |       update_with_lr = self.learning_rate * update
147 | 
148 |       next_param = param - update_with_lr
149 | 
150 |       assignments.extend(
151 |           [param.assign(next_param),
152 |            m.assign(next_m),
153 |            v.assign(next_v)])
154 |     return tf.group(*assignments, name=name)
155 | 
156 |   def _do_use_weight_decay(self, param_name):
157 |     """Whether to use L2 weight decay for `param_name`."""
158 |     if not self.weight_decay_rate:
159 |       return False
160 |     if self.exclude_from_weight_decay:
161 |       for r in self.exclude_from_weight_decay:
162 |         if re.search(r, param_name) is not None:
163 |           return False
164 |     return True
165 | 
166 |   def _get_variable_name(self, param_name):
167 |     """Get the variable name from the tensor name."""
168 |     m = re.match("^(.*):\\d+$", param_name)
169 |     if m is not None:
170 |       param_name = m.group(1)
171 |     return param_name
172 | 


--------------------------------------------------------------------------------
/model/Bert/tokenization.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The Google AI Language Team Authors.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """Tokenization classes."""
 16 | 
 17 | from __future__ import absolute_import
 18 | from __future__ import division
 19 | from __future__ import print_function
 20 | 
 21 | import collections
 22 | import unicodedata
 23 | import six
 24 | import tensorflow as tf
 25 | 
 26 | 
 27 | def convert_to_unicode(text):
 28 |   """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
 29 |   if six.PY3:
 30 |     if isinstance(text, str):
 31 |       return text
 32 |     elif isinstance(text, bytes):
 33 |       return text.decode("utf-8", "ignore")
 34 |     else:
 35 |       raise ValueError("Unsupported string type: %s" % (type(text)))
 36 |   elif six.PY2:
 37 |     if isinstance(text, str):
 38 |       return text.decode("utf-8", "ignore")
 39 |     elif isinstance(text, unicode):
 40 |       return text
 41 |     else:
 42 |       raise ValueError("Unsupported string type: %s" % (type(text)))
 43 |   else:
 44 |     raise ValueError("Not running on Python2 or Python 3?")
 45 | 
 46 | 
 47 | def printable_text(text):
 48 |   """Returns text encoded in a way suitable for print or `tf.logging`."""
 49 | 
 50 |   # These functions want `str` for both Python2 and Python3, but in one case
 51 |   # it's a Unicode string and in the other it's a byte string.
 52 |   if six.PY3:
 53 |     if isinstance(text, str):
 54 |       return text
 55 |     elif isinstance(text, bytes):
 56 |       return text.decode("utf-8", "ignore")
 57 |     else:
 58 |       raise ValueError("Unsupported string type: %s" % (type(text)))
 59 |   elif six.PY2:
 60 |     if isinstance(text, str):
 61 |       return text
 62 |     elif isinstance(text, unicode):
 63 |       return text.encode("utf-8")
 64 |     else:
 65 |       raise ValueError("Unsupported string type: %s" % (type(text)))
 66 |   else:
 67 |     raise ValueError("Not running on Python2 or Python 3?")
 68 | 
 69 | 
 70 | def load_vocab(vocab_file):
 71 |   """Loads a vocabulary file into a dictionary."""
 72 |   vocab = collections.OrderedDict()
 73 |   index = 0
 74 |   with tf.gfile.GFile(vocab_file, "r") as reader:
 75 |     while True:
 76 |       token = convert_to_unicode(reader.readline())
 77 |       if not token:
 78 |         break
 79 |       token = token.strip()
 80 |       vocab[token] = index
 81 |       index += 1
 82 |   return vocab
 83 | 
 84 | 
 85 | def convert_by_vocab(vocab, items):
 86 |   """Converts a sequence of [tokens|ids] using the vocab."""
 87 |   output = []
 88 |   for item in items:
 89 |     output.append(vocab[item])
 90 |   return output
 91 | 
 92 | 
 93 | def convert_tokens_to_ids(vocab, tokens):
 94 |   return convert_by_vocab(vocab, tokens)
 95 | 
 96 | 
 97 | def convert_ids_to_tokens(inv_vocab, ids):
 98 |   return convert_by_vocab(inv_vocab, ids)
 99 | 
100 | 
101 | def whitespace_tokenize(text):
102 |   """Runs basic whitespace cleaning and splitting on a piece of text."""
103 |   text = text.strip()
104 |   if not text:
105 |     return []
106 |   tokens = text.split()
107 |   return tokens
108 | 
109 | 
110 | class FullTokenizer(object):
111 |   """Runs end-to-end tokenziation."""
112 | 
113 |   def __init__(self, vocab_file, do_lower_case=True):
114 |     self.vocab = load_vocab(vocab_file)
115 |     self.inv_vocab = {v: k for k, v in self.vocab.items()}
116 |     self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
117 |     self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
118 | 
119 |   def tokenize(self, text):
120 |     split_tokens = []
121 |     for token in self.basic_tokenizer.tokenize(text):
122 |       for sub_token in self.wordpiece_tokenizer.tokenize(token):
123 |         split_tokens.append(sub_token)
124 | 
125 |     return split_tokens
126 | 
127 |   def convert_tokens_to_ids(self, tokens):
128 |     return convert_by_vocab(self.vocab, tokens)
129 | 
130 |   def convert_ids_to_tokens(self, ids):
131 |     return convert_by_vocab(self.inv_vocab, ids)
132 | 
133 | 
134 | class BasicTokenizer(object):
135 |   """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
136 | 
137 |   def __init__(self, do_lower_case=True):
138 |     """Constructs a BasicTokenizer.
139 | 
140 |     Args:
141 |       do_lower_case: Whether to lower case the input.
142 |     """
143 |     self.do_lower_case = do_lower_case
144 | 
145 |   def tokenize(self, text):
146 |     """Tokenizes a piece of text."""
147 |     text = convert_to_unicode(text)
148 |     text = self._clean_text(text)
149 | 
150 |     # This was added on November 1st, 2018 for the multilingual and Chinese
151 |     # models. This is also applied to the English models now, but it doesn't
152 |     # matter since the English models were not trained on any Chinese data
153 |     # and generally don't have any Chinese data in them (there are Chinese
154 |     # characters in the vocabulary because Wikipedia does have some Chinese
155 |     # words in the English Wikipedia.).
156 |     text = self._tokenize_chinese_chars(text)
157 | 
158 |     orig_tokens = whitespace_tokenize(text)
159 |     split_tokens = []
160 |     for token in orig_tokens:
161 |       if self.do_lower_case:
162 |         token = token.lower()
163 |         token = self._run_strip_accents(token)
164 |       split_tokens.extend(self._run_split_on_punc(token))
165 | 
166 |     output_tokens = whitespace_tokenize(" ".join(split_tokens))
167 |     return output_tokens
168 | 
169 |   def _run_strip_accents(self, text):
170 |     """Strips accents from a piece of text."""
171 |     text = unicodedata.normalize("NFD", text)
172 |     output = []
173 |     for char in text:
174 |       cat = unicodedata.category(char)
175 |       if cat == "Mn":
176 |         continue
177 |       output.append(char)
178 |     return "".join(output)
179 | 
180 |   def _run_split_on_punc(self, text):
181 |     """Splits punctuation on a piece of text."""
182 |     chars = list(text)
183 |     i = 0
184 |     start_new_word = True
185 |     output = []
186 |     while i < len(chars):
187 |       char = chars[i]
188 |       if _is_punctuation(char):
189 |         output.append([char])
190 |         start_new_word = True
191 |       else:
192 |         if start_new_word:
193 |           output.append([])
194 |         start_new_word = False
195 |         output[-1].append(char)
196 |       i += 1
197 | 
198 |     return ["".join(x) for x in output]
199 | 
200 |   def _tokenize_chinese_chars(self, text):
201 |     """Adds whitespace around any CJK character."""
202 |     output = []
203 |     for char in text:
204 |       cp = ord(char)
205 |       if self._is_chinese_char(cp):
206 |         output.append(" ")
207 |         output.append(char)
208 |         output.append(" ")
209 |       else:
210 |         output.append(char)
211 |     return "".join(output)
212 | 
213 |   def _is_chinese_char(self, cp):
214 |     """Checks whether CP is the codepoint of a CJK character."""
215 |     # This defines a "chinese character" as anything in the CJK Unicode block:
216 |     #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
217 |     #
218 |     # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
219 |     # despite its name. The modern Korean Hangul alphabet is a different block,
220 |     # as is Japanese Hiragana and Katakana. Those alphabets are used to write
221 |     # space-separated words, so they are not treated specially and handled
222 |     # like the all of the other languages.
223 |     if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
224 |         (cp >= 0x3400 and cp <= 0x4DBF) or  #
225 |         (cp >= 0x20000 and cp <= 0x2A6DF) or  #
226 |         (cp >= 0x2A700 and cp <= 0x2B73F) or  #
227 |         (cp >= 0x2B740 and cp <= 0x2B81F) or  #
228 |         (cp >= 0x2B820 and cp <= 0x2CEAF) or
229 |         (cp >= 0xF900 and cp <= 0xFAFF) or  #
230 |         (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
231 |       return True
232 | 
233 |     return False
234 | 
235 |   def _clean_text(self, text):
236 |     """Performs invalid character removal and whitespace cleanup on text."""
237 |     output = []
238 |     for char in text:
239 |       cp = ord(char)
240 |       if cp == 0 or cp == 0xfffd or _is_control(char):
241 |         continue
242 |       if _is_whitespace(char):
243 |         output.append(" ")
244 |       else:
245 |         output.append(char)
246 |     return "".join(output)
247 | 
248 | 
249 | class WordpieceTokenizer(object):
250 |   """Runs WordPiece tokenziation."""
251 | 
252 |   def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
253 |     self.vocab = vocab
254 |     self.unk_token = unk_token
255 |     self.max_input_chars_per_word = max_input_chars_per_word
256 | 
257 |   def tokenize(self, text):
258 |     """Tokenizes a piece of text into its word pieces.
259 | 
260 |     This uses a greedy longest-match-first algorithm to perform tokenization
261 |     using the given vocabulary.
262 | 
263 |     For example:
264 |       input = "unaffable"
265 |       output = ["un", "##aff", "##able"]
266 | 
267 |     Args:
268 |       text: A single token or whitespace separated tokens. This should have
269 |         already been passed through `BasicTokenizer.
270 | 
271 |     Returns:
272 |       A list of wordpiece tokens.
273 |     """
274 | 
275 |     text = convert_to_unicode(text)
276 | 
277 |     output_tokens = []
278 |     for token in whitespace_tokenize(text):
279 |       chars = list(token)
280 |       if len(chars) > self.max_input_chars_per_word:
281 |         output_tokens.append(self.unk_token)
282 |         continue
283 | 
284 |       is_bad = False
285 |       start = 0
286 |       sub_tokens = []
287 |       while start < len(chars):
288 |         end = len(chars)
289 |         cur_substr = None
290 |         while start < end:
291 |           substr = "".join(chars[start:end])
292 |           if start > 0:
293 |             substr = "##" + substr
294 |           if substr in self.vocab:
295 |             cur_substr = substr
296 |             break
297 |           end -= 1
298 |         if cur_substr is None:
299 |           is_bad = True
300 |           break
301 |         sub_tokens.append(cur_substr)
302 |         start = end
303 | 
304 |       if is_bad:
305 |         output_tokens.append(self.unk_token)
306 |       else:
307 |         output_tokens.extend(sub_tokens)
308 |     return output_tokens
309 | 
310 | 
311 | def _is_whitespace(char):
312 |   """Checks whether `chars` is a whitespace character."""
313 |   # \t, \n, and \r are technically contorl characters but we treat them
314 |   # as whitespace since they are generally considered as such.
315 |   if char == " " or char == "\t" or char == "\n" or char == "\r":
316 |     return True
317 |   cat = unicodedata.category(char)
318 |   if cat == "Zs":
319 |     return True
320 |   return False
321 | 
322 | 
323 | def _is_control(char):
324 |   """Checks whether `chars` is a control character."""
325 |   # These are technically control characters but we count them as whitespace
326 |   # characters.
327 |   if char == "\t" or char == "\n" or char == "\r":
328 |     return False
329 |   cat = unicodedata.category(char)
330 |   if cat.startswith("C"):
331 |     return True
332 |   return False
333 | 
334 | 
335 | def _is_punctuation(char):
336 |   """Checks whether `chars` is a punctuation character."""
337 |   cp = ord(char)
338 |   # We treat all non-letter/number ASCII as punctuation.
339 |   # Characters such as "^", "$", and "`" are not in the Unicode
340 |   # Punctuation class but we treat them as punctuation anyways, for
341 |   # consistency.
342 |   if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
343 |       (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
344 |     return True
345 |   cat = unicodedata.category(char)
346 |   if cat.startswith("P"):
347 |     return True
348 |   return False
349 | 


--------------------------------------------------------------------------------
/model/Layers.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created:    2019-08-16 13:37:44
  4 | Author:     liuyao8
  5 | Descritipn: 
  6 | """
  7 | 
  8 | from keras.layers import Layer, Activation
  9 | from keras import initializers, regularizers, constraints
 10 | from keras import backend as K
 11 | 
 12 | 
 13 | 
 14 | class AttentionWeightedAverage(Layer):
 15 |     """
 16 |     A weighted Average of different channels across timesteps
 17 |     Reference: <https://blog.csdn.net/qq_40027052/article/details/78210253>
 18 |     """
 19 |     def __init__(self, return_attention=False, **kwargs):
 20 |         self.supports_masking = True
 21 |         self.return_attention = return_attention
 22 |         super(AttentionWeightedAverage, self).__init__(**kwargs)
 23 |         
 24 |         
 25 |     def build(self, input_shape):
 26 |         """Define the weights"""
 27 |         assert len(input_shape) == 3
 28 |         self.W = self.add_weight(name=self.name+'_W', shape=(input_shape[2], 1), initializer='glorot_uniform')
 29 |         self.trainable_weights = [self.W]        
 30 |         super(AttentionWeightedAverage, self).build(input_shape)
 31 |         
 32 |         
 33 |     def call(self, x, mask=None):
 34 |         """
 35 |         Layer's logic:
 36 |             logit = W * x - max(W * x)    # 相当于小神经网络: x -> logit
 37 |             attn = softmax(logit) = exp(logit) / (sum(exp(logit)) + epsilon)
 38 |             result = sum(attn * x)
 39 |             简写：result=sum(p(x)*x)  p(x)=softmax(Wx)  加性模型？？？
 40 |         """
 41 |         logit = K.dot(x, self.W)                                    # (i0, i1, i2) dot (i2, 1) -> (i0, i1, 1)
 42 |         logit = K.reshape(logit, (K.shape(x)[0], K.shape(x)[1]))    # -> (i0, i1)
 43 |         logit = logit - K.max(logit, axis=-1, keepdims=True)        # (i0, i1)
 44 |         ai = K.exp(logit)                                           # (i0, i1)
 45 |         
 46 |         # masked timesteps have 0 weight
 47 |         if mask:
 48 |             ai = ai * K.cast(mask, K.floatx())                      # (i0, i1)
 49 |         
 50 |         attn = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())    # (i0, i1)
 51 |         result = K.sum(x * K.expand_dims(attn), axis=1)                 # (i0, i1, i2) * (i0, i1, 1) -> (i0, i1, i2) -> (i0, i2)
 52 |         return [result, attn] if self.return_attention else result
 53 |         
 54 |     
 55 |     def compute_output_shape(self, input_shape):
 56 |         """The shape transformation logic"""
 57 |         if self.return_attention:
 58 |             return [(input_shape[0], input_shape[2]), (input_shape[0], input_shape[1])]
 59 |         return (input_shape[0], input_shape[2])
 60 |         
 61 | 
 62 | 
 63 | class Attention(Layer):
 64 |     """
 65 |     Keras Layer that implements an Attention mechanism for temporal data.
 66 |     Supports Masking.
 67 |     Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
 68 |     # Input shape
 69 |         3D tensor with shape: `(samples, steps, features)`.
 70 |     # Output shape
 71 |         2D tensor with shape: `(samples, features)`.
 72 |     :param kwargs:
 73 |     Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
 74 |     The dimensions are inferred based on the output shape of the RNN.
 75 |     Example:
 76 |         hidden = LSTM(64, return_sequences=True)(words)
 77 |         sentence = Attention()(hidden)
 78 |         # next add a Dense layer (for classification/regression) or whatever...
 79 |     代码源自：https://github.com/ShawnyXiao/TextClassification-Keras/blob/master/model/HAN/attention.py
 80 |     """
 81 |     def __init__(self, step_dim, W_regularizer=None, b_regularizer=None, W_constraint=None, b_constraint=None, bias=True, **kwargs):
 82 |         self.supports_masking = True
 83 |         self.init = initializers.get('glorot_uniform')
 84 |         self.W_regularizer = regularizers.get(W_regularizer)
 85 |         self.b_regularizer = regularizers.get(b_regularizer)
 86 |         self.W_constraint = constraints.get(W_constraint)
 87 |         self.b_constraint = constraints.get(b_constraint)
 88 |         self.bias = bias
 89 |         self.step_dim = step_dim
 90 |         self.features_dim = 0
 91 |         super(Attention, self).__init__(**kwargs)
 92 | 
 93 | 
 94 |     def build(self, input_shape):
 95 |         assert len(input_shape) == 3
 96 |         self.W = self.add_weight(shape=(input_shape[-1], ), name='{}_W'.format(self.name),
 97 |                                  initializer=self.init, regularizer=self.W_regularizer, constraint=self.W_constraint)
 98 |         self.features_dim = input_shape[-1]
 99 |         if self.bias:
100 |             self.b = self.add_weight(shape=(input_shape[1], ), name='{}_b'.format(self.name),
101 |                                      initializer='zero', regularizer=self.b_regularizer, constraint=self.b_constraint)
102 |         else:
103 |             self.b = None
104 |         self.built = True
105 | 
106 | 
107 |     def compute_mask(self, input, input_mask=None):
108 |         # do not pass the mask to the next layers
109 |         return None
110 | 
111 | 
112 |     def call(self, x, mask=None):
113 |         """简写：result=sum(p(x)*x)   p(x)=softmax(tanh(Wx+b))  加性模型"""
114 |         features_dim = self.features_dim
115 |         step_dim = self.step_dim
116 |         e = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))  # e = K.dot(x, self.W)
117 |         if self.bias:
118 |             e += self.b
119 |         e = K.tanh(e)       # e = tanh(Wx + b)
120 |         a = K.exp(e)        # a = exp(e)
121 |         # apply mask after the exp. will be re-normalized next
122 |         if mask is not None:
123 |             a *= K.cast(mask, K.floatx())   # cast the mask to floatX to avoid float64 upcasting in theano
124 |         # In some cases especially in the early stages of training, the sum may be almost zero and this results in NaN's.
125 |         # A workaround is to add a very small positive number ε to the sum.
126 |         a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())    # a = softmax(e) = softmax(tanh(Wx + b)) = p(x) 表示一种权重
127 |         a = K.expand_dims(a)
128 |         result = K.sum(a * x, axis=1)    # result = sum(p(x) * x)  对x加权求和
129 |         return result
130 | 
131 | 
132 |     def compute_output_shape(self, input_shape):
133 |         return input_shape[0], self.features_dim
134 |     
135 |     
136 |     
137 | class AttentionSelf(Layer):
138 |     """
139 |     Self Attention, codes from: https://blog.csdn.net/xiaosongshine/article/details/90600028
140 |     代码源自：https://github.com/yongzhuo/Keras-TextClassification/blob/master/keras_textclassification/keras_layers/attention_self.py
141 |     """
142 |     def __init__(self, output_dim, **kwargs):
143 |         self.output_dim = output_dim
144 |         super().__init__(**kwargs)
145 | 
146 | 
147 |     def build(self, input_shape):
148 |         # Q、K and V
149 |         self.kernel = self.add_weight(name='QKV', shape=(3, input_shape[2], self.output_dim), trainable=True,
150 |                                       initializer='uniform', regularizer=regularizers.L1L2(0.0000032))
151 |         super().build(input_shape)
152 | 
153 | 
154 |     def call(self, x):
155 |         '''简写：res=p(x)*Vx  p(x)=softmax(Qx*Kx)'''
156 |         QX = K.dot(x, self.kernel[0])
157 |         KX = K.dot(x, self.kernel[1])
158 |         VX = K.dot(x, self.kernel[2])
159 |         print("QX.shape", QX.shape)
160 |         print("K.permute_dimensions(KX, [0, 2, 1]).shape", K.permute_dimensions(KX, [0, 2, 1]).shape)
161 |         
162 |         # batch_dot: 即batch-wise dot product，X与Y同一batch分别是Xi和Yi，则dot(Xi, Yi.T)为该batch的结果，遍历所有batch
163 |         # 当axes!=None时另说
164 |         QK = K.batch_dot(QX, K.permute_dimensions(KX, [0, 2, 1]))
165 |         QK = QK / (64 ** 0.5)   # TODO 64是不是应该改为self.output_dim更合适一些？！？因为KX's shape=(, input_shape[1], output_dim)，KX的维度是output_dim
166 |         QK = K.softmax(QK)
167 |         print("QK.shape", QK.shape)
168 |         
169 |         res = K.batch_dot(QK, VX)
170 |         return res
171 | 
172 | 
173 |     def compute_output_shape(self, input_shape):
174 |         return (input_shape[0], input_shape[1], self.output_dim)
175 | 
176 |         
177 |         
178 | def squash(x, axis=-1):
179 |     # s_squared_norm is really small
180 |     # s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
181 |     # scale = K.sqrt(s_squared_norm)/ (0.5 + s_squared_norm)
182 |     # return scale * x
183 |     s_squared_norm = K.sum(K.square(x), axis, keepdims=True)
184 |     scale = K.sqrt(s_squared_norm + K.epsilon())
185 |     return x / scale
186 | 
187 |     
188 | 
189 | class Capsule(Layer):
190 |     """
191 |     Capsule  TODO 待研究！
192 |     """
193 |     def __init__(self, n_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True, activation=None, **kwargs):
194 |         super(Capsule, self).__init__(**kwargs)
195 |         self.n_capsule = n_capsule
196 |         self.dim_capsule = dim_capsule
197 |         self.routings = routings
198 |         self.kernel_size = kernel_size
199 |         self.share_weights = share_weights
200 |         self.activation =  Activation(activation) if activation else squash
201 |         
202 |         
203 |     def build(self, input_shape):
204 |         super(Capsule, self).build(input_shape)
205 |         input_n_capsule = 1 if self.share_weights else input_shape[-2]
206 |         input_dim_capsule = input_shape[-1]
207 |         self.W = self.add_weight(name='capsule_kernel',
208 |                                  shape=(input_n_capsule, input_dim_capsule, self.n_capsule * self.dim_capsule),
209 |                                  initializer='glorot_uniform',
210 |                                  trainable=True)
211 | 
212 | 
213 |     def call(self, x):
214 |         if self.share_weights:
215 |             u_hat_vecs = K.conv1d(x, self.W)
216 |         else:
217 |             u_hat_vecs = K.local_conv1d(x, self.W, [1], [1])
218 | 
219 |         batch_size = K.shape(x)[0]
220 |         input_n_capsule = K.shape(x)[1]
221 |         u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_n_capsule, self.n_capsule, self.dim_capsule))
222 |         u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
223 |         # final u_hat_vecs.shape = [None, n_capsule, input_n_capsule, dim_capsule]
224 | 
225 |         b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, n_capsule, input_n_capsule]
226 |         for i in range(self.routings):
227 |             b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_n_capsule, n_capsule]
228 |             c = K.softmax(b)
229 |             c = K.permute_dimensions(c, (0, 2, 1))
230 |             b = K.permute_dimensions(b, (0, 2, 1))
231 |             outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
232 |             if i < self.routings - 1:
233 |                 b = K.batch_dot(outputs, u_hat_vecs, [2, 3])
234 |         return outputs
235 | 
236 | 
237 |     def compute_output_shape(self, input_shape):
238 |         return (None, self.n_capsule, self.dim_capsule)
239 | 


--------------------------------------------------------------------------------
/model/TextBertCNN.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created:    2019-08-14 13:35:44
 4 | Author:     liuyao8
 5 | Descritipn: 
 6 | """
 7 | 
 8 | from keras.layers import BatchNormalization, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, Dense
 9 | from keras.models import Model
10 | 
11 | from model.BasicModel import BasicDeepModel
12 | 
13 | 
14 | class TextBertCNN(BasicDeepModel):
15 |     """TextCNN模型，修改自TextCNN.py，支持Bert编码向量的输入，没有embedding"""
16 |     
17 |     def __init__(self, config=None, fsizes=(2, 5), n_filters=64, dropout_p=0.25, **kwargs):
18 |         self.fsizes = fsizes
19 |         self.n_filters = n_filters
20 |         self.dropout_p = dropout_p
21 |         name = 'TextBertCNN'
22 |         config.bert_flag = True
23 |         BasicDeepModel.__init__(self, config=config, name=name, **kwargs)
24 | 
25 |         
26 |     def model_unit(self, inputs, masking, embedding=None, dropout_p=None, fsizes=None, n_filters=None):
27 |         """模型主体Unit"""
28 |         if dropout_p is None:
29 |             dropout_p = self.dropout_p
30 |         if fsizes is None:
31 |             fsizes = self.fsizes
32 |         if n_filters is None:
33 |             n_filters = [self.n_filters] * (fsizes[1] - fsizes[0] + 1)
34 |         
35 |         X = masking(inputs)
36 |         if embedding:               # TODO 为了支持embedding为None的Bert编码向量，暂时还有问题line41
37 |             X = embedding(X)
38 |         X = BatchNormalization()(X)
39 |         X = SpatialDropout1D(dropout_p)(X)
40 |         Xs = []
41 |         for i, fsize in enumerate(range(fsizes[0], fsizes[1] + 1)):
42 |             Xi = Conv1D(n_filters[i], fsize, activation='relu')(X)  # TODO Layer conv1d_5 does not support masking, but was passed an input_mask
43 |             Xi = GlobalMaxPooling1D()(Xi)
44 |             Xs.append(Xi)
45 |         return Xs
46 |     
47 |     
48 |     def build_model(self):
49 |         # 模型主体
50 |         if self.config.token_level == 'word':
51 |             Xs = self.model_unit(self.word_input, self.word_masking, self.word_embedding)
52 |             inputs = [self.word_input]
53 |             
54 |         elif self.config.token_level == 'char':
55 |             Xs = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
56 |             inputs = [self.char_input]
57 |             
58 |         else:
59 |             word_Xs = self.model_unit(self.word_input, self.word_masking, self.word_embedding)
60 |             char_Xs = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
61 |             Xs = word_Xs + char_Xs
62 |             inputs = [self.word_input, self.char_input]
63 |         
64 |         
65 |         # 结构化特征
66 |         if self.config.structured in ['word', 'char', 'both']:
67 |             Xs = Xs + self.structured_input
68 |             inputs = inputs + self.structured_input
69 |         
70 |         
71 |         # 模型结尾
72 |         X = Concatenate()(Xs) if len(Xs) > 1 else Xs[0]
73 |         X = BatchNormalization()(X)
74 |         X = Dropout(0.5)(X)
75 |         # X = Dense(self.hidden_units, activation='relu')(X)  # TODO 不需要隐藏层！？
76 |         out = Dense(self.n_classes, activation=self.activation)(X)
77 |         
78 |         self.model = Model(inputs=inputs, outputs=out)


--------------------------------------------------------------------------------
/model/TextBertGRU.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created:    2019-08-23 16:28:24
 4 | Author:     liuyao8
 5 | Descritipn: 
 6 | """
 7 | 
 8 | from keras.layers import GRU, Dense, BatchNormalization
 9 | from keras.optimizers import Adam
10 | from keras.models import Model
11 | 
12 | from model.BasicModel import BasicDeepModel
13 | from model.Bert.extract_feature import BertVector
14 | 
15 | 
16 | class TextBertGRU(BasicDeepModel):
17 |     """
18 |     Bert向量简单应用
19 |     Bert(Tensorflow实现)预训练模型对原始文本进行向量化编码，输入至RNN模型(Keras实现)里微调
20 |     注意，Bert不参与模型搭建，更不参与训练！相当于提前训练好的Word Embedding那样使用
21 |     """
22 |     
23 |     def __init__(self, config=None, rnn_units=128, dense_units=128, **kwargs):
24 |         self.rnn_units = rnn_units
25 |         self.dense_units = dense_units
26 |         name = 'TextBertGRU'
27 |         config.bert_flag = True     # 唯一与BERT关联的地方
28 |         BasicDeepModel.__init__(self, config=config, name=name, **kwargs)
29 | 
30 | 
31 |     def build_model(self):
32 |         """模型结构与BERT没任何关系，只不过其输入是BERT编码的向量"""
33 |         X = self.word_masking(self.word_input)              # TODO 务必要有masking，否则loss和val_acc几乎一直不保持不变！
34 |         X = GRU(self.rnn_units, dropout=0.25, recurrent_dropout=0.25)(X)
35 |         X = Dense(self.dense_units, activation='relu')(X)
36 |         X = BatchNormalization()(X)
37 |         out = Dense(self.n_classes, activation=self.activation)(X)
38 |         self.model = Model(inputs=self.word_input, outputs=out)
39 |     
40 |     
41 |     # 模型创建、训练与评估，详见脚本ModelTrain.py中的example函数
42 |     
43 |     
44 |     
45 |     # TODO 以下待办，暂时不用看    设计成数据编码环节，可通用于其他所有模型！
46 |     def build_bert_model(self):
47 |         self.bert_model = BertVector(pooling_strategy='NONE',
48 |                                      max_seq_len=self.config.bert_maxlen, 
49 |                                      bert_model_path=self.config.bert_model_path, 
50 |                                      graph_tmpfile=self.config.bert_graph_tmpfile)
51 |     
52 |     
53 |     def sentence_to_bert(self, sentence):
54 |         """单个句子编码为向量"""
55 |         return self.bert_model.encode([sentence])["encodes"][0]
56 |     
57 |     
58 |     def sentences_to_bert(self, sentences):
59 |         """多个句子编码为向量"""
60 |         return [self.sentence_to_bert(sent.strip()) for sent in sentences]
61 |         
62 |     
63 |     def data_generator(self, sentences, labels):
64 |         """编码数据，生成器"""
65 |         while True:
66 |             for i in range(0, len(sentences), self.batch_size):
67 |                 X = self.sentences_to_bert(sentences[i: i + self.batch_size])
68 |                 Y = labels[i: i + self.batch_size]
69 |                 yield (X, Y)
70 |         
71 |     
72 |     def data_prepare(self):
73 |         """准备train/test/val，未编码"""
74 |         # TODO 待办！
75 |         x_train, y_train = None, None
76 |         x_val, y_val = None, None
77 |         x_test, y_test = None, None
78 |         return x_train, y_train, x_val, y_val, x_test, y_test
79 |     
80 |     
81 |     def train_generator(self):
82 |         x_train, y_train, x_val, y_val, x_test, y_test = self.data_prepare()
83 |         self.model.compile(loss=self.loss, optimizer=Adam(lr=0.001), metrics=self.metrics)
84 |         self.model.fit_generator(self.data_generator(x_train, y_train),
85 |                                  steps_per_epoch=int(len(x_train)/self.batch_size)+1,
86 |                                  epochs=10,
87 |                                  verbose=1,
88 |                                  validation_data=(x_val, y_val),
89 |                                  validation_steps=None)
90 |         


--------------------------------------------------------------------------------
/model/TextCNN.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created:    2019-08-14 13:35:44
 4 | Author:     liuyao8
 5 | Descritipn: 
 6 | """
 7 | 
 8 | from keras.layers import BatchNormalization, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, Dense
 9 | from keras.models import Model
10 | 
11 | from model.BasicModel import BasicDeepModel    # TODO model. ???
12 | 
13 | 
14 | class TextCNN(BasicDeepModel):
15 |     """TextCNN模型，支持char,word和both. both时char和word分别进行TextCNN，然后拼接结果"""
16 |     
17 |     def __init__(self, config=None, fsizes=(2, 5), n_filters=64, dropout_p=0.25, **kwargs):
18 |         self.fsizes = fsizes
19 |         self.n_filters = n_filters      # TODO 是否是BasicDeepModel通用？通用的话放在BasicDeepModel那里
20 |         self.dropout_p = dropout_p
21 |         name = 'TextCNN_'
22 |         BasicDeepModel.__init__(self, config=config, name=name, **kwargs)
23 | 
24 |         
25 |     def model_unit(self, inputs, masking, embedding, dropout_p=None, fsizes=None, n_filters=None):
26 |         """模型主体Unit"""
27 |         if dropout_p is None:
28 |             dropout_p = self.dropout_p
29 |         if fsizes is None:
30 |             fsizes = self.fsizes
31 |         if n_filters is None:
32 |             n_filters = [self.n_filters] * (fsizes[1] - fsizes[0] + 1)
33 |         
34 |         X = masking(inputs)
35 |         X = embedding(X)
36 |         X = BatchNormalization()(X)
37 |         X = SpatialDropout1D(dropout_p)(X)
38 |         Xs = []
39 |         for i, fsize in enumerate(range(fsizes[0], fsizes[1] + 1)):
40 |             Xi = Conv1D(n_filters[i], fsize, activation='relu')(X)
41 |             Xi = GlobalMaxPooling1D()(Xi)
42 |             Xs.append(Xi)
43 |         return Xs
44 |     
45 |     
46 |     def build_model(self):
47 |         # 模型主体
48 |         if self.config.token_level == 'word':
49 |             Xs = self.model_unit(self.word_input, self.word_masking, self.word_embedding)
50 |             inputs = [self.word_input]
51 |             
52 |         elif self.config.token_level == 'char':
53 |             Xs = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
54 |             inputs = [self.char_input]
55 |             
56 |         else:
57 |             word_Xs = self.model_unit(self.word_input, self.word_masking, self.word_embedding)
58 |             char_Xs = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
59 |             Xs = word_Xs + char_Xs
60 |             inputs = [self.word_input, self.char_input]
61 |         
62 |         
63 |         # 结构化特征
64 |         if self.config.structured in ['word', 'char', 'both']:
65 |             Xs = Xs + self.structured_input
66 |             inputs = inputs + self.structured_input
67 |         
68 |         
69 |         # 模型结尾
70 |         X = Concatenate()(Xs) if len(Xs) > 1 else Xs[0]
71 |         X = BatchNormalization()(X)
72 |         X = Dropout(0.5)(X)
73 |         # X = Dense(self.hidden_units, activation='relu')(X)  # TODO 不需要隐藏层！？
74 |         out = Dense(self.n_classes, activation=self.activation)(X)
75 |         
76 |         self.model = Model(inputs=inputs, outputs=out)


--------------------------------------------------------------------------------
/model/TextCNN_BiGRU.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created:    2019-08-15 20:08:42
 4 | Author:     liuyao8
 5 | Descritipn: 
 6 | """
 7 | 
 8 | from keras.layers import BatchNormalization, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, \
 9 |                         Concatenate, Dropout, Dense, Bidirectional, GRU, GlobalAveragePooling1D
10 | from keras.models import Model
11 | 
12 | from model.BasicModel import BasicDeepModel
13 | 
14 | 
15 | class TextCNN_BiGRU(BasicDeepModel):
16 |     """TextCNN模型，支持char,word和both. both时char进行TextCNN，word进行RNN，然后拼接结果"""
17 |     
18 |     def __init__(self, config=None, fsizes=(2, 5), n_filters=64, rnn_units=64, dropout_p=0.25, **kwargs):
19 |         self.fsizes = fsizes
20 |         self.n_filters = n_filters      # TODO 是否是BasicDeepModel通用？通用的话放在BasicDeepModel那里
21 |         self.rnn_units = rnn_units
22 |         self.dropout_p = dropout_p
23 |         name = 'TextCNN_BiGRU_' + config.token_level
24 |         BasicDeepModel.__init__(self, config=config, name=name, **kwargs)
25 |         
26 |         
27 |     def model_unit(self, inputs, masking, embedding, dropout_p=None, fsizes=None, n_filters=None):
28 |         """模型主体Unit"""
29 |         if dropout_p is None:
30 |             dropout_p = self.dropout_p
31 |         if fsizes is None:
32 |             fsizes = self.fsizes
33 |         if n_filters is None:
34 |             n_filters = [self.n_filters] * (fsizes[1] - fsizes[0] + 1)
35 |         
36 |         X = masking(inputs)
37 |         X = embedding(X)
38 |         X = BatchNormalization()(X)
39 |         X = SpatialDropout1D(dropout_p)(X)
40 |         Xs = []
41 |         for i, fsize in enumerate(range(fsizes[0], fsizes[1] + 1)):
42 |             Xi = Conv1D(n_filters[i], fsize, activation='relu')(X)
43 |             Xi = GlobalMaxPooling1D()(Xi)
44 |             Xs.append(Xi)
45 |         return Xs
46 |     
47 |         
48 |     def build_model(self):
49 |         # 模型主体
50 |         if self.config.token_level == 'word':
51 |             Xs = self.model_unit(self.word_input, self.word_masking, self.word_embedding)
52 |             inputs = [self.word_input]
53 |             
54 |         elif self.config.token_level == 'char':
55 |             Xs = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
56 |             inputs = [self.char_input]
57 |             
58 |         else:
59 |             # 对word进行特殊处理！ word的BiGRU + char的TextCNN # TODO WHY???
60 |             word_X = self.word_embedding(self.word_input)
61 |             word_X = BatchNormalization()(word_X)
62 |             for _ in range(2):
63 |                 word_X = SpatialDropout1D(0.2)(word_X)
64 |                 word_X = Bidirectional(GRU(self.rnn_units // 2, return_sequences=True))(word_X)
65 |             word_maxpool = GlobalMaxPooling1D()(word_X)
66 |             word_avgpool = GlobalAveragePooling1D()(word_X)
67 |             
68 |             char_Xs = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
69 |             Xs = [word_maxpool, word_avgpool] + char_Xs
70 |             inputs = [self.word_input, self.char_input]
71 |         
72 |         
73 |         # 结构化特征
74 |         if self.config.structured in ['word', 'char', 'both']:
75 |             Xs = Xs + self.structured_input
76 |             inputs = inputs + self.structured_input
77 |         
78 |         
79 |         # 模型结尾
80 |         X = Concatenate()(Xs) if len(Xs) > 1 else Xs[0]
81 |         X = Dropout(0.5)(X)
82 |         out = Dense(self.n_classes, activation=self.activation)(X)
83 |         
84 |         self.model = Model(inputs=inputs, outputs=out)


--------------------------------------------------------------------------------
/model/TextCapsule.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created:    2019-08-20 23:14:02
 4 | Author:     liuyao8
 5 | Descritipn: 
 6 | """
 7 | 
 8 | from keras.layers import BatchNormalization, Bidirectional, GRU, Flatten, Dropout, \
 9 |                         Concatenate, GlobalMaxPooling1D, GlobalAveragePooling1D, SpatialDropout1D, Dense
10 | from keras.models import Model
11 | 
12 | from model.BasicModel import BasicDeepModel
13 | from model.Layers import Capsule
14 | 
15 | 
16 | class TextCapsule(BasicDeepModel):
17 |     
18 |     def __init__(self, config=None, rnn_units=30, dropout_p=0.2, n_capsule=10, dim_capsule=16, routings=5, share_weights=True, **kwargs):
19 |         self.rnn_units = rnn_units
20 |         self.dropout_p = dropout_p
21 |         self.n_capsule = n_capsule
22 |         self.dim_capsule = dim_capsule
23 |         self.routings = routings
24 |         self.share_weights = share_weights
25 |         name = 'TextCapsule_' + config.token_level
26 |         BasicDeepModel.__init__(self, config=config, name=name, **kwargs)
27 |         
28 |         
29 |     def model_unit(self, inputs, masking, embedding, dropout_p=None, n_capsule=None, dim_capsule=None, routings=None, share_weights=None):
30 |         """模型主体Unit"""
31 |         if dropout_p is None:
32 |             dropout_p = self.dropout_p
33 |         if n_capsule is None:
34 |             n_capsule = self.n_capsule
35 |         if dim_capsule is None:
36 |             dim_capsule = self.dim_capsule
37 |         if routings is None:
38 |             routings = self.routings
39 |         if share_weights is None:
40 |             share_weights = self.share_weights
41 |             
42 |         X = masking(inputs)
43 |         X = embedding(X)
44 |         X = BatchNormalization()(X)
45 |         X = SpatialDropout1D(dropout_p)(X)
46 |         X = Bidirectional(GRU(64, return_sequences=True))(X)
47 |         capsule = Capsule(n_capsule=n_capsule, dim_capsule=dim_capsule, routings=routings, share_weights=share_weights)(X)
48 |         X = Flatten()(capsule)
49 |         return X
50 |         
51 |         
52 |     def build_model(self):
53 |         # 模型主体
54 |         if self.config.token_level == 'word':
55 |             X = self.model_unit(self.word_input, self.word_masking, self.word_embedding)
56 |             inputs = [self.word_input]
57 |             
58 |         elif self.config.token_level == 'char':
59 |             X = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
60 |             inputs = [self.char_input]
61 |             
62 |         else:
63 |             # 对word进行特殊处理！
64 |             word_X = self.word_masking(self.word_input)
65 |             word_X = self.word_embedding(word_X)
66 |             word_X = SpatialDropout1D(0.25)(word_X)
67 |             word_X = Bidirectional(GRU(self.rnn_units, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(word_X)  # TODO ???
68 |             word_X = Bidirectional(GRU(self.rnn_units, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(word_X)
69 |             word_maxpool = GlobalMaxPooling1D()(word_X)
70 |             word_avgpool = GlobalAveragePooling1D()(word_X)
71 |             
72 |             char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
73 |             X = Concatenate()([word_maxpool, word_avgpool, char_X])
74 |             inputs = [self.word_input, self.char_input]
75 |         
76 |         
77 |         # 结构化特征
78 |         if self.config.structured in ['word', 'char', 'both']:
79 |             X = Concatenate()([X] + self.structured_input)
80 |             inputs = inputs + self.structured_input
81 |         
82 |         
83 |         # 模型结尾
84 |         X = Dropout(0.5)(X)
85 |         out = Dense(self.n_classes, activation=self.activation)(X)
86 |         
87 |         self.model = Model(inputs=inputs, outputs=out)


--------------------------------------------------------------------------------
/model/TextConvLSTM2_Attn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created:    2019-08-17 21:11:18
 4 | Author:     liuyao8
 5 | Descritipn: 
 6 | """
 7 | 
 8 | from keras.layers import BatchNormalization, SpatialDropout1D, Conv1D, Bidirectional, LSTM, GRU, \
 9 |                         GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate, Dropout, Dense
10 | from keras.models import Model
11 | 
12 | from model.BasicModel import BasicDeepModel
13 | from model.Layers import AttentionWeightedAverage
14 | 
15 | 
16 | class TextConvLSTM2_Attn(BasicDeepModel):
17 |     
18 |     def __init__(self, config=None, n_filters=128, rnn_units=64, dropout_p=0.25, with_attention=True, **kwargs):
19 |         self.n_filters = n_filters
20 |         self.rnn_units = rnn_units
21 |         self.dropout_p = dropout_p
22 |         self.with_attention = with_attention
23 |         name = 'TextConvLSTM2_Attn_' + config.token_level
24 |         BasicDeepModel.__init__(self, config=config, name=name, **kwargs)
25 |         
26 |         
27 |     def model_unit(self, inputs, masking, embedding, n_filters=None, rnn_units=None, dropout_p=None):
28 |         """模型主体Unit"""
29 |         if n_filters is None:
30 |             n_filters = self.n_filters
31 |         if rnn_units is None:
32 |             rnn_units = [self.rnn_units] * 2
33 |         if isinstance(rnn_units, int):
34 |             rnn_units = [rnn_units] * 2
35 |         if dropout_p is None:
36 |             dropout_p = [self.dropout_p] * 2
37 |         if isinstance(dropout_p, float):
38 |             dropout_p = [dropout_p] * 2
39 |         
40 |         X = masking(inputs)
41 |         X = embedding(X)
42 |         X = BatchNormalization()(X)
43 |         X = SpatialDropout1D(dropout_p[0])(X)
44 |         # TODO Conv1D没有activation ???
45 |         X = Conv1D(n_filters, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(X) # 相比LSTMGRUModel，此处多了个Conv1D
46 |         X = Bidirectional(LSTM(rnn_units[0], return_sequences=True))(X)
47 |         X = SpatialDropout1D(dropout_p[1])(X)
48 |         X = Bidirectional(GRU(rnn_units[1], return_sequences=True))(X)
49 |         
50 |         maxpool = GlobalMaxPooling1D()(X)
51 |         avgpool = GlobalAveragePooling1D()(X)
52 |         if self.with_attention:
53 |             attn = AttentionWeightedAverage()(X)
54 |             X = Concatenate()([maxpool, avgpool, attn])
55 |         else:
56 |             X = Concatenate()([maxpool, avgpool])
57 |         return X
58 |     
59 |     
60 |     def build_model(self):
61 |         # 模型主体
62 |         if self.config.token_level == 'word':
63 |             X = self.model_unit(self.word_input, self.word_masking, self.word_embedding)
64 |             inputs = [self.word_input]
65 |             
66 |         elif self.config.token_level == 'char':
67 |             X = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
68 |             inputs = [self.char_input]
69 |             
70 |         else:
71 |             # 与TextConvLSTMGRU对word进行特殊处理！  # TODO WHY???   与char相比，没有conv和attention
72 |             word_X = self.word_masking(self.word_input)
73 |             word_X = self.word_embedding(word_X)
74 |             word_X = BatchNormalization()(word_X)
75 |             word_X = SpatialDropout1D(0.2)(word_X)      # TODO 0.2  下面0.1 ？
76 |             word_X = Bidirectional(GRU(self.rnn_units // 2, return_sequences=True))(word_X)
77 |             word_X = SpatialDropout1D(0.1)(word_X)
78 |             word_X = Bidirectional(GRU(self.rnn_units // 2, return_sequences=True))(word_X)
79 |             word_maxpool = GlobalMaxPooling1D()(word_X)
80 |             word_avgpool = GlobalAveragePooling1D()(word_X)
81 |             
82 |             char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
83 |             X = Concatenate()([word_maxpool, word_avgpool, char_X])
84 |             inputs = [self.word_input, self.char_input]
85 |         
86 |         
87 |         # 结构化特征
88 |         if self.config.structured in ['word', 'char', 'both']:
89 |             X = Concatenate()([X] + self.structured_input)
90 |             inputs = inputs + self.structured_input
91 |         
92 |         
93 |         # 模型结尾
94 |         X = Dropout(0.5)(X)
95 |         out = Dense(self.n_classes, activation=self.activation)(X)
96 |         
97 |         self.model = Model(inputs=inputs, outputs=out)


--------------------------------------------------------------------------------
/model/TextConvLSTM_Attn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created:    2019-08-17 20:52:19
 4 | Author:     liuyao8
 5 | Descritipn: 
 6 | """
 7 | 
 8 | from keras.layers import BatchNormalization, SpatialDropout1D, Conv1D, Bidirectional, LSTM, GRU, \
 9 |                         GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate, Dropout, Dense
10 | from keras.models import Model
11 | 
12 | from model.BasicModel import BasicDeepModel
13 | from model.Layers import AttentionWeightedAverage
14 | 
15 | 
16 | class TextConvLSTM_Attn(BasicDeepModel):
17 |     
18 |     def __init__(self, config=None, n_filters=128, rnn_units=64, dropout_p=0.25, with_attention=True, **kwargs):
19 |         self.n_filters = n_filters
20 |         self.rnn_units = rnn_units
21 |         self.dropout_p = dropout_p
22 |         self.with_attention = with_attention
23 |         name = 'TextConvLSTM_Attn_' + config.token_level
24 |         BasicDeepModel.__init__(self, config=config, name=name, **kwargs)
25 |         
26 |         
27 |     def model_unit(self, inputs, masking, embedding, n_filters=None, rnn_units=None, dropout_p=None):
28 |         """模型主体Unit"""
29 |         if n_filters is None:
30 |             n_filters = self.n_filters
31 |         if rnn_units is None:
32 |             rnn_units = [self.rnn_units] * 2
33 |         if isinstance(rnn_units, int):
34 |             rnn_units = [rnn_units] * 2
35 |         if dropout_p is None:
36 |             dropout_p = [self.dropout_p] * 2
37 |         if isinstance(dropout_p, float):
38 |             dropout_p = [dropout_p] * 2
39 |         
40 |         X = masking(inputs)
41 |         X = embedding(X)
42 |         X = BatchNormalization()(X)
43 |         X = SpatialDropout1D(dropout_p[0])(X)
44 |         X = Conv1D(n_filters, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(X) # 相比LSTMGRUModel，此处多了个Conv1D
45 |         X = Bidirectional(LSTM(rnn_units[0], return_sequences=True))(X)
46 |         X = SpatialDropout1D(dropout_p[1])(X)
47 |         X = Bidirectional(GRU(rnn_units[1], return_sequences=True))(X)
48 |         
49 |         maxpool = GlobalMaxPooling1D()(X)
50 |         avgpool = GlobalAveragePooling1D()(X)
51 |         if self.with_attention:
52 |             attn = AttentionWeightedAverage()(X)
53 |             X = Concatenate()([maxpool, avgpool, attn])
54 |         else:
55 |             X = Concatenate()([maxpool, avgpool])
56 |         return X
57 |     
58 |     
59 |     def build_model(self):
60 |         # 模型主体
61 |         if self.config.token_level == 'word':
62 |             X = self.model_unit(self.word_input, self.word_masking, self.word_embedding)
63 |             inputs = [self.word_input]
64 |             
65 |         elif self.config.token_level == 'char':
66 |             X = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
67 |             inputs = [self.char_input]
68 |             
69 |         else:
70 |             word_X = self.model_unit(self.word_input, self.word_masking, self.word_embedding)
71 |             char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
72 |             X = Concatenate()([word_X, char_X])
73 |             inputs = [self.word_input, self.char_input]
74 |         
75 |         
76 |         # 结构化特征
77 |         if self.config.structured in ['word', 'char', 'both']:
78 |             X = Concatenate()([X] + self.structured_input)
79 |             inputs = inputs + self.structured_input
80 |         
81 |         
82 |         # 模型结尾
83 |         X = Dropout(0.5)(X)
84 |         out = Dense(self.n_classes, activation=self.activation)(X)
85 |         
86 |         self.model = Model(inputs=inputs, outputs=out)


--------------------------------------------------------------------------------
/model/TextDPCNN.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created:    2019-08-19 21:25:09
  4 | Author:     liuyao8
  5 | Descritipn: 
  6 | """
  7 | 
  8 | from keras.layers import BatchNormalization, PReLU, Add, MaxPooling1D, Bidirectional, GRU, Dropout, \
  9 |                         Concatenate, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, SpatialDropout1D, Dense
 10 | from keras import regularizers
 11 | from keras import backend as K
 12 | from keras.models import Model
 13 | 
 14 | from model.BasicModel import BasicDeepModel
 15 | 
 16 | 
 17 | class TextDPCNN(BasicDeepModel):
 18 |     
 19 |     def __init__(self, config=None, rnn_units=30, n_filters=64, filter_size=3, dp=7, dense_units=256, **kwargs):
 20 |         self.rnn_units = rnn_units
 21 |         self.n_filters = n_filters
 22 |         self.filter_size = filter_size
 23 |         self.dp = dp
 24 |         self.dense_units = dense_units
 25 |         name = 'TextDPCNN_' + config.token_level
 26 |         BasicDeepModel.__init__(self, config=config, name=name, **kwargs)
 27 |         
 28 |         
 29 |     def block(self, X, n_filters, filter_size, kernel_reg, bias_reg, first=False, last=False):
 30 |         """DPCNN网络结构中需要重复的block"""
 31 |         X1 = Conv1D(n_filters, kernel_size=filter_size, padding='same', kernel_regularizer=kernel_reg, bias_regularizer=bias_reg)(X)
 32 |         X1 = BatchNormalization()(X1)
 33 |         X1 = PReLU()(X1)
 34 |         X1 = Conv1D(n_filters, kernel_size=filter_size, padding='same', kernel_regularizer=kernel_reg, bias_regularizer=bias_reg)(X1)
 35 |         X1 = BatchNormalization()(X1)
 36 |         X1 = PReLU()(X1)            # (, 57, 64)
 37 |         
 38 |         if first:
 39 |             X = Conv1D(n_filters, kernel_size=1, padding='same', kernel_regularizer=kernel_reg, bias_regularizer=bias_reg)(X)   # (, 57, 64)
 40 |         
 41 |         X = Add()([X, X1])          # (, 57, 64)
 42 |         
 43 |         if last:
 44 |             X = GlobalMaxPooling1D()(X)
 45 |         else:
 46 |             X = MaxPooling1D(pool_size=3, strides=2)(X)     # (, 28, 64)
 47 |         return X
 48 |         
 49 |         
 50 |     def model_unit(self, inputs, masking, embedding, n_filters=None, filter_size=None, dp=None, dense_units=None):
 51 |         """模型主体Unit"""
 52 |         kernel_reg=regularizers.l2(0.00001)
 53 |         bias_reg=regularizers.l2(0.00001)
 54 |         if n_filters is None:
 55 |             n_filters = self.n_filters
 56 |         if filter_size is None:
 57 |             filter_size = self.filter_size
 58 |         if dp is None:
 59 |             dp = self.dp
 60 |         if dense_units is None:
 61 |             dense_units = self.dense_units
 62 |             
 63 |         # Region Embedding
 64 |         X = masking(inputs)
 65 |         X = embedding(X)
 66 |         X = BatchNormalization()(X)     # (, 57, 100)
 67 |         
 68 |         # 第1层 pre-activation
 69 |         X = self.block(X, n_filters, filter_size, kernel_reg, bias_reg, first=True)     # (, 28, 64)
 70 |         
 71 |         # 重复dp次: 不含第1层
 72 |         flag_last = False
 73 |         for i in range(dp):
 74 |             if i + 1 == dp or flag_last:        # 最后1层
 75 |                 X = self.block(X, n_filters, filter_size, kernel_reg, bias_reg, last=True)
 76 |                 break                           # 务必不要忘了break！！！
 77 |             else:                               # 中间层
 78 |                 if K.int_shape(X)[1] // 2 < 8:  # 此次block操作后没法继续MaxPooling1D，下一层变为最后1层(GlobalMaxPooling1D)
 79 |                     flag_last = True
 80 |                 X = self.block(X, n_filters, filter_size, kernel_reg, bias_reg)
 81 |         
 82 |         # 全连接层
 83 |         X = Dense(dense_units)(X)
 84 |         X = BatchNormalization()(X)
 85 |         X = PReLU()(X)
 86 |         return X
 87 |         
 88 |         
 89 |     def build_model(self):
 90 |         # 模型主体
 91 |         if self.config.token_level == 'word':
 92 |             X = self.model_unit(self.word_input, self.word_masking, self.word_embedding)
 93 |             inputs = [self.word_input]
 94 |             
 95 |         elif self.config.token_level == 'char':
 96 |             X = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
 97 |             inputs = [self.char_input]
 98 |             
 99 |         else:
100 |             # 对word进行特殊处理！
101 |             word_X = self.word_embedding(self.word_input)
102 |             word_X = SpatialDropout1D(0.25)(word_X)
103 |             word_X = Bidirectional(GRU(self.rnn_units, return_sequences=True))(word_X)
104 |             word_X = SpatialDropout1D(0.25)(word_X)
105 |             word_X = Bidirectional(GRU(self.rnn_units, return_sequences=True))(word_X)
106 |             word_maxpool = GlobalMaxPooling1D()(word_X)
107 |             word_avgpool = GlobalAveragePooling1D()(word_X)
108 |             
109 |             char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
110 |             X = Concatenate()([word_maxpool, word_avgpool, char_X])
111 |             inputs = [self.word_input, self.char_input]
112 |         
113 |         
114 |         # 结构化特征
115 |         if self.config.structured in ['word', 'char', 'both']:
116 |             X = Concatenate()([X] + self.structured_input)
117 |             inputs = inputs + self.structured_input
118 |         
119 |         
120 |         # 模型结尾
121 |         X = Dropout(0.5)(X)
122 |         out = Dense(self.n_classes, activation=self.activation)(X)
123 |         
124 |         self.model = Model(inputs=inputs, outputs=out)


--------------------------------------------------------------------------------
/model/TextGRU2_Attn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created:    2019-08-17 19:20:59
 4 | Author:     liuyao8
 5 | Descritipn: 
 6 | """
 7 | 
 8 | from keras.layers import BatchNormalization, Bidirectional, GRU, SpatialDropout1D, Lambda, \
 9 |                         GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate, Dropout, Dense
10 | from keras.models import Model
11 | 
12 | from model.BasicModel import BasicDeepModel
13 | from model.Layers import AttentionWeightedAverage
14 | 
15 | 
16 | class TextGRU2_Attn(BasicDeepModel):
17 |     
18 |     def __init__(self, config=None, n_rnns=None, rnn_units=64, dropout_p=0.25, with_attention=True, **kwargs):
19 |         if n_rnns is None:
20 |             self.n_rnns = (2, 2) if config.token_level == 'both' else 2
21 |         self.rnn_units = rnn_units
22 |         self.dropout_p = dropout_p
23 |         name = 'TextGRU2_Attn_' + config.token_level
24 |         BasicDeepModel.__init__(self, config=config, name=name, **kwargs)
25 |         
26 |         
27 |     def model_unit(self, inputs, masking, embedding, n_rnns=None, rnn_units=None, dropout_p=None, with_attention=None):
28 |         """模型主体Unit"""
29 |         if n_rnns is None:
30 |             n_rnns = self.n_rnns
31 |         if rnn_units is None:
32 |             rnn_units = [self.rnn_units] * n_rnns
33 |         if isinstance(rnn_units, int):
34 |             rnn_units = [rnn_units] * n_rnns
35 |         if dropout_p is None:
36 |             dropout_p = [self.dropout_p] * n_rnns
37 |         if isinstance(dropout_p, float):
38 |             dropout_p = [dropout_p] * n_rnns
39 |         if with_attention is None:
40 |             with_attention = self.with_attention
41 |         
42 |         X = masking(inputs)
43 |         X = embedding(X)
44 |         X = BatchNormalization()(X)
45 |         for i in range(n_rnns):
46 |             X = Bidirectional(GRU(rnn_units[i], return_sequences=True))(X)
47 |             X = SpatialDropout1D(dropout_p[i])(X)
48 |         
49 |         maxpool = GlobalMaxPooling1D()(X)
50 |         avgpool = GlobalAveragePooling1D()(X)
51 |         last = Lambda(lambda x: x[:, -1])(X)        # TODO 注释掉！？
52 |         if with_attention:
53 |             attn = AttentionWeightedAverage()(X)
54 |             X = Concatenate()([maxpool, avgpool, last, attn])
55 |         else:
56 |             X = Concatenate()([maxpool, avgpool, last])
57 |         return X
58 |     
59 |     
60 |     def build_model(self):
61 |         # 模型主体
62 |         if self.config.token_level == 'word':
63 |             X = self.model_unit(self.word_input, self.word_masking, self.word_embedding)
64 |             inputs = [self.word_input]
65 |             
66 |         elif self.config.token_level == 'char':
67 |             X = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
68 |             inputs = [self.char_input]
69 |             
70 |         else:
71 |             # 对word进行特殊处理！  # TODO WHY???
72 |             # TODO 与TextGRU的唯一区别，后续TextAttention和TextAttention2可统一成一个
73 |             word_X = self.model_unit(self.word_input, self.word_masking, self.word_embedding, self.n_rnns[0], with_attention=False)
74 |             char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding, self.n_rnns[1])
75 |             X = Concatenate()([word_X, char_X])
76 |             inputs = [self.word_input, self.char_input]
77 |         
78 |         
79 |         # 结构化特征
80 |         if self.config.structured in ['word', 'char', 'both']:
81 |             X = Concatenate()([X] + self.structured_input)
82 |             inputs = inputs + self.structured_input
83 |         
84 |         
85 |         # 模型结尾
86 |         X = Dropout(0.5)(X)
87 |         out = Dense(self.n_classes, activation=self.activation)(X)
88 |         
89 |         self.model = Model(inputs=inputs, outputs=out)


--------------------------------------------------------------------------------
/model/TextGRU_Attn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created:    2019-08-17 18:43:40
 4 | Author:     liuyao8
 5 | Descritipn: 
 6 | """
 7 | 
 8 | from keras.layers import BatchNormalization, Bidirectional, GRU, SpatialDropout1D, Lambda, \
 9 |                         GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate, Dropout, Dense
10 | from keras.models import Model
11 | 
12 | from model.BasicModel import BasicDeepModel
13 | from model.Layers import AttentionWeightedAverage
14 | 
15 | 
16 | class TextGRU_Attn(BasicDeepModel):
17 |     
18 |     def __init__(self, config=None, n_rnns=None, rnn_units=64, dropout_p=0.5, **kwargs):
19 |         if n_rnns is None:
20 |             self.n_rnns = (2, 2) if config.token_level == 'both' else 2
21 |         self.rnn_units = rnn_units
22 |         self.dropout_p = dropout_p
23 |         name = 'TextGRU_Attn_' + config.token_level
24 |         BasicDeepModel.__init__(self, config=config, name=name, **kwargs)
25 |         
26 |     
27 |     def model_unit(self, inputs, masking, embedding, n_rnns=None, rnn_units=None, dropout_p=None):
28 |         """模型主体Unit"""
29 |         if n_rnns is None:
30 |             n_rnns = self.n_rnns
31 |         if rnn_units is None:
32 |             rnn_units = [self.rnn_units] * n_rnns
33 |         if isinstance(rnn_units, int):
34 |             rnn_units = [rnn_units] * n_rnns
35 |         if dropout_p is None:
36 |             dropout_p = [self.dropout_p] * n_rnns
37 |         if isinstance(dropout_p, float):
38 |             dropout_p = [dropout_p] * n_rnns
39 |         
40 |         X = masking(inputs)
41 |         X = embedding(X)
42 |         X = BatchNormalization()(X)
43 |         for i in range(n_rnns):
44 |             X = Bidirectional(GRU(rnn_units[i], return_sequences=True))(X)
45 |             X = SpatialDropout1D(dropout_p[i])(X)
46 |         
47 |         maxpool = GlobalMaxPooling1D()(X)
48 |         avgpool = GlobalAveragePooling1D()(X)
49 |         last = Lambda(lambda x: x[:, -1])(X)
50 |         attn = AttentionWeightedAverage()(X)
51 |         X = Concatenate()([maxpool, avgpool, last, attn])
52 |         return X
53 |     
54 |     
55 |     def build_model(self):
56 |         # 模型主体
57 |         if self.config.token_level == 'word':
58 |             X = self.model_unit(self.word_input, self.word_masking, self.word_embedding)
59 |             inputs = [self.word_input]
60 |             
61 |         elif self.config.token_level == 'char':
62 |             X = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
63 |             inputs = [self.char_input]
64 |             
65 |         else:
66 |             word_X = self.model_unit(self.word_input, self.word_masking, self.word_embedding, self.n_rnns[0])
67 |             char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding, self.n_rnns[1])
68 |             X = Concatenate()([word_X, char_X])
69 |             inputs = [self.word_input, self.char_input]
70 |         
71 |         
72 |         # 结构化特征
73 |         if self.config.structured in ['word', 'char', 'both']:
74 |             X = Concatenate()([X] + self.structured_input)
75 |             inputs = inputs + self.structured_input
76 |         
77 |         
78 |         # 模型结尾
79 |         X = Dropout(0.5)(X)
80 |         out = Dense(self.n_classes, activation=self.activation)(X)
81 |         
82 |         self.model = Model(inputs=inputs, outputs=out)


--------------------------------------------------------------------------------
/model/TextHAN.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created:    2019-08-20 22:58:52
  4 | Author:     liuyao8
  5 | Descritipn: 
  6 | """
  7 | 
  8 | from keras.layers import Input, BatchNormalization, Bidirectional, LSTM, TimeDistributed, Dropout, Dense, GRU, Masking, Flatten
  9 | from keras.models import Model
 10 | from keras.optimizers import Adam
 11 | 
 12 | from model.BasicModel import BasicDeepModel
 13 | from model.Layers import Attention, AttentionSelf
 14 | 
 15 | 
 16 | class TextHAN(BasicDeepModel):
 17 |     
 18 |     def __init__(self, config=None, rnn_units1=128, rnn_units2=128, **kwargs):
 19 |         self.rnn_units1 = rnn_units1
 20 |         self.rnn_units2 = rnn_units2
 21 |         self.sent_maxlen = config.SENT_MAXLEN
 22 |         self.word_maxlen = config.WORD_MAXLEN
 23 |         self.sent_input = Input(shape=(self.sent_maxlen, self.word_maxlen), dtype='int32', name='sentence1')  # (, sent_maxlen, word_maxlen)
 24 |         name = 'TextHAN'
 25 |         BasicDeepModel.__init__(self, config=config, name=name, **kwargs)
 26 |         
 27 |     
 28 |     # 方法1：以下参考https://github.com/ShawnyXiao/TextClassification-Keras/blob/master/model/HAN/han.py
 29 |     # 脚本https://github.com/AlexYangLi/TextClassification/blob/master/models/keras_han_model.py与方法1其实是一样的，只是写法不同
 30 |     # 脚本https://github.com/richliao/textClassifier/blob/master/textClassifierHATT.py与方法1是一样的
 31 |     def build_model(self):
 32 |         # Sentence Part                                                       sent_input: (, sent_maxlen, word_maxlen)
 33 |         X = TimeDistributed(self.word_encoder(), name='word_encoder')(self.sent_input)  # (, sent_maxlen, 2*rnn_units1)
 34 |         X = Masking()(X)        # TODO 实验验证，加不加，影响不大。为什么？何时需要？
 35 |         X = BatchNormalization()(X)
 36 |         X = Bidirectional(LSTM(self.rnn_units2, return_sequences=True))(X)  # (, sent_maxlen, 2*rnn_units2)
 37 |         X = Attention(self.sent_maxlen)(X)                                  # (, 2*rnn_units2)
 38 |         
 39 |         X = Dropout(0.5)(X)
 40 |         out = Dense(self.n_classes, activation=self.activation)(X)          # (, n_classes)
 41 |         self.model = Model(inputs=self.sent_input, outputs=out)  # TODO 注意inputs是Sentence Part的inputs(而非Word Part)！
 42 | 
 43 | 
 44 |     def word_encoder(self):
 45 |         # Word Part 模型，提供word level的编码功能
 46 |         word_X = self.word_masking(self.word_input)                 # (, word_maxlen)
 47 |         word_X = self.word_embedding(word_X)                        # (, word_maxlen, word_embed_dim)
 48 |         word_X = BatchNormalization()(word_X)
 49 |         word_X = Bidirectional(LSTM(self.rnn_units1, return_sequences=True))(word_X) # (, word_maxlen, 2*rnn_units1)
 50 |         word_out = Attention(self.word_maxlen)(word_X)              # (, 2*rnn_units1)  # TODO 能不能使用AttentionAverageWeighted
 51 |         return Model(inputs=self.word_input, outputs=word_out)
 52 | 
 53 | 
 54 |     def train_evaluate(self, x_train, y_train, x_test, y_test, lr=1e-4, epochs=None):
 55 |         """经测试，only Step1, only Step2, Step1+Step2, 这3种训练模式效果差不多，only Step2略微好一丁点"""
 56 |         # 模型训练
 57 |         print('【' + self.name + '】')
 58 |         self.mode = 3
 59 |         epochs = epochs if epochs else (2, self.n_epochs)
 60 |         
 61 |         def model_compile_fit(lr=1e-4, epochs=3):
 62 |             self.model.compile(loss=self.loss, optimizer=Adam(lr=lr), metrics=self.metrics)
 63 |             return self.model.fit(x_train, y_train, 
 64 |                                       batch_size=self.batch_size*self.config.n_gpus,
 65 |                                       epochs=epochs,
 66 |                                       validation_split=0.3,
 67 |                                       callbacks=None)
 68 |         
 69 |         print('-------------------Step1: 前期冻结Word_Encoder层，编译和训练模型-------------------')
 70 |         self.model.get_layer('word_encoder').trainable = False      # TODO word_encoder由很多层组成，如何只设置其中的Embedding？？
 71 |         history1 = model_compile_fit(1e-4, 3)
 72 |         history1 = model_compile_fit(1e-5, 3)
 73 |         history1 = model_compile_fit(1e-6, 3)
 74 |         history1 = model_compile_fit(1e-7, 3)
 75 |         
 76 |         print('-------------Step2: 训练完参数后，解冻Word_Encoder层，再次编译和训练模型------------')
 77 |         self.model.get_layer('word_encoder').trainable = True
 78 |         history2 = model_compile_fit(1e-4, 3)
 79 |         history2 = model_compile_fit(1e-5, 3)
 80 |         history2 = model_compile_fit(1e-6, 3)
 81 |         history2 = model_compile_fit(1e-7, 3)
 82 |         self.plot_history(history2)
 83 |         history = (history1, history2)
 84 |         
 85 |         # 模型评估
 86 |         test_acc, scores, sims, vectors, test_pred = self._evaluate(x_test, y_test)
 87 |         pickle.dump(test_pred, open('./result/' + self.name + '_test_pred.pkl', 'wb'))
 88 |         return test_acc, scores, sims, vectors, history
 89 |     
 90 |     
 91 |     # 方法2：以下参考https://github.com/yongzhuo/Keras-TextClassification/blob/master/keras_textclassification/m12_HAN/graph.py
 92 |     # 方法1使用了Attention机制，而方法2使用了Self-Attention即Transformer机制！
 93 |     # TODO 输入是self.word_embedding.input？？？待研究！
 94 |     def build_model2(self):
 95 |         # Word Part
 96 |         word_X = self.word_embedding.output                         # (, word_maxlen, word_embed_dim)
 97 |         word_X = Bidirectional(GRU(units=self.rnn_units1, return_sequences=True, activation='relu'))(word_X) # (, word_maxlen, 2*rnn_units1)
 98 |         word_X = AttentionSelf(self.rnn_units*2)(word_X)            # (, word_maxlen, 2*rnn_units)
 99 |         word_X = Dropout(0.5)(word_X)
100 | 
101 |         # Sentence Part
102 |         X = Bidirectional(GRU(units=self.rnn_units2, return_sequences=True, activation='relu'))(word_X)      # (, word_maxlen, 2*rnn_units2)
103 |         X = AttentionSelf(self.word_embed_dim)(X)                   # (, word_maxlen, word_embed_dim)
104 |         X = Dropout(0.5)(X)
105 | 
106 |         X = Flatten()(X)                                            # (, word_maxlen * word_embed_dim)
107 |         out = Dense(self.n_classes, activation=self.activation)(X)  # (, n_classes)
108 |         self.model = Model(inputs=self.word_embedding.input, outputs=out)
109 |         
110 |         
111 | 
112 | if __name__ == '__main__':
113 |     
114 |     import pickle
115 |     from Vocabulary import Vocabulary
116 |     from Config import Config
117 |     config = Config()
118 |     
119 |     # data和config准备  详情请参考脚本 ModelTrain.py
120 |     config = pickle.load(open(config.config_file, 'rb'))
121 |     x_train, y_train, x_test, y_test = pickle.load(open(config.data_encoded_file, 'rb'))
122 |     
123 |     # 根据实际情况修改，也可直接在Config.py里修改，推荐前者
124 |     config.n_gpus = 1
125 |     config.token_level = 'word'
126 |     config.structured = 'none'
127 |     config.bert_flag = False
128 | 
129 |     # 模型创建与训练    
130 |     texthan = TextHAN(config)
131 |     test_acc, scores, sims, vectors, history = texthan.train_evaluate(x_train, y_train, x_test, y_test)
132 |     
133 |     texthan.model.save(config.model_file)
134 |     


--------------------------------------------------------------------------------
/model/TextLSTMGRU_Attn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created:    2019-08-17 21:42:28
 4 | Author:     liuyao8
 5 | Descritipn: 
 6 | """
 7 | 
 8 | from keras.layers import BatchNormalization, SpatialDropout1D, Bidirectional, LSTM, GRU, \
 9 |                         GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate, Dropout, Dense
10 | from keras.models import Model
11 | 
12 | from model.BasicModel import BasicDeepModel
13 | from model.Layers import AttentionWeightedAverage
14 | 
15 | 
16 | class TextLSTMGRU_Attn(BasicDeepModel):
17 |     
18 |     def __init__(self, config=None, rnn_units=64, dropout_p=0.25, with_attention=False, **kwargs):
19 |         self.rnn_units = rnn_units
20 |         self.dropout_p = dropout_p
21 |         self.with_attention = with_attention
22 |         name = 'TextLSTMGRU_Attn_' + config.token_level
23 |         BasicDeepModel.__init__(self, config=config, name=name, **kwargs)
24 |         
25 |         
26 |     def model_unit(self, inputs, masking, embedding, dropout_p=None, rnn_units=None, with_attention=None):
27 |         """模型主体Unit"""
28 |         if dropout_p is None:
29 |             dropout_p = (self.dropout_p, self.dropout_p)
30 |         if isinstance(dropout_p, float):
31 |             dropout_p = (dropout_p, dropout_p)
32 |         if rnn_units is None:
33 |             rnn_units = (self.rnn_units, self.rnn_units)
34 |         if isinstance(rnn_units, int):
35 |             rnn_units = (rnn_units, rnn_units)
36 |         if with_attention is None:
37 |             with_attention = self.with_attention
38 |             
39 |         X = masking(inputs)
40 |         X = embedding(X)
41 |         X = BatchNormalization()(X)
42 |         X = SpatialDropout1D(dropout_p[0])(X)
43 |         X = Bidirectional(LSTM(rnn_units[0], return_sequences=True))(X)
44 |         X = SpatialDropout1D(dropout_p[1])(X)
45 |         X = Bidirectional(GRU(rnn_units[1], return_sequences=True))(X)
46 |         
47 |         maxpool = GlobalMaxPooling1D()(X)
48 |         avgpool = GlobalAveragePooling1D()(X)
49 |         if with_attention:
50 |             attn = AttentionWeightedAverage()(X)
51 |             X = Concatenate()([maxpool, avgpool, attn])
52 |         else:
53 |             X = Concatenate()([maxpool, avgpool])
54 |         return X
55 |     
56 |     
57 |     def build_model(self):
58 |         # 模型主体
59 |         if self.config.token_level == 'word':
60 |             X = self.model_unit(self.word_input, self.word_masking, self.word_embedding, 0.33, 100)
61 |             inputs = [self.word_input]
62 |             
63 |         elif self.config.token_level == 'char':
64 |             X = self.model_unit(self.char_input, self.char_masking, self.char_embedding, 0.2, 100)
65 |             inputs = [self.char_input]
66 |             
67 |         else:
68 |             word_X = self.model_unit(self.word_input, self.word_masking, self.word_embedding, (0.5, 0.1), 30)
69 |             char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding, 0.2, 100)
70 |             X = Concatenate()([word_X, char_X])
71 |             inputs = [self.word_input, self.char_input]
72 |         
73 |         
74 |         # 结构化特征
75 |         if self.config.structured in ['word', 'char', 'both']:
76 |             X = Concatenate()([X] + self.structured_input)
77 |             inputs = inputs + self.structured_input
78 |         
79 |         
80 |         # 模型结尾
81 |         X = Dropout(0.5)(X)
82 |         out = Dense(self.n_classes, activation=self.activation)(X)
83 |         
84 |         self.model = Model(inputs=inputs, outputs=out)


--------------------------------------------------------------------------------
/model/TextLSTM_Attn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created:    2019-08-16 15:32:58
 4 | Author:     liuyao8
 5 | Descritipn: 
 6 | """
 7 | 
 8 | from keras.layers import BatchNormalization, Bidirectional, LSTM, Concatenate, Dropout, \
 9 |                         Flatten, Dense, Lambda
10 | from keras.models import Model
11 | from keras import backend as K
12 | 
13 | from model.BasicModel import BasicDeepModel
14 | from model.Layers import AttentionWeightedAverage
15 | 
16 | 
17 | class TextLSTM_Attn(BasicDeepModel):
18 |     """TextLSTM模型，支持char, word和both，支持Attention"""
19 |     
20 |     def __init__(self, config=None, n_rnns=None, rnn_units=64, with_sth='mean', **kwargs):
21 |         if n_rnns is None:
22 |             self.n_rnns = (1, 1) if config.token_level == 'both' else 1
23 |         self.rnn_units = rnn_units
24 |         assert with_sth in ('mean', 'flatten', 'attention')
25 |         self.with_sth = with_sth
26 |         name = 'TextLSTM_Attn_' + with_sth + '_' + config.token_level
27 |         BasicDeepModel.__init__(self, config=config, name=name, **kwargs)
28 |         
29 | 
30 |     def model_unit(self, inputs, masking, embedding, n_rnns=None, rnn_units=None, with_sth=None):
31 |         """模型主体Unit"""
32 |         if n_rnns is None:
33 |             n_rnns = self.n_rnns
34 |         if rnn_units is None:
35 |             rnn_units = [self.rnn_units] * n_rnns
36 |         if isinstance(rnn_units, int):
37 |             rnn_units = [rnn_units] * n_rnns
38 |         if with_sth is None:
39 |             with_sth = self.with_sth
40 |         
41 |         X = masking(inputs)
42 |         X = embedding(X)
43 |         X = BatchNormalization()(X)
44 |         for i in range(n_rnns):
45 |             X = Bidirectional(LSTM(rnn_units[i], return_sequences=True))(X)  # TODO LSTM VS CuDNNLSTM   128需要动态变化？
46 |             X = Dropout(0.5)(X)     # TODO TextAttention此处为SpatialDropout1D？？？
47 |             
48 |         # X's shape = (None, word_maxlen, 2*rnn_units)  # TODO shape要变成2维的，才能输入到输出层！！！
49 |         if with_sth == 'mean':
50 |             X = Lambda(lambda x: K.mean(x, axis=1))(X)  # (None, 2*rnn_units)  # TODO 不能写成 X=K.mean(X,axis=1)，会报错！
51 |         elif with_sth == 'flatten':
52 |             X = Flatten()(X)                            # (None, word_maxlen*2*rnn_units)
53 |         elif with_sth == 'attention':
54 |             X = AttentionWeightedAverage()(X)           # (None, 2*rnn_units)
55 |         return X
56 |         
57 | 
58 |     def build_model(self):
59 |         # 模型主体
60 |         if self.config.token_level == 'word':
61 |             X = self.model_unit(self.word_input, self.word_masking, self.word_embedding)
62 |             inputs = [self.word_input]
63 |             
64 |         elif self.config.token_level == 'char':
65 |             X = self.model_unit(self.char_input, self.char_masking, self.char_embedding)
66 |             inputs = [self.char_input]
67 |             
68 |         else:
69 |             word_X = self.model_unit(self.word_input, self.word_masking, self.word_embedding, self.n_rnns[0])
70 |             char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding, self.n_rnns[1])
71 |             X = Concatenate()([word_X, char_X])
72 |             inputs = [self.word_input, self.char_input]
73 |         
74 |         
75 |         # 结构化特征
76 |         if self.config.structured in ['word', 'char', 'both']:
77 |             X = Concatenate()([X] + self.structured_input)
78 |             inputs = inputs + self.structured_input
79 |         
80 |         
81 |         # 模型结尾
82 |         X = Dropout(0.5)(X)
83 |         out = Dense(self.n_classes, activation=self.activation)(X)
84 |         
85 |         self.model = Model(inputs=inputs, outputs=out)


--------------------------------------------------------------------------------
/model/TextRCNN_Attn.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created:    2019-08-18 14:52:38
  4 | Author:     liuyao8
  5 | Descritipn: 
  6 | """
  7 | 
  8 | from keras.layers import Input, BatchNormalization, Bidirectional, GRU, Dropout, Lambda, \
  9 |                         Concatenate, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, Dense
 10 | from keras import backend as K
 11 | from keras.models import Model
 12 | 
 13 | from model.BasicModel import BasicDeepModel
 14 | from model.Layers import AttentionWeightedAverage
 15 | 
 16 | 
 17 | class TextRCNN_Attn(BasicDeepModel):
 18 |     """简易版TextRCNN"""
 19 |     
 20 |     def __init__(self, config=None, rnn_units=64, n_filters=64, **kwargs):
 21 |         self.rnn_units = rnn_units
 22 |         self.n_filters = n_filters
 23 |         name = 'TextRCNN_Attn_' + config.token_level
 24 |         BasicDeepModel.__init__(self, config=config, name=name, **kwargs)
 25 |         
 26 |         
 27 |     def model_unit(self, inputs, left_inputs, right_inputs, masking, embedding, rnn_units=None, n_filters=None):
 28 |         """模型主体Unit"""
 29 |         if rnn_units is None:
 30 |             rnn_units = [self.rnn_units] * 3
 31 |         if isinstance(rnn_units, int):
 32 |             rnn_units = [rnn_units] * 3
 33 |         if n_filters is None:
 34 |             n_filters = self.n_filters
 35 |         
 36 |         X = masking(inputs)
 37 |         X = embedding(X)
 38 |         X = BatchNormalization()(X)
 39 |         X = Bidirectional(GRU(rnn_units[0], return_sequences=True))(X)
 40 |         
 41 |         left_X = masking(left_inputs)
 42 |         left_X = embedding(left_X)
 43 |         left_X = BatchNormalization()(left_X)
 44 |         left_X = Bidirectional(GRU(rnn_units[1], return_sequences=True))(left_X)
 45 |         
 46 |         right_X = masking(right_inputs)
 47 |         right_X = embedding(right_X)
 48 |         right_X = BatchNormalization()(right_X)
 49 |         right_X = Dropout(0.5)(right_X)
 50 |         right_X = Bidirectional(GRU(rnn_units[2], return_sequences=True, go_backwards=True))(right_X)
 51 |         right_X = Lambda(lambda x: K.reverse(x, axes=1))(right_X)
 52 |         
 53 |         concat = Concatenate()([X, left_X, right_X])
 54 |         concat = Conv1D(n_filters, kernel_size=1, activation='relu')(concat)
 55 |         
 56 |         # TODO 为什么没有left_x与x交互的操作！！？？right_x与x同理！！？？
 57 |         # 比如上一个left与上一个word共同生成当前left？？？(详见论文中的公式1和2！！！)
 58 |         # 另外，与论文相比或与别的实现相比，下面这些是多余的，应该直接到output=Dense那里 ？？？
 59 |         maxpool = GlobalMaxPooling1D()(concat)
 60 |         avgpool = GlobalAveragePooling1D()(concat)
 61 |         attn = AttentionWeightedAverage()(concat)
 62 |         X = Concatenate()([maxpool, avgpool, attn])
 63 |         return X
 64 |         
 65 |         
 66 |     def build_model(self):
 67 |         # 额外的Input
 68 |         self.word_left_inputs = Input(shape=(self.word_maxlen, ), name='word_left')
 69 |         self.word_right_inputs = Input(shape=(self.word_maxlen, ), name='word_right')
 70 |         self.char_left_inputs = Input(shape=(self.char_maxlen, ), name='char_left')
 71 |         self.char_right_inputs = Input(shape=(self.char_maxlen, ), name='char_right')
 72 |         
 73 |         # 模型主体
 74 |         if self.config.token_level == 'word':
 75 |             X = self.model_unit(self.word_input, self.word_left_inputs, self.word_right_inputs, self.word_masking, self.word_embedding)
 76 |             inputs = [self.word_input, self.word_left_inputs, self.word_right_inputs]
 77 |             
 78 |         elif self.config.token_level == 'char':
 79 |             X = self.model_unit(self.char_input, self.char_left_inputs, self.char_right_inputs, self.char_masking, self.char_embedding)
 80 |             inputs = [self.char_input, self.char_left_inputs, self.char_right_inputs]
 81 |             
 82 |         else:
 83 |             word_X = self.model_unit(self.word_input, self.word_left_inputs, self.word_right_inputs, self.word_masking, self.word_embedding)
 84 |             char_X = self.model_unit(self.char_input, self.char_left_inputs, self.char_right_inputs, self.char_masking, self.char_embedding)
 85 |             X = Concatenate()([word_X, char_X])
 86 |             inputs = [self.word_input, self.word_left_inputs, self.word_right_inputs, \
 87 |                       self.char_input, self.char_left_inputs, self.char_right_inputs]
 88 |         
 89 |         
 90 |         # 结构化特征
 91 |         if self.config.structured in ['word', 'char', 'both']:
 92 |             X = Concatenate()([X] + self.structured_input)
 93 |             inputs = inputs + self.structured_input
 94 |         
 95 |         
 96 |         # 模型结尾
 97 |         X = Dropout(0.5)(X)
 98 |         out = Dense(self.n_classes, activation=self.activation)(X)
 99 |         
100 |         self.model = Model(inputs=inputs, outputs=out)


--------------------------------------------------------------------------------