├── .gitignore ├── Config.py ├── DataAugmentation.py ├── DataPreprocessing.py ├── Embedding.py ├── FeatureStructured.py ├── ModelApply.py ├── ModelTrain.py ├── README.md ├── TokenSelection.py ├── Vocabulary.py ├── image ├── 1573355016134.png ├── 1573364046216.png ├── 1573366328001.png └── 1573368628525.png └── model ├── BasicModel.py ├── Bert ├── __init__.py ├── args.py ├── extract_feature.py ├── modeling.py ├── optimization.py └── tokenization.py ├── Layers.py ├── TextBertCNN.py ├── TextBertGRU.py ├── TextCNN.py ├── TextCNN_BiGRU.py ├── TextCapsule.py ├── TextConvLSTM2_Attn.py ├── TextConvLSTM_Attn.py ├── TextDPCNN.py ├── TextGRU2_Attn.py ├── TextGRU_Attn.py ├── TextHAN.py ├── TextLSTMGRU_Attn.py ├── TextLSTM_Attn.py └── TextRCNN_Attn.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Liuyaox 2 | data/ 3 | doc/ 4 | local/ 5 | aspect_extraction.py 6 | .idea/ 7 | 8 | 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # celery beat schedule file 88 | celerybeat-schedule 89 | 90 | # SageMath parsed files 91 | *.sage.py 92 | 93 | # Environments 94 | .env 95 | .venv 96 | env/ 97 | venv/ 98 | ENV/ 99 | env.bak/ 100 | venv.bak/ 101 | 102 | # Spyder project settings 103 | .spyderproject 104 | .spyproject 105 | 106 | # Rope project settings 107 | .ropeproject 108 | 109 | # mkdocs documentation 110 | /site 111 | 112 | # mypy 113 | .mypy_cache/ 114 | -------------------------------------------------------------------------------- /Config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-06 20:00:46 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | import argparse 9 | 10 | 11 | class Config(object): 12 | 13 | def __init__(self): 14 | 15 | # 任务相关 16 | self.task = 'multilabel' 17 | self.token_level = 'word' # word: word粒度 char: char粒度 both: word+char粒度 18 | self.N_CLASSES = 11 # 标签/类别数量 19 | 20 | 21 | # Embedding 22 | self.MIN_COUNT = 2 # 训练Embedding,创建Vocabulary时要求的低频下限 23 | self.PUBLIC_EMBED_DIM = 200 # 公开训练好的Embedding向量维度 24 | self.WORD_EMBED_DIM = 100 25 | self.CHAR_EMBED_DIM = 100 26 | self.model_word2vec_file = './local/model_word2vec.w2v' # 训练好的Word Embedding 27 | self.model_char2vec_file = './local/model_char2vec.w2v' # 训练好的Char 
Embedding 28 | 29 | 30 | # Vocabulary 31 | self.PAD_IDX = 0 # PAD约定取0,不要改变,以下UNK,SOS,EOS可以改变 32 | self.UNK_IDX = 1 # unknow word # TODO 原本是没有UNK的? 33 | self.SOS_IDX = 2 # Start of sentence 34 | self.EOS_IDX = 3 # End of sentence 35 | self.vocab_file = './local/vocab.pkl' # 词汇表,包含word/char,idx,vector三者之间映射字典,Embedding Layer初始化权重 36 | 37 | 38 | # 结构化特征 39 | # TODO structured改成模型定义时参数! 40 | self.structured = 'word' # word: word粒度 char: char粒度 both: word+char粒度 none: 无 41 | self.word_svd_n_componets = 100 42 | self.char_svd_n_componets = 150 43 | self.word_tfidf_lsa_file = './local/word_tfidf_lsa.pkl' 44 | self.char_tfidf_lsa_file = './local/char_tfidf_lsa.pkl' 45 | 46 | 47 | # Bert相关 48 | self.bert_flag = False 49 | self.bert_maxlen = 100 50 | self.bert_dim = 768 51 | self.bert_model_path = '/home/liuyao58/data/BERT/chinese_L-12_H-768_A-12/' 52 | self.bert_graph_tmpfile = './tmp_graph_xxx' 53 | self.data_bert_file = './local/bert_data.pkl' 54 | 55 | 56 | # 特征选择 57 | self.words_chi2_file = '' # 基于卡方统计量筛选后的word 58 | self.chars_chi2_file = '' # 基于卡方统计量筛选后的char 59 | 60 | 61 | # 数据预处理和编码 62 | self.data_file = './data/sku_qa_data_30000.csv' # 处理好的标注数据,尚未编码 63 | self.data_encoded_file = './local/data_30000_encoded.pkl' # 向量化编码后的训练数据 64 | self.WORD_MAXLEN = 100 # 57 65 | self.CHAR_MAXLEN = 200 # 126 66 | self.SENT_MAXLEN = 50 # 18 67 | 68 | 69 | # 训练 70 | self.n_gpus = 1 71 | self.BATCH_SIZE = 32 72 | self.n_folds = 5 73 | self.n_epochs = 10 74 | self.model_file = './local/model.h5' 75 | 76 | 77 | # 其他文件和路径 78 | self.annotation_file = './data/商品问答_手机_已标注_30000.xlsx' # 原始的标注数据 79 | self.stopwords_files = ['./data/京东商城商品评论-Stopwords.txt', 80 | './data/京东商城商品评论-Stopwords-other_github.txt'] # 公开停用词 81 | self.cleaned_all_stopwords_file = './data/cleaned_all_stopwords.txt' # 合并处理好的公开停用词 82 | self.config_file = './local/config.pkl' # config文件 83 | 84 | 85 | 86 | def get_args(): 87 | """待完善……""" 88 | parser = argparse.ArgumentParser() 89 | 90 | parser.add_argument('--server', default=None, type=int, help='[6099]') 91 | parser.add_argument('--phase', default=None, help='[Train/Test]') 92 | parser.add_argument('--sen_len', default=None, type=int, help='sentence length') 93 | 94 | parser.add_argument('--net_name', default=None, help='[lstm]') 95 | parser.add_argument('--dir_date', default=None, help='Name it with date, such as 20180102') 96 | parser.add_argument('--batch_size', default=32, type=int, help='Batch size') 97 | parser.add_argument('--lr_base', default=1e-3, type=float, help='Base learning rate') 98 | parser.add_argument('--lr_decay_rate', default=0.1, type=float, help='Decay rate of lr') 99 | parser.add_argument('--epoch_lr_decay', default=1000, type=int, help='Every # epoch, lr decay lr_decay_rate') 100 | 101 | parser.add_argument('--layer_num', default=2, type=int, help='Lstm layer number') 102 | parser.add_argument('--hidden_size', default=64, type=int, help='Lstm hidden units') 103 | parser.add_argument('--gpu', default='0', help='GPU id list') 104 | parser.add_argument('--workers', default=4, type=int, help='Workers number') 105 | 106 | return parser.parse_args() 107 | 108 | 109 | 110 | if __name__ == '__main__': 111 | 112 | args = get_args() 113 | gpu = args.gpu 114 | -------------------------------------------------------------------------------- /DataAugmentation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-23 16:25:27 4 | Author: liuyao8 5 | Descritipn: 样本处理:数据增强 6 | """ 7 | 8 | import random 9 
| 10 | 11 | # 1. 数据增强 12 | 13 | def data_enhance_for_text(texts, categories, mode='limit'): 14 | """ 15 | 数据增强,打乱老样本序列顺序以生成新样本 16 | ARGS 17 | texts: iterable, 每个元素是一个token列表, token既可以是token也可以是token id 18 | categories: iterable, 每个元素是一个类别id,与texts各元素一一对应 19 | mode: 数据增强模式 20 | limit=基于各类别样本数量,为数量少的类别增加新样本,使各类别样本数达到 min(原样本数*2, 最大类别样本数) 21 | double=所有类别的样本都翻倍,不管各类别原样本数量是多少 22 | RETURN 23 | dic2: 字典,key为cate,value为该cate对应的数据增强后的样本列表 24 | """ 25 | assert mode in ('limit', 'double') 26 | 27 | # 构建类别样本字典: <类别, (样本数, 样本列表)> 28 | dic1 = {} 29 | for text, cate in zip(texts, categories): 30 | if cate not in dic1: 31 | dic1[cate] = (1, [text, ]) 32 | else: 33 | dic1[cate][0] += 1 34 | dic1[cate][1].append(text) 35 | num_max = max([val[0] for val in dic1.values()]) # 最大类别样本数 36 | 37 | # 数据增强 38 | dic2 = {} 39 | for cate, (num, texts) in dic1.items(): 40 | if mode == 'limit': 41 | num_extra = min(num, num_max - num) # 数据增强后样本数为 min(原样本数*2, 最大类别样本数) 42 | texts_extra = random.sample(texts, num_extra) # 从原样本中随机挑选若干样本用于生成新样本 43 | else: 44 | texts_extra = texts.copy() 45 | for text in texts_extra: 46 | random.shuffle(text) # 打乱原序列顺序 47 | texts.append(text) 48 | dic2[cate] = texts 49 | return dic2 50 | 51 | 52 | 53 | if __name__ == '__main__': 54 | # 项目暂未使用 55 | pass -------------------------------------------------------------------------------- /DataPreprocessing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-07-03 20:53:31 4 | Author: liuyao8 5 | Descritipn: 数据预处理,包括两大块 6 | a. 待标注数据生成:基于最原始数据,生成便于标注的数据格式 7 | b. 已标注数据处理:已标注数据规整、去除停用词、分词、Label规整等 8 | 以确保在训练Word Embedding、创建Vocabulary等时不需复杂或耗时的额外处理,可直接使用! 9 | """ 10 | 11 | import pandas as pd 12 | import jieba 13 | import pickle 14 | from functools import reduce 15 | import re 16 | 17 | from Config import Config 18 | config = Config() 19 | 20 | 21 | # 1. 原始数据 --> 待标注数据 22 | question_path = './data/cellphone_questions.txt' 23 | colnames = ['question_raw', 'spu', 'follows'] 24 | data = pd.read_csv(question_path, sep='\t', header=None, names=colnames, encoding='utf8') 25 | data2 = data.sample(frac=0.86, random_state=4321) # 30181 26 | data2.to_excel('./data/question_cellphone_20190715_30000.xlsx', header=True, index=False, encoding='utf8') 27 | 28 | 29 | # 2. 已标注数据 --> 训练数据 30 | # 读取数据 31 | cols_dic = {'序号': 'no', '性能&系统': 'system', '功能': 'function', '电池': 'battery', '外观': 'appearance', 32 | '电话&网络': 'network', '拍照': 'photo', '附件赠品': 'accessory', '购买相关': 'purchase', 33 | '品控': 'quality', '配置&硬件': 'hardware', '比较': 'contrast', '标注人': 'annotator'} 34 | annotation = pd.read_excel(config.annotation_file, header=1, encoding='utf8').fillna(0).rename(columns=cols_dic) 35 | annotation['question_raw'] = annotation['question_raw'].map(lambda x: ' '.join(x.split())) # 多个空格变1个 36 | 37 | 38 | # 停用词 39 | stopwords = [open(x, 'r', encoding='utf8').readlines() for x in config.stopwords_files] 40 | stopwords = list(set([x.strip() for x in reduce(lambda x, y: x + y, stopwords)])) # TODO 重要!加strip,可能会删除一个空格停用词,下面会手动添加 41 | stopwords = stopwords + ['', ' '] # TODO 非常重要!手动在停用词表中添加空字符串和空格!!! 
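# Illustrative sanity check (added sketch, not part of the original pipeline): strip()
# collapses a whitespace-only stopword entry such as ' \n' into '', and the set() above
# deduplicates it away, which is why '' and ' ' must be appended manually.
_demo_stopwords = list(set(x.strip() for x in ['的\n', ' \n', '了\n'])) + ['', ' ']
assert '' in _demo_stopwords and ' ' in _demo_stopwords and '的' in _demo_stopwords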
42 | pickle.dump(stopwords, open(config.cleaned_all_stopwords_file, 'wb')) 43 | # TODO char-level的停用词应该与word的不一样!比如:'一'单独出现在word-level分词中说明没别的字可跟它组成词,它就是停用词,但出现在char-level中并不一定 44 | 45 | 46 | # 分词 47 | # 支持3种level: word, char, sentence(用于TextHAN) 48 | get_wordsegs = lambda x: ' '.join([seg for seg in jieba.cut(x, cut_all=False) if seg not in stopwords]) # TODO 优化点:试试cut_all=True 49 | get_charsegs = lambda x: ' '.join([seg for seg in x.replace(' ', '') if seg not in stopwords]) # char-level也要删除停用词 50 | # TODO 重要!使用sklearn.pipeline把get_wordsegs和get_charsegs保存进pipeline!!!包括其中的stopwords!!! 51 | # 句子间Tokenization + 句子内分词 52 | # 启发于:# 参考:https://github.com/richliao/textClassifier/blob/master/textClassifierHATT.py 53 | p1 = re.compile(r'[。!?!?]+') # 删除英文句号,因为很多是数字小数点 54 | p2 = re.compile(r'[。!?,:!?,:]+') # 同上 55 | get_sentsegs1 = lambda x: '&'.join([y for y in [ 56 | ' '.join([seg for seg in jieba.cut(sent.strip(), cut_all=False) if seg not in stopwords]) 57 | for sent in p1.split(x) if len(sent) >= 1 58 | ] if len(y) >= 1]).strip('&').strip() 59 | get_sentsegs2 = lambda x: '&'.join([y for y in [ 60 | ' '.join([seg for seg in jieba.cut(sent.strip(), cut_all=False) if seg not in stopwords]) 61 | for sent in p2.split(x) if len(sent) >= 1 62 | ] if len(y) >= 1]).strip('&').strip() 63 | 64 | # question_raw取值示例:'抢了个免息券,想入手XR,我只有一个联通卡,单卡信号怎么样' 65 | annotation['question_wordseg'] = annotation['question_raw'].map(get_wordsegs) 66 | annotation['question_charseg'] = annotation['question_raw'].map(get_charsegs) 67 | annotation['question_sentseg1'] = annotation['question_raw'].map(get_sentsegs1) 68 | annotation['question_sentseg2'] = annotation['question_raw'].map(get_sentsegs2) 69 | 70 | 71 | # Label 72 | cols_y = ['system', 'function', 'battery', 'appearance', 'network', 'photo', 'accessory', 'purchase', 'quality', 'hardware', 'contrast'] 73 | annotation['labels'] = annotation.apply(lambda se: se[cols_y][se[cols_y]==1].index.tolist(), axis=1) 74 | annotation['labels'] = annotation['labels'].apply(lambda x: '&&' if len(x) == 0 else '&&'.join(x)) 75 | 76 | 77 | # 保存本地 78 | annotation.to_csv(config.data_file, sep='\t', index=False, encoding='utf8') 79 | -------------------------------------------------------------------------------- /Embedding.py: -------------------------------------------------------------------------------- 1 | from model.TextLSTM import TextLSTM# -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-06 21:24:39 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | import numpy as np 9 | from gensim.models import Word2Vec 10 | 11 | 12 | class CorpusGenerator(object): 13 | """ 14 | 使用 gensim 生成 Word2Vec 所需的语料 Generator,由文件直接生成,支持 word-level 和 char-level 15 | NOTES 16 | 文件每行必须事先完成分词或分字:每行是分隔的词或字的字符串,形如:'颜色 很 漂亮' 或 '颜 色 很 漂 亮' 17 | """ 18 | def __init__(self, corpus_file, stopwords=[], sep=' '): 19 | self.corpus_file = corpus_file 20 | self.stopwords = stopwords 21 | self.sep = sep 22 | 23 | def __iter__(self): 24 | for line in open(self.corpus_file): 25 | # 输出结果:每个元素形如['颜色', '很', '漂亮'] 或 ['颜', '色', '很', '漂', '亮'],过滤指定词或字(如停用词等) 26 | yield [x for x in line.strip().split(self.sep) if x not in self.stopwords] 27 | 28 | 29 | def train_w2v_model(sentences, size=100, min_count=3, window=5, sg=1, workers=8, iter=8, compute_loss=True): 30 | """ 31 | 训练 Word2Vec 字/词向量 32 | ARGS 33 | sentences: iterable of sentence, 其中sentence是分字/分词列表,形如:['颜色', '很', '漂亮'] 或 ['颜', '色', '很', '漂', '亮'] 34 | 其他:与Word2Vec函数参数保持一致,sg=1表示使用skip-gram算法 35 | RETURN 36 | model: 训练好的Word2Vec模型,包含(idx, token, 
vector)三者之间的4种映射字典:idx2token, idx2vector, token2idx, token2vector(即model.wv) 37 | """ 38 | model = Word2Vec(sentences, size=size, min_count=min_count, window=window, sg=sg, workers=workers, iter=iter, compute_loss=compute_loss) 39 | model.idx2token = {} 40 | model.token2idx = {} 41 | model.idx2vector = {} 42 | for token in model.wv.vocab.keys(): 43 | idx = model.wv.vocab[token].index # token对应的idx 44 | model.idx2token[idx] = token 45 | model.token2idx[token] = idx 46 | model.idx2vector[idx] = model[token] # 可直接使用model[token],当然也可model.wv[token] 47 | return model 48 | 49 | 50 | def pretrained_embedding(embedding_file, seps=('\t', ','), header=False): 51 | """Public Pretrained Embedding File --> Original Full Embedding""" 52 | embedding = {} 53 | with open(embedding_file, 'r', encoding='utf-8') as fr: 54 | if header: 55 | fr.readline() # Drop line 1 56 | for line in fr: 57 | values = line.strip().split(seps[0]) 58 | if len(values) >= 2: 59 | token = values[0] 60 | vector = values[1:] if seps[0] == seps[1] else values[1].split(seps[1]) 61 | embedding[token] = np.asarray(vector, dtype='float32') 62 | return embedding 63 | 64 | 65 | 66 | def example(): 67 | """训练Word2Vec向量,并保存本地""" 68 | import pandas as pd 69 | from Config import Config 70 | config = Config() 71 | 72 | data = pd.read_csv(config.data_file, sep='\t', encoding='utf8') 73 | sentences_word = data['question_wordseg'].map(lambda x: str(x).strip().split(' ')) 74 | sentences_char = data['question_charseg'].map(lambda x: str(x).strip().split(' ')) 75 | 76 | model_word2vec = train_w2v_model(sentences_word, size=config.WORD_EMBED_DIM, min_count=config.MIN_COUNT) 77 | model_char2vec = train_w2v_model(sentences_char, size=config.CHAR_EMBED_DIM, min_count=config.MIN_COUNT, window=10, iter=15) 78 | print(len(model_word2vec.wv.vocab)) # 5484 79 | print(len(model_char2vec.wv.vocab)) # 1595 80 | 81 | model_word2vec.save(config.model_word2vec_file) 82 | model_char2vec.save(config.model_char2vec_file) 83 | 84 | 85 | 86 | if __name__ == '__main__': 87 | 88 | example() 89 | -------------------------------------------------------------------------------- /FeatureStructured.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-07 20:59:13 4 | Author: liuyao8 5 | Descritipn: 结构化特征如TFIDF, LSA, LSI, LDA等 6 | """ 7 | 8 | from sklearn.feature_extraction.text import TfidfVectorizer 9 | from sklearn.decomposition import TruncatedSVD 10 | from sklearn.pipeline import make_pipeline 11 | 12 | 13 | class FeatureStructured(object): 14 | 15 | def __init__(self): 16 | pass 17 | 18 | 19 | # 1. TFIDF特征 20 | @classmethod 21 | def tfidf_vectorizer(cls, data, ngram_range=(1, 1), vocabulary=None, stopwords=None, max_features=None): 22 | """训练TFIDF模型,并生成TFIDF特征""" 23 | # model_tfidf.vocabulary_是训练后的字典,是features max_features=len(vocabulary_) 24 | model_tfidf = TfidfVectorizer(ngram_range=ngram_range, vocabulary=vocabulary, stop_words=stopwords, 25 | sublinear_tf=True, max_features=max_features) 26 | data_tfidf = model_tfidf.fit_transform(data) # .toarray() (9, max_features) 27 | return model_tfidf, data_tfidf 28 | 29 | 30 | # 2. LSA特征 31 | # LSA转换 = TFIDF转换 + SVD转换 32 | # In particular, truncated SVD works on term count/tf-idf matrices as returned by the vectorizers 33 | # in sklearn.feature_extraction.text. In that context, it is known as latent semantic analysis (LSA). 
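    # Minimal sketch of the idea above (hypothetical toy data, kept as comments so the
    # class definition is unchanged); the lsa_vectorizer method below wraps exactly this
    # TfidfVectorizer + TruncatedSVD pipeline:
    #
    #     tfidf = TfidfVectorizer(sublinear_tf=True)
    #     svd = TruncatedSVD(n_components=2, random_state=2019)
    #     lsa = make_pipeline(tfidf, svd)
    #     X_lsa = lsa.fit_transform(['小猫咪 爱 吃肉', '我 有 一只 小猫咪', '小猫咪 爱 学习'])
    #     # X_lsa.shape == (3, 2): sparse TF-IDF vectors reduced to dense 2-d LSA vectors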
34 | 35 | # TODO **kawgs 实现 36 | @classmethod 37 | def lsa_vectorizer(cls, data, ngram_range=(1, 1), vocabulary=None, stopwords=None, 38 | max_features=None, n_components=2, n_iter=5): 39 | """ 40 | 训练LSA模型,并生成LSA特征 41 | ARGS 42 | data: iterable of sentence, sentence是空格分隔的分字/分词字符串 43 | 形如 ['小猫咪 爱 吃肉', '我 有 一只 小猫咪', ...] 假设shape为(9, ) (即9个sentence) 44 | 其他:参数及其默认值与 TfidfVectorizer 和 TruncatedSVD 保持一致 45 | USAGE 46 | 训练时,data既可以只是train,也可以是train+val+test,应用时分别应用于train/val/test 47 | """ 48 | model_tfidf = TfidfVectorizer(ngram_range=ngram_range, vocabulary=vocabulary, stop_words=stopwords, 49 | sublinear_tf=True, max_features=max_features) # (9, ) -> (9, max_features) 50 | model_svd = TruncatedSVD(n_components=n_components, n_iter=n_iter, random_state=2019) # -> (9, n_components) 51 | model_lsa = make_pipeline(model_tfidf, model_svd) 52 | data_lsa = model_lsa.fit_transform(data) 53 | return model_lsa, data_lsa 54 | 55 | 56 | @classmethod 57 | def lsa_vectorizer_2steps(cls, data, ngram_range=(1, 1), vocabulary=None, stopwords=None, 58 | max_features=None, n_components=2, n_iter=5): 59 | """功能同lsa_vectorizer, 可返回训练好的TFIDF和SVD模型,假设 data 维度为(9, )""" 60 | # TFIDF 转换 (9, max_features) 61 | model_tfidf, data_tfidf = cls.tfidf_vectorizer(data, ngram_range=ngram_range, vocabulary=vocabulary, 62 | stopwords=stopwords, max_features=max_features) 63 | # SVD 转换 64 | model_svd = TruncatedSVD(n_components=n_components, n_iter=n_iter, random_state=2018) 65 | data_lsa = model_svd.fit_transform(data_tfidf) # (9, n_components) max_features维稀疏向量 -> n_components维稠密向量 66 | return model_tfidf, data_tfidf, model_svd, data_lsa 67 | 68 | 69 | 70 | # 3. LSI特征 71 | 72 | 73 | 74 | 75 | # 4. LDA特征 76 | 77 | 78 | 79 | 80 | # 5. Others 81 | 82 | 83 | 84 | 85 | def example_lsa(): 86 | """生成TFIDF特征、LSA特征""" 87 | import pandas as pd 88 | import pickle 89 | from Config import Config 90 | config = Config() 91 | 92 | data = pd.read_csv(config.data_file, sep='\t', encoding='utf8') 93 | sentences_word, sentences_char = data['question_wordseg'].fillna(''), data['question_charseg'].fillna('') 94 | 95 | vocab = pickle.load(open(config.vocab_file, 'rb')) # 在main中运行的话,必须 from Vocabulary import Vocabulary 96 | 97 | word_model_tfidf, word_tfidf, word_model_svd, word_lsa = FeatureStructured.lsa_vectorizer_2steps( 98 | sentences_word, vocabulary=vocab.word2idx, n_components=config.word_svd_n_componets) # 指定vocabulary,保证全局一致性 99 | char_model_tfidf, char_tfidf, char_model_svd, char_lsa = FeatureStructured.lsa_vectorizer_2steps( 100 | sentences_char, vocabulary=vocab.char2idx, n_components=config.char_svd_n_componets) 101 | 102 | # TODO char粒度的特征计算好像有点问题! 
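    # Hedged note on the TODO above: one plausible cause is TfidfVectorizer's default
    # token_pattern r"(?u)\b\w\w+\b", which silently drops tokens shorter than two
    # characters, i.e. every char-level token. If that is indeed the issue, passing
    # token_pattern=r"(?u)\b\w+\b" (or analyzer=str.split) through to TfidfVectorizer
    # in FeatureStructured would keep single characters.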
103 | pickle.dump((word_model_tfidf, word_tfidf, word_model_svd, word_lsa), open(config.word_tfidf_lsa_file, 'wb')) 104 | pickle.dump((char_model_tfidf, char_tfidf, char_model_svd, char_lsa), open(config.char_tfidf_lsa_file, 'wb')) 105 | 106 | 107 | 108 | if __name__ == '__main__': 109 | 110 | example_lsa() 111 | -------------------------------------------------------------------------------- /ModelApply.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-23 15:20:07 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | import pickle 9 | from ModelTrain import get_encoding_func, get_sides_encoding_func 10 | from Vocabulary import Vocabulary 11 | from Config import Config 12 | config = Config() 13 | 14 | 15 | # 加载config 16 | config = pickle.load(open(config.config_file, 'rb')) 17 | 18 | 19 | # 应用数据处理 20 | 21 | 22 | # 模型应用 23 | -------------------------------------------------------------------------------- /ModelTrain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-07-31 14:51:45 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from numpy import array, zeros 9 | from pandas import read_csv 10 | import pickle 11 | from scipy.sparse import csr_matrix 12 | from sklearn.preprocessing import MultiLabelBinarizer 13 | from sklearn.model_selection import train_test_split 14 | 15 | from Vocabulary import seq_to_idxs 16 | 17 | 18 | def get_encoding_func(vocab, config): 19 | """创建工具:用于生成word和char粒度的数据编码""" 20 | word_encoding = lambda x: seq_to_idxs(str(x).split(), vocab.word2idx, config.WORD_MAXLEN, config.UNK_IDX, config.PAD_IDX) 21 | char_encoding = lambda x: seq_to_idxs(str(x).split(), vocab.char2idx, config.CHAR_MAXLEN, config.UNK_IDX, config.PAD_IDX) 22 | return word_encoding, char_encoding 23 | 24 | 25 | def get_sides(x, maxlen): 26 | """生成left和right原始数据(未编码) for TextRCNN 注意:只截断不补零""" 27 | xs = str(x).split()[: maxlen] # 截断 28 | x_left = ' '.join(['UNK'] + xs[:-1]) 29 | x_right = ' '.join(xs[1:] + ['UNK']) 30 | return x_left, x_right 31 | 32 | 33 | def get_sides_encoding_func(vocab, config): 34 | """创建工具:用于生成left和right原始数据并编码 for TextRCNN""" 35 | word_encoding, char_encoding = get_encoding_func(vocab, config) 36 | word_left_encoding = lambda x: word_encoding(get_sides(x, config.WORD_MAXLEN)[0]) 37 | word_right_encoding = lambda x: word_encoding(get_sides(x, config.WORD_MAXLEN)[1]) 38 | char_left_encoding = lambda x: char_encoding(get_sides(x, config.CHAR_MAXLEN)[0]) 39 | char_right_encoding = lambda x: char_encoding(get_sides(x, config.CHAR_MAXLEN)[1]) 40 | return word_left_encoding, word_right_encoding, char_left_encoding, char_right_encoding 41 | 42 | 43 | def get_bert_model(config): 44 | """创建预训练Bert模型:用于对raw文本编码,raw文本不需分词""" 45 | from model.Bert.extract_feature import BertVector 46 | bert_model = BertVector(pooling_strategy='NONE', 47 | max_seq_len=config.bert_maxlen, 48 | bert_model_path=config.bert_model_path, 49 | graph_tmpfile=config.bert_graph_tmpfile) 50 | return bert_model 51 | 52 | 53 | def sent_array(x_sent_raw, config, word_encoding): 54 | """ 55 | 向量化编码:Sentence粒度, for TextHAN 56 | 编码后document形如下行:其中--表示sentence,|表示其向量结束,WORD_MAXLEN=10, SENT_MAXLEN=6, 编码前是4个sentence 57 | --------00|------0000|----------|-------000|0000000000|0000000000 58 | 参考:https://github.com/richliao/textClassifier/blob/master/textClassifierHATT.py 59 | """ 60 | x_sent = zeros((len(x_sent_raw), config.SENT_MAXLEN, 
config.WORD_MAXLEN), dtype='int32') # Sentence特征是3维,其他特征是2维 61 | # sent_encoding只进行各个句子内的Word Level编码,编码后结果形如:--------00|------0000|----------|-------000 62 | sent_encoding = lambda x: array([word_encoding(sent) for sent in str(x).split('&')[: config.SENT_MAXLEN]], dtype='int32') # 截断 63 | for i, sents in enumerate(x_sent_raw): 64 | sents_vector = sent_encoding(sents) 65 | j, k = sents_vector.shape 66 | x_sent[i, :j, :k] = sents_vector 67 | return x_sent 68 | 69 | 70 | def data_config_prepare(config): 71 | """特征编码,Label编码,Train/Test划分,Config生成,持久化""" 72 | # 0. 数据准备 73 | data = read_csv(config.data_file, sep='\t', encoding='utf8') 74 | data['labels'] = data['labels'].map(lambda x: [] if x == '&&' else x.split('&&')) 75 | x_raw, x_word_raw, x_char_raw, x_sent_raw1, x_sent_raw2, y_raw = data['question_raw'], \ 76 | data['question_wordseg'], data['question_charseg'], data['question_sentseg1'], data['question_sentseg2'], data['labels'] 77 | 78 | vocab = pickle.load(open(config.vocab_file, 'rb')) # 词汇表,映射字典,Embedding Layer初始化权重 79 | config.CHAR_VOCAB_SIZE = vocab.char_vocab_size 80 | config.WORD_VOCAB_SIZE = vocab.word_vocab_size 81 | config.char_embed_matrix = vocab.char_embed_matrix 82 | config.word_embed_matrix = vocab.word_embed_matrix 83 | config.WORD_MAXLEN = int(1.5 * x_word_raw.map(lambda x: len(str(x).split())).max()) # 57 84 | config.CHAR_MAXLEN = int(1.5 * x_char_raw.map(lambda x: len(str(x).split())).max()) # 126 85 | config.SENT_MAXLEN = int(1.5 * x_sent_raw2.map(lambda x: len(str(x).split('&'))).max()) # 18 86 | config.SENT_MAXLEN = 5 87 | 88 | # 1. Token筛选 89 | 90 | 91 | # 2. 特征和Label向量化编码 以下特征中不需要的特征可直接删除,如left和right特征、Bert编码特征、Sentence特征等 92 | # word和char特征 93 | word_encoding, char_encoding = get_encoding_func(vocab, config) 94 | x_word = array(x_word_raw.map(word_encoding).tolist(), dtype='int32') 95 | x_char = array(x_char_raw.map(char_encoding).tolist(), dtype='int32') 96 | 97 | 98 | # left和right特征 only for TextRCNN 99 | word_left_encoding, word_right_encoding, char_left_encoding, char_right_encoding = get_sides_encoding_func(vocab, config) 100 | x_word_left = array(x_word_raw.map(word_left_encoding).tolist(), dtype='int32') 101 | x_word_right = array(x_word_raw.map(word_right_encoding).tolist(), dtype='int32') 102 | x_char_left = array(x_char_raw.map(char_left_encoding).tolist(), dtype='int32') 103 | x_char_right = array(x_char_raw.map(char_right_encoding).tolist(), dtype='int32') 104 | 105 | 106 | # 结构化特征 107 | word_model_tfidf, x_word_tfidf, word_model_svd, x_word_lsa = pickle.load(open(config.word_tfidf_lsa_file, 'rb')) 108 | #char_model_tfidf, char_tfidf, char_model_svd, char_lsa = pickle.load(open(config.char_tfidf_lsa_file, 'rb')) 109 | 110 | 111 | # Bert编码特征 速度超级慢!怎么解决? # TODO 112 | # 对整个句子编码,不需分词!编码向量shape与一般情况下分词后编码不一样,比如"变速箱挺好的"shape为8,"变速箱"shape为5,"变速"shape为4 113 | bert_model = get_bert_model(config) 114 | bert_vectorizer = lambda x: csr_matrix(bert_model.encode([x])["encodes"][0]) 115 | x_bert = array(x_raw.map(bert_vectorizer).tolist(), dtype='int32') 116 | 117 | 118 | # Sentence特征 only for TextHAN 119 | x_sent1 = sent_array(x_sent_raw1, config, word_encoding) 120 | x_sent2 = sent_array(x_sent_raw2, config, word_encoding) 121 | 122 | 123 | # Label 124 | mlb = MultiLabelBinarizer() 125 | y_data = mlb.fit_transform(y_raw) # TODO 使用训练数据还是所有数据来训练mlb??? 126 | config.N_CLASSES = len(mlb.classes_) 127 | config.label_binarizer = mlb 128 | 129 | 130 | # 3. 
划分并保存Train/Test 131 | x_word_train, x_word_test, x_word_left_train, x_word_left_test, x_word_right_train, x_word_right_test, \ 132 | x_char_train, x_char_test, x_char_left_train, x_char_left_test, x_char_right_train, x_char_right_test, \ 133 | x_word_lsa_train, x_word_lsa_test, \ 134 | x_bert_train, x_bert_test, \ 135 | x_sent1_train, x_sent1_test, x_sent2_train, x_sent2_test, \ 136 | y_train, y_test = train_test_split( 137 | x_word, x_word_left, x_word_right, 138 | x_char, x_char_left, x_char_right, 139 | x_word_lsa, 140 | x_bert, # bert编码特征计算太慢,可删除该行,不使用bert编码特征 141 | x_sent1, x_sent2, 142 | y_data, 143 | test_size=0.2, random_state=2019 144 | ) 145 | x_train = { 146 | 'word': x_word_train, 147 | 'word_left': x_word_left_train, 148 | 'word_right': x_word_right_train, 149 | 'word_structured': x_word_lsa_train, 150 | 'char': x_char_train, 151 | 'char_left': x_char_left_train, 152 | 'char_right': x_char_right_train, 153 | 'sentence1': x_sent1_train, 154 | 'sentence2': x_sent2_train 155 | } 156 | x_test = { 157 | 'word': x_word_test, 158 | 'word_left': x_word_left_test, 159 | 'word_right': x_word_right_test, 160 | 'word_structured': x_word_lsa_test, 161 | 'char': x_char_test, 162 | 'char_left': x_char_left_test, 163 | 'char_right': x_char_right_test, 164 | 'sentence1': x_sent1_test, 165 | 'sentence2': x_sent2_test 166 | } 167 | 168 | # 保存编码后数据 169 | pickle.dump((x_train, y_train, x_test, y_test), open(config.data_encoded_file, 'wb')) 170 | pickle.dump((x_bert_train, y_train, x_bert_test, y_test), open(config.data_bert_file, 'wb')) 171 | pickle.dump(config, open(config.config_file, 'wb')) 172 | 173 | 174 | def data_augmentation(): 175 | """数据增强""" 176 | pass 177 | 178 | 179 | def example(bert_flag=False): 180 | from Vocabulary import Vocabulary 181 | from Config import Config 182 | config = Config() 183 | 184 | 185 | # Data和Config准备 186 | data_config_prepare(config) 187 | config = pickle.load(open(config.config_file, 'rb')) 188 | data_file = config.data_bert_file if bert_flag else config.data_encoded_file 189 | x_train, y_train, x_test, y_test = pickle.load(open(data_file, 'rb')) 190 | 191 | 192 | # 根据实际情况修改,也可直接在Config.py里修改,推荐前者 193 | config.n_gpus = 1 194 | config.token_level = 'word' 195 | config.structured = 'none' 196 | config.bert_flag = False 197 | 198 | 199 | # 模型训练 评估 保存 200 | if not bert_flag: # 一般模型 201 | from model.TextCNN import TextCNN 202 | textcnn = TextCNN(config) 203 | test_acc, scores, sims, vectors, _, _ = textcnn.train_evaluate(x_train, y_train, x_test, y_test, epochs=(2, 10)) 204 | textcnn.model.save(config.model_file) 205 | 206 | else: # Bert模型 207 | config.bert_flag = True 208 | x_train = array([term.toarray() for term in x_train], dtype='int32') 209 | x_test = array([term.toarray() for term in x_test], dtype='int32') 210 | from model.TextBertGRU import TextBertGRU 211 | textbertgru = TextBertGRU(config) 212 | test_acc, scores, sims, vectors, history = textbertgru.train_evaluate(x_train, y_train, x_test, y_test) 213 | textbertgru.model.save(config.model_file) 214 | 215 | 216 | 217 | if __name__ == '__main__': 218 | 219 | example() 220 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text Classification 2 | 3 | 基于Keras的15种模型:TextCNN, TextRNN, TextDPCNN, TextRCNN, TextHAN, TextBert等及其变种 4 | 5 | 支持5类特征及其组合:word-level, char-level, 结构化特征(TFIDF, LSA), Context特征(word-left, word-right, char-left, char-right), sentence-level 6 | 7 | 
支持4种分类任务:单标签二分类,单标签多分类,多标签二分类,多标签多分类 8 | 9 | ## Task & Data 10 | 11 | 任务描述:给定一个关于手机的用户提问,判断用户关注的是手机哪些Labels。 12 | 13 | Labels: System, Function, Battery, Appearance, Network, Photo, Accessory, Purchase, Quality, Hardware, Contrast 14 | 15 | 已标注数据集共有30,000,以下为示例: 16 | 17 | ![1573355016134](./image/1573355016134.png) 18 | 19 | 所以,任务类型:**多标签二分类**(Multi-label Binary Classification)任务,共有11个Labels,每个Label有2种取值(关注,不关注)。 20 | 21 | 虽然数据集是关于多标签二分类任务的,但本项目代码适用于**4种分类任务中的任何1种**,只取简单修改Config.py文件即可,基模型定义文件BasicModel.py会自动处理。 22 | 23 | #### 附录1:Config.py和BasicModel.py中关于任务类型的配置和处理代码 24 | 25 | ```python 26 | # Config.py 27 | self.task = 'multilabel' 28 | self.token_level = 'word' # word: word粒度 char: char粒度 both: word+char粒度 29 | self.N_CLASSES = 11 # 标签/类别数量 30 | 31 | # BasicModel.py 32 | # 任务类型决定了类别数量、激活函数、损失函数和评估指标 33 | if config.task == 'binary': # 单标签二分类 34 | self.n_classes = 1 35 | self.activation = 'sigmoid' 36 | self.loss = 'binary_crossentropy' 37 | self.metrics = ['accuracy'] 38 | elif config.task == 'categorical': # 单标签多分类 39 | self.n_classes = config.N_CLASSES 40 | self.activation = 'softmax' 41 | self.loss = 'categorical_crossentropy' 42 | self.metrics = ['accuracy'] 43 | elif config.task == 'multilabel': # 多标签二分类(多标签多分类需转化为多标签二分类) 44 | self.n_classes = config.N_CLASSES 45 | self.activation = 'sigmoid' 46 | self.loss = 'binary_crossentropy' 47 | self.metrics = ['accuracy'] 48 | ``` 49 | 50 | #### 附录2:4种分类任务及其处理方法 51 | 52 | - a. 单标签二分类 53 | 54 | 输出为Dense(1, activation='sigmoid'),应用时1个概率值判断其与阈值大小 55 | 56 | - b. 单标签N分类 57 | 58 | 输出为Dense(N, activation='softmax'),应用时N个概率值取Top1 59 | 60 | - c. M标签二分类 61 | - **c.1** 一个输出:输出为Dense(M, activation=‘sigmoid’),应用时M个概率值取TopK或与阈值判断大小 62 | - c.2 一个输出:问题转化为M分类,类似于b,模型输出结构同b,应用时方法同c.1 63 | 64 | - d. M标签N分类 65 | - d.1 一个输出:问题转化为MN标签二分类,同c.1 66 | - d.2 一个输出:问题转化为MN分类,同c.2 67 | - d.3 M个输出:每个输出都是b,模型输出结构、应用时方法都同b 待尝试 68 | 69 | 备注:本项目使用的处理方法是c.1 70 | 71 | ## Requirement 72 | 73 | Python 3.6.5 74 | 75 | Keras 2.2.4 76 | 77 | Numpy 1.16.3 78 | 79 | Pandas 0.23.0 80 | 81 | SciPy 1.1.0 82 | 83 | Sklearn 0.21.3 84 | 85 | ## Data Preprocessing 86 | 87 | 数据预处理环节流程步骤如下图所示: 88 | 89 | ![1573364046216](./image/1573364046216.png) 90 | 91 | #### 数据清洗和准备 92 | 93 | 文件:[DataPreprocessing.py](https://github.com/liuyaox/text_classification/blob/master/DataPreprocessing.py) 94 | 95 | 内容:简单而通用的功能,如标注数据处理,分词,分字,分句子,过滤停用词,处理原始Labels 96 | 97 | #### Embedding相关 98 | 99 | 文件:[Embedding.py](https://github.com/liuyaox/text_classification/blob/master/Embedding.py) 100 | 101 | 内容:自己训练Word Embedding,读取公开训练的Word Embedding,支持word+char两种粒度 102 | 103 | #### Vocabulary相关 104 | 105 | 文件:[Vocabulary.py](https://github.com/liuyaox/text_classification/blob/master/Vocabulary.py) 106 | 107 | 内容: 108 | 109 | 生成词汇表,支持低频高频词过滤 110 | 111 | 基于Embedding生成三者之间的映射字典 112 | 113 | 生成Embedding Layer初始化权重 114 | 115 | 基于映射字典的向量化编码工具(支持截断、补零、including和excluding) 116 | 117 | 以上功能支持word+char两种粒度 118 | 119 | #### 结构化特征 120 | 121 | 文件:[FeatureStructured.py](https://github.com/liuyaox/text_classification/blob/master/FeatureStructured.py) 122 | 123 | 内容:生成TFIDF特征和LSA特征,支持word+char两种粒度,后续会增加支持LSI, LDA等其他特征 124 | 125 | #### 特征选择 126 | 127 | 文件:[TokenSelection.py](https://github.com/liuyaox/text_classification/blob/master/TokenSelection.py) 128 | 129 | 内容:基于卡方统计值等过滤词和字,项目暂未使用 130 | 131 | #### 数据编码 132 | 133 | 文件 :[ModelTrain.py](https://github.com/liuyaox/text_classification/blob/master/ModelTrain.py) 134 | 135 | 内容:使用向量化编码工具和MultiLabelBinarizer对特征和Label进行编码 136 | 137 | #### 数据增强 138 | 139 | 文件 
:[DataAugmentation.py](https://github.com/liuyaox/text_classification/blob/master/DataAugmentation.py) 140 | 141 | 内容:通过Shuffle和Random Drop进行数据增强,项目暂未使用 142 | 143 | ## Model 144 | 145 | 使用了多个Model,各Model结构关系如下图所示: 146 | 147 | ![1573366328001](./image/1573366328001.png) 148 | 149 | #### 使用类继承方式实现三层类定义 150 | 151 | - BasicModel: 所有模型基类 152 | 153 | 实现3种Metrics 154 | 155 | - BasicDeepModel: 深度学习模型基类 156 | 157 | 通用Layers创建 158 | 159 | 绘制Loss和Metrics 160 | 161 | Embedding冻结和解冻 162 | 163 | 模型训练和评估(支持CV) 164 | 165 | 学习率Schedular 166 | 167 | - BasicStatModel: 传统模型基类 168 | 169 | 暂未实现 170 | 171 | #### 实现6大类模型(绿色):共15个模型 172 | 173 | - TextCNN:标配和基础 174 | 175 | - TextRNN:同上,可玩的地方更多 176 | 177 | - TextRCNN:结合CNN和RNN的优点 178 | 179 | - TextDPCNN:受ResNet启发,结合RNN+CNN 180 | 181 | - TextHAN:使用了层次注意力机制 182 | 183 | - TextBert:在TextGRU基础上把输入改为Bert编码的向量 184 | 185 | - 此外,还有5大类待实现模型(灰色) 186 | 187 | #### 三层类模型+全局Config的便捷之处 188 | 189 | - 支持所有分类任务:二分类,多分类,多标签二分类,多标签多分类 190 | 191 | - 支持各种输入组合: 192 | 193 | [word, char, word-structure, char-structure]中任意的4选1,4选2,4选3,4选4 194 | 195 | 另外对于一些特殊模型,支持特殊输入,如TextRCNN模型的Context特征(word-left, word-right, char-left, char-right),以及TextHAN模型的Sentence-level特征 196 | 197 | - 模型训练评估支持KFold,支持6种Finetuning方式 198 | 199 | - 绝大多数模型支持Attention,绝大多数模型支持丰富的参数配置 200 | 201 | ## Train & Evaluation 202 | 203 | ### Train 204 | 205 | Step1: 运行DataPreprocessing.py,基于已标注数据生成训练数据并保存本地 206 | 207 | Step2: 运行Embedding.py,自己训练Embedding,读取公开训练的Embedding,支持char+word两种粒度 208 | 209 | Step3: 运行Vocabulary.py,生成词汇表,基于Embedding生成映射字典,生成Embedding Layer初始化权重矩阵等,支持char+word两种粒度 210 | 211 | Step4: 运行FeatureStructured.py,生成TFIDF特征和LSA特征,支持word+char两种粒度 212 | 213 | Step5: 运行[ModelTrain.py](https://github.com/liuyaox/text_classification/blob/master/ModelTrain.py),项目全流程,包括:数据准备、Token筛选、特征和Label编码、划分Train/Test、环境配置、模型生成、模型训练和评估、模型持久化,详见脚本注释。 214 | 215 | 运行脚本:python3 ModelTrain.py 216 | 217 | 在运行脚本之前,先修改脚本里的配置项,内容如下: 218 | 219 | ```python 220 | # 根据实际情况修改,也可直接在Config.py里修改,推荐前者 221 | config.n_gpus = 1 222 | config.token_level = 'word' # 只使用word-level特征,不使用char-level 223 | config.structured = 'none' # 不使用结构化特征 224 | config.bert_flag = False # 不使用Bert编码的输入向量 225 | ``` 226 | 227 | ### Evaluation 228 | 229 | 15个模型的评估结果如下表所示: 230 | 231 | ![1573368628525](./image/1573368628525.png) 232 | 233 | 备注:模型并未进行精细化调参,大多是默认配置和参数,效果仅供参考。 234 | 235 | 从评估结果中可得出以下结论: 236 | 237 | #### 同一模型内 238 | 239 | - word+char相比word,效果明显有提升 240 | 241 | - word+char+structured相比word+char,效果提升不明显,一些情况下反而会下降 242 | 243 | #### 不同模型间 244 | 245 | - TextCNN训练最快,Precision和F1值相对也较高,可作为一个强有力的Baseline 246 | 247 | - TextRNN训练很慢,效果不是特别好,可能是因为训练数据很多是短文本 248 | 249 | - 各模型之间效果差不多(全是默认参数,没时间做精细化调参) 250 | - 输入改为Bert编码向量后效果提升比较明显,简单的模型(TextGRU)就得到了最好的F1值,后续值得好好研究 251 | - TextHAN比较给力,取到了最高的Precision,后续值得好好研究 252 | 253 | ## Conclusion 254 | 255 | 1. **一个脚本只干一件事情,一件事情只在一个脚本里干**,各脚本解耦,各功能独立,互相之间只通过持久化和Config共享信息 256 | 257 | 2. 充分利用**类和继承以及闭包**,相同功能不要重复定义,也不要到处粘贴复制,相似的功能通过闭包来实现 258 | 259 | 3. Vocabulary及相关映射字典、Embedding权重,**封装整合为一个class**,统一管理 260 | 261 | 4. 调试便捷化+逻辑清晰化 262 | 263 | a. 训练和应用**数据封装进字典**,单输入和多输入使用无差别,字典key对应模型搭建时Input的参数name 264 | 265 | b. 动态搭建模型,使其无缝支持多种输入及其组合 266 | 267 | 方法:通用方法位于父类BasicDeepModel,各子类模型分为**模型主体和模型结尾**2部分,模型核心的纯粹的结构位于模型主体,根据输入不同,进行配置和组装,然后接入模型结尾 268 | 269 | c. 不同类模型,先选择最简单的模型如TextCNN,深入研究经验和Tricks,然后复制到别的模型 270 | 271 | d. 同一类模型,先搭建并跑通最简单的模型,随后基于评估效果,逐渐加深加宽 272 | 273 | 5. 模型组件 274 | 275 | a. CNN+RNN是标配,CNN提取关键词,RNN适合前几层,提取依赖信息,Attention和MaxPooling可突出关键特征 276 | 277 | b. Capsule可代替CNN,有时效果好于CNN 278 | 279 | c. 
有条件就使用Bert 280 | 281 | ## Reference 282 | 283 | #### Code 284 | 285 | - 文本分类 - Keras 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | - 多标签分类 - PyTorch 294 | 295 | (2017知乎看山杯 多标签文本分类大赛 Rank1) 296 | 297 | (同上,Rank2) 298 | 299 | #### Libray 300 | 301 | - [kashgari](https://github.com/BrikerMan/Kashgari) : NLP框架,超级傻瓜,超级Cutting Edge 302 | 303 | - [hyperas](https://github.com/maxpumperla/hyperas) : Keras超参数优化工具 304 | 305 | - [sk-multilearn](https://github.com/scikit-multilearn/scikit-multilearn) : Sklearn生态下的多标签分类工具 306 | 307 | #### Article 308 | 309 | - [用深度学习(CNN RNN Attention)解决大规模文本分类问题 - 综述和实践 ](https://zhuanlan.zhihu.com/p/25928551) 310 | 311 | - [在文本分类任务中,有哪些论文中很少提及却对性能有重要影响的tricks?](https://www.zhihu.com/question/265357659) 312 | 313 | -------------------------------------------------------------------------------- /TokenSelection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-07 21:13:53 4 | Author: liuyao8 5 | Descritipn: word/char选择,基于卡方统计量。 6 | TODO 注意!word/char是否筛选上了,在Embedding和Vocabulary时可以先不考虑,主要在向量化编码时再考虑是否过滤 7 | """ 8 | 9 | import numpy as np 10 | from collections import Counter 11 | 12 | 13 | # TODO 建议 onlyin和excluding都要有,有时excluding使用更方便! 14 | 15 | # 基于卡方统计量,进行特征选择 16 | 17 | def occurrence_matrix(texts, categories): 18 | """ 19 | 基于texts和category原始数据,计算token与category的共现矩阵 20 | ARGS 21 | texts: iterable, 每个元素是一个token列表, token既可以是token也可以是token id 22 | categories: iterable, 每个元素是一个类别id,与texts各元素一一对应 23 | RETURN 24 | tokens: tokens列表 25 | matrix: 列表,元素与tokens一一对应,相当于token与category共现矩阵,可用于计算两者卡方统计量,从而进行特征选择(token选择) 26 | NOTES 27 | 注意,要求categories是向量化后的类别id,且要求类别id从0开始依次递增,如0,1,2,3,... 28 | """ 29 | cates_num = len(set(categories)) 30 | dic = {} 31 | for text, cate in zip(texts, categories): 32 | for token in set(text): 33 | if token not in dic: 34 | dic[token] = [0] * cates_num 35 | dic[token][cate] += 1 36 | else: 37 | dic[token][cate] += 1 38 | tokens = list(dic.keys()) 39 | matrix = list(dic.values()) 40 | return matrix, tokens 41 | 42 | 43 | def chi2_value(matrix, mask=True): 44 | """ 45 | 基于共现矩阵计算卡方统计量 46 | ARGS 47 | matrix: 二维array或list,共现矩阵,以word,document和document category为例,行是word,列是category,某行某列取值表示:当前category下含有当前word的document数量 48 | mask: 当category下含有word的document数量为0时,是否不再计算category与word的卡方统计量 49 | RETURN 50 | values: 卡方统计量,等于(AD-BC)^2*N/((A+B)(A+C)(B+D)(C+D)) 51 | """ 52 | A = np.array(matrix, dtype=np.float) # A: category下含有word的样本数量,注意类型为float,以便于后续各种复杂计算 53 | word_sum = np.sum(A, 1).reshape((-1, 1)) # 各行对应的样本数,转化为列向量 54 | type_sum = np.sum(A, 0) # 各列对应的样本数 55 | N = np.sum(type_sum) # N: 总样本数量 各行各列总和 56 | B = word_sum - A # B: 非category下含有word的样本数量 57 | C = type_sum - A # C: category下不含有word的样本数量 58 | D = N - A - B - C # D: 非category下不含有word的样本数量 59 | # 若针对每一列,当前列内比较各行,而确定某列后,N, A+C, B+D都是确定不变的,可省略 60 | # 若针对每一行,当前行内比较各列,而确定某行后,N, A+B, C+D都是确定不变的,可省略 61 | values = N * (A * D - B * C) ** 2 / ((A + B) * (A + C) * (B + D) * (C + D)) 62 | if mask: 63 | masking = np.sign(A) # 当A=0时,value应该为0 64 | values = masking * values 65 | return values, A, B, C, D, N 66 | 67 | 68 | def feature_select_by_chi2(matrix, features, max_col_num=1000, mode='column', mask=True): 69 | """ 70 | 基于卡方统计量进行特征选择 71 | ARGS 72 | matrix,mask同chi2_value 73 | features: 特征列表,特征顺序务必要与matrix各行/列保持一致!用于特征索引转换为特征 74 | max_col_num: 每列可选择的特征数量最大值 75 | model: 特征选择的模式,column=各列分别选择特征然后汇总选择的特征,max=取特征各列卡方值最大值为特征卡方值从而选择特征,avg=取平均值 76 | RETURN 77 | cnter: collections.Counter,类似字典,表示选择的特征,及其被多少列选择 78 | selected: 
列表,表示选择的特征 79 | """ 80 | values, A, _, _, _, _ = chi2_value(matrix, mask) 81 | # 共有3种模式进行特征选择 82 | if mode == 'column': 83 | masking = np.sign(A) 84 | col_num = np.sum(masking, 0, dtype=np.int64) # 各列拥有的特征数量,注意dtype为int,否则为float 85 | selected = [] 86 | for i in range(A.shape[1]): # 遍历各列 87 | indices = np.argsort(values[:, i]) # 按卡方统计量排序各特征,取其排序索引 88 | k = min(max_col_num, col_num[i]) 89 | topk = [features[i] for i in indices[-k:]] # 前k个特征 90 | selected.extend(topk) 91 | cnter = Counter(selected) 92 | return cnter 93 | elif mode == 'avg': 94 | value = np.mean(values, axis=1) 95 | elif mode == 'max': 96 | value = np.max(values, axis=1) 97 | else: 98 | raise ValueError('mode must be column, avg or max !') 99 | indices = np.argsort(value) 100 | selected = [features[i] for i in indices[-max_col_num:]] 101 | return selected 102 | 103 | 104 | 105 | if __name__ == '__main__': 106 | 107 | # 以下只是示例,项目中暂时未使用特征选择 108 | # 示例:基于卡方统计量进行特征选择 109 | texts = [['t1', 't2', 't3', 't4'], ['t2', 't3', 't5'], ['t1', 't4', 't5'], ['t2','t4'], ['t3', 't4'], ['t1', 't3', 't4']] 110 | categories = [1, 2, 0, 1, 0, 1] 111 | matrix, tokens = occurrence_matrix(texts, categories) 112 | cnter = feature_select_by_chi2(matrix, tokens) # cnter即为选择的特征及其被选择的次数 113 | -------------------------------------------------------------------------------- /Vocabulary.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-07-31 16:26:01 4 | Author: liuyao8 5 | Descritipn: a. 通用Vocabulary:token支持char和word,(token, idx, vector)之间4种映射字典,Embedding Layer初始化权重 6 | b. 向量化编码工具:支持Padding和Truncating,支持X和Label,支持including和excluding 7 | """ 8 | 9 | import numpy as np 10 | from functools import reduce 11 | from tqdm import tqdm 12 | from gensim.models import Word2Vec 13 | 14 | 15 | # 与config保持一致 16 | # Default word tokens # TODO 除这4个外,是否还应该有一些别的,比如空格?见P115 17 | PAD_IDX = 0 # PAD约定取0,不要改变,以下UNK,SOS,EOS可以改变 18 | UNK_IDX = 1 # unknow word # TODO 原本是没有UNK的? 19 | SOS_IDX = 2 # Start of sentence 20 | EOS_IDX = 3 # End of sentence 21 | 22 | 23 | class Vocabulary(object): 24 | """token词汇表,token包括word和character""" 25 | 26 | def __init__(self): 27 | # 通用信息 28 | self.token2idx_init = {'PAD': PAD_IDX, 'UNK': UNK_IDX, 'SOS': SOS_IDX, 'EOS': EOS_IDX} 29 | self.idx2token_init = {PAD_IDX: 'PAD', UNK_IDX: 'UNK', SOS_IDX: 'SOS', EOS_IDX: 'EOS'} 30 | 31 | # Word Level 32 | self.word2idx = self.token2idx_init.copy() # TODO 原来是没有{'PAD': PAD_IDX, ...}的? 33 | self.idx2word = self.idx2token_init.copy() # TODO 字典一定要用copy!!!否则大家都一起跟着改变 34 | self.word2count = {} 35 | self.word_vocab_size = 4 36 | self.word_trimmed = False # 是否已过滤低频word 37 | self.word_stopwords = None # 低频停用词 38 | 39 | self.word_embed_dim = 0 40 | self.word2vector = {} 41 | self.word_idx2vector = {} 42 | self.word_embed_matrix = None # Embedding Layer Weights Matrix 43 | 44 | # Char Level 45 | # TODO char中也会出现UNK,SOS,EOS,也要进行PAD(padding为0,这就要求char2idx[0]就得是PAD),因此char也要处理这4种TOKEN 46 | self.char2idx = self.token2idx_init.copy() 47 | self.idx2char = self.idx2token_init.copy() 48 | self.char2count = {} 49 | self.char_vocab_size = 4 50 | self.char_trimmed = False # 是否已过滤低频char 51 | self.char_stopwords = None 52 | 53 | self.char_embed_dim = 0 54 | self.char2vector = {} 55 | self.char_idx2vector = {} 56 | self.char_embed_matrix = None 57 | 58 | 59 | # 1. 
创建词汇表 60 | # 1.1 挨个添加token:直接添加token,通过sentence添加token,通过document添加token 61 | def add_token(self, token, level='word'): 62 | """添加word或char,一个一个添加""" 63 | assert level in ['word', 'char'] 64 | token = token.strip() 65 | if level == 'word': 66 | if token not in self.word2idx: 67 | self.word2idx[token] = self.word_vocab_size 68 | self.idx2word[self.word_vocab_size] = token 69 | self.word_vocab_size += 1 70 | self.word2count[token] = 1 71 | else: 72 | self.word2count[token] += 1 73 | else: 74 | if token not in self.char2idx: 75 | self.char2idx[token] = self.char_vocab_size 76 | self.idx2char[self.char_vocab_size] = token 77 | self.char_vocab_size += 1 78 | self.char2count[token] = 1 79 | else: 80 | self.char2count[token] += 1 81 | 82 | 83 | def add_sentence(self, sentence, level='word', sep=' '): 84 | """按sentence添加word或char或both, sentence格式:sep分隔的分词字符串""" 85 | assert level in ['word', 'char', 'both'] 86 | sentence = str(sentence) 87 | if level == 'word': 88 | for word in sentence.strip().split(sep): 89 | self.add_token(word, level='word') 90 | elif level == 'char': 91 | for char in list(sentence.replace(sep, '')): # 删除分隔符后,变成字符列表 92 | self.add_token(char, level='char') 93 | else: 94 | for word in sentence.strip().split(sep): 95 | self.add_token(word, level='word') 96 | for char in list(sentence.replace(sep, '')): 97 | self.add_token(char, level='char') 98 | 99 | 100 | def add_document(self, document, level='word', sep=' '): 101 | """按document添加word或char或both""" 102 | assert level in ['word', 'char', 'both'] 103 | for sentence in document: 104 | self.add_sentence(sentence, level=level, sep=sep) 105 | 106 | 107 | # 1.2 一次性添加所有token 108 | def add_all(self, corpus, level='word', sep=' ', min_count=None): 109 | """ 110 | 词汇表 Vocabulary:支持 char-level 和 word-level,以及两者的汇总 111 | 统计 corpus 中 char/word 频率并倒序排序获得 idx,构建词汇字典: 112 | 注意: 113 | 其实也可不排序,直接随便赋给每个 char/word 一个 idx,只要保证唯一且固定即可 114 | 比如按加入 Vocabulary 顺序依次赋值为1,2,3,...,0另有所用,比如当作 、空格或 的 idx 115 | TODO idx=0 给谁??怎么给?? 也有把PAD和UNK赋值给词汇表里最后2个idx的 116 | """ 117 | assert level in ['word', 'char', 'both'] 118 | token2count = {} 119 | for line in corpus: 120 | tokens = line.strip().split(sep) if level == 'word' else list(line.strip()) # word时默认每一行是分词后分隔好的结果 121 | for token in tokens: 122 | token2count[token] = token2count.get(token, 0) + 1 123 | if min_count: # 过滤低频字/词 124 | token2count = {word: num for (word, num) in token2count.items() if num >= min_count} 125 | 126 | token_sorted = sorted(token2count, key=token2count.get, reverse=True) # 按token频率倒序排列 127 | token_list = token_sorted if ' ' in token_sorted else [' '] + token_sorted # TODO 空格是否加入vocab? 如何确定idx=0对应的term??? 128 | 129 | if level == 'word': 130 | self.word2count = token2count 131 | self.word2idx = {word: idx + 4 for (idx, word) in enumerate(token_list)}.update(self.token2idx_init) 132 | self.idx2word = {idx: word for (word, idx) in self.word2idx.items()} 133 | self.word_vocab_size = len(self.word2idx) 134 | else: 135 | self.char2count = token2count 136 | self.char2idx = {char: idx + 4 for (idx, char) in enumerate(token_list)}.update(self.idx2token_init) 137 | self.idx2char = {idx: char for (char, idx) in self.char2idx.items()} 138 | self.char_vocab_size = len(self.char2idx) 139 | 140 | 141 | # 2. 
低频过滤 142 | def trim(self, min_count, level='word'): 143 | """过滤低频word或char""" 144 | assert level in ['word', 'char'] 145 | if (level == 'word' and self.word_trimmed) or (level == 'char' and self.char_trimmed): 146 | return 147 | if level == 'word': 148 | self.word_stopwords = [word for word, cnt in self.word2count.items() if cnt < min_count] 149 | kept = [word for word, cnt in self.word2count.items() if cnt >= min_count] 150 | print(f'kept words: {len(kept)} / {len(self.word2idx)} = {len(kept) / len(self.word2idx): .4f}') 151 | self.word2idx = self.token2idx_init.copy() 152 | self.idx2word = self.idx2token_init.copy() 153 | self.word2count = {} 154 | self.word_vocab_size = 4 155 | for word in kept: 156 | self.add_token(word, level='word') 157 | self.word_trimmed = True 158 | 159 | else: 160 | self.char_stopwords = [char for char, cnt in self.char2count.items() if cnt < min_count] 161 | kept = [char for char, cnt in self.char2count.items() if cnt >= min_count] 162 | print(f'kept chars: {len(kept)} / {len(self.char2idx)} = {len(kept) / len(self.char2idx): .4f}') 163 | self.char2idx = self.token2idx_init.copy() 164 | self.idx2char = self.idx2token_init.copy() 165 | self.char2count = {} 166 | self.char_vocab_size = 4 167 | for char in kept: 168 | self.add_token(char, level='char') 169 | self.char_trimmed = True 170 | 171 | 172 | # 3. 创建xxx2vector: (word/char, idx) --> vector 173 | def init_vectors(self, embedding=None, level='word'): 174 | """ 175 | 基于训练好的word/char embedding,初始化word2vector或char2vector及其对应的idx2vector 176 | 其中embedding既可以是公开训练好的,也可以是自己训练好的,前者过于巨大, 177 | 后者其实理论上就是word2vector,但实际中可能会因为语料不同步等原因,导致两者的word并不完全相同。 178 | 另外后者可以是gensim.models.Word2Vec模型,也可以是普通字典 179 | 不管前者后者,我们只选择感兴趣的word(word2idx中的word) 180 | TODO 优化点:增加备用 word embedding 如同get_word2vector_idx2vector一样! 181 | """ 182 | assert level in ['word', 'char'] 183 | if isinstance(embedding, Word2Vec): 184 | embedding = {token: embedding[token] for token in embedding.wv.vocab.keys()} 185 | 186 | embed_dim = len(list(embedding.values())[0]) 187 | if level == 'word': 188 | self.word_embed_dim = embed_dim 189 | for word, idx in self.word2idx.items(): 190 | if word in embedding: 191 | vector = embedding.get(word) 192 | else: 193 | vectors = [embedding.get(x, np.random.uniform(-0.01, 0.01, (embed_dim))) for x in list(word)] 194 | vector = reduce(lambda x, y: x + y, vectors) / len(vectors) # OOV时使用对应的若干字符向量的Average 195 | self.word2vector[word] = vector 196 | self.word_idx2vector[idx] = vector 197 | else: 198 | self.char_embed_dim = embed_dim 199 | for char, idx in self.char2idx.items(): 200 | vector = embedding.get(char, np.random.uniform(-0.01, 0.01, (embed_dim))) 201 | self.char2vector[char] = vector 202 | self.char_idx2vector[idx] = vector 203 | 204 | 205 | # 4. 生成Embedding Layer的初始化权重 206 | def init_embed_matrix(self, level='word'): 207 | """基于wordidx2vector或charidx2vector生成用于Embedding Layer的weights matrix 208 | TODO 总觉得似乎哪里不对??? 之类的如何处理? 
209 | """ 210 | assert level in ['word', 'char'] 211 | if level == 'word': 212 | all_embs = np.stack(self.word_idx2vector.values()) 213 | self.word_embed_matrix = np.random.normal(all_embs.mean(), all_embs.std(), size=(self.word_vocab_size, self.word_embed_dim)) 214 | for idx, vector in tqdm(self.word_idx2vector.items()): 215 | self.word_embed_matrix[idx] = vector 216 | else: 217 | all_embs = np.stack(self.char_idx2vector.values()) 218 | self.char_embed_matrix = np.random.normal(all_embs.mean(), all_embs.std(), size=(self.char_vocab_size, self.char_embed_dim)) 219 | for idx, vector in tqdm(self.char_idx2vector.items()): 220 | self.char_embed_matrix[idx] = vector 221 | 222 | 223 | # 一些与Vocabulary相关的工具 224 | # TODO classmethod ??? 225 | def seq_to_idxs(seq, token2idx, token_maxlen, unk_idx=UNK_IDX, pad_idx=PAD_IDX, 226 | padding='post', truncating='post', onlyin=None, excluding=[]): 227 | """ 228 | 向量化编码:基于词汇表token2idx,把seq转化为idx向量,词汇表中不存在的token使用unk_idx进行编码,适用于特征编码和Label编码 229 | 输入seq是分词/分字列表,如:['我', '们', '爱', '学', '习'] 或 ['我们', '爱', '学习'] 230 | 函数功能 = 向量化 + keras.sequence.pad_sequence 231 | ARGS 232 | padding & truncating: post=从后面补零/截断 pre=从前面 233 | onlyin: 只关注这里面的token 234 | excluding: 不关注这里面的token 235 | NOTE 236 | 当onlyin和excluding都存在时同时满足条件,即token in onlyin and token not in excluding 237 | """ 238 | if onlyin: 239 | seq = [token for token in seq if token in onlyin] 240 | seq = [token for token in seq if token not in excluding + ['', ' ']] # TODO ['', ' ']??? 241 | 242 | seq_vec = [token2idx.get(token, unk_idx) for token in seq] # OOV的token标注为专门的unk_idx 243 | seq_vec = seq_vec[: token_maxlen] if truncating == 'post' else seq_vec[-token_maxlen:] # 截断:前或后 244 | paddings = [pad_idx] * (token_maxlen - len(seq_vec)) # 小于向量长度的部分用pad_idx来padding 245 | return seq_vec + paddings if padding == 'post' else paddings + seq_vec # PAD: 前或后 246 | 247 | 248 | 249 | def example(): 250 | """创建word和char的词汇表,并保存本地""" 251 | import pandas as pd 252 | import pickle 253 | from Config import Config 254 | config = Config() 255 | 256 | data = pd.read_csv(config.data_file, sep='\t', encoding='utf8') 257 | sentences_word, sentences_char = data['question_wordseg'], data['question_charseg'] 258 | 259 | # 创建词汇表 260 | # TODO 仅仅使用当前任务的全量数据么?要不要加一些其他更全的语料库?应用时,遇到OOV的词汇咋整? 
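    # Worked example of the seq_to_idxs helper defined above (toy vocabulary with
    # hypothetical ids): OOV tokens fall back to UNK_IDX and the result is post-padded
    # with PAD_IDX up to token_maxlen.
    _toy_vocab = {'PAD': 0, 'UNK': 1, 'SOS': 2, 'EOS': 3, '我们': 4, '学习': 5}
    assert seq_to_idxs(['我们', '爱', '学习'], _toy_vocab, 5) == [4, 1, 5, 0, 0]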
261 | vocab = Vocabulary() 262 | vocab.add_document(sentences_word, level='word') 263 | vocab.add_document(sentences_char, level='char') # word与char-level使用的数据不一样(停用词不一样),所以分别单独创建 264 | vocab.trim(min_count=config.MIN_COUNT, level='word') # min_count与训练Embedding时保持一致 265 | vocab.trim(min_count=config.MIN_COUNT, level='char') 266 | # kept words: 5484 / 11692 = 0.4690 267 | # kept chars: 1594 / 2052 = 0.7768 268 | 269 | # 生成xxx2vector和Embedding Layer初始化权重 270 | model_word2vec = Word2Vec.load(config.model_word2vec_file) 271 | model_char2vec = Word2Vec.load(config.model_char2vec_file) 272 | vocab.init_vectors(model_word2vec, level='word') 273 | vocab.init_vectors(model_char2vec, level='char') 274 | vocab.init_embed_matrix(level='word') 275 | vocab.init_embed_matrix(level='char') 276 | 277 | # 保存本地 278 | pickle.dump(vocab, open(config.vocab_file, 'wb')) 279 | 280 | 281 | 282 | if __name__ == '__main__': 283 | 284 | example() 285 | -------------------------------------------------------------------------------- /image/1573355016134.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuyaox/text_classification/829573bfbbee8076a822ae65f9d5e6d49e15b375/image/1573355016134.png -------------------------------------------------------------------------------- /image/1573364046216.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuyaox/text_classification/829573bfbbee8076a822ae65f9d5e6d49e15b375/image/1573364046216.png -------------------------------------------------------------------------------- /image/1573366328001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuyaox/text_classification/829573bfbbee8076a822ae65f9d5e6d49e15b375/image/1573366328001.png -------------------------------------------------------------------------------- /image/1573368628525.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuyaox/text_classification/829573bfbbee8076a822ae65f9d5e6d49e15b375/image/1573368628525.png -------------------------------------------------------------------------------- /model/BasicModel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-11 19:40:55 4 | Author: liuyao8 5 | Descritipn: a. BasicModel: 模型基类,用于生成BasicStatModel和BasicDeepModel,目前仅提供功能:模型评估Metrics计算 6 | b. BasicStatModel: 传统模型基类,提供通用功能: 7 | c. 
BasicDeepModel: 深度模型基类,提供通用功能: 8 | """ 9 | 10 | import os 11 | from functools import reduce 12 | from collections import Counter 13 | import numpy as np 14 | import pickle 15 | import matplotlib 16 | matplotlib.use("agg") 17 | import matplotlib.pyplot as plt 18 | from scipy.stats import entropy 19 | from sklearn.metrics.pairwise import cosine_similarity 20 | from sklearn.metrics import roc_curve, auc 21 | from sklearn.model_selection import KFold 22 | 23 | from keras.layers import Input, Masking, Embedding 24 | from keras.models import load_model 25 | from keras.utils import multi_gpu_model, plot_model 26 | from keras.optimizers import Adam, SGD 27 | from keras.callbacks import LearningRateScheduler, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 28 | 29 | 30 | class BasicModel(object): 31 | 32 | def __init__(self): 33 | # TODO 其实可以放一些通用的变量如label数量等 34 | pass 35 | 36 | def build(self): 37 | pass 38 | 39 | 40 | # Metrics: Precision, Recall, F1-score, Distribution Similarity, ROC curve, ROC area, etc. 41 | # TODO 添加method 42 | def multilabel_precision_recall(self, ys_pred, ys_true): 43 | """ 44 | 多标签分类标准Metrics: Precision, Recall, F1-score 45 | ARGS 46 | ys_pred: 预测标签,iterable of iterable,形如:[['a', 'b', 'c'], ['a', 'd'], ['b'], ...] 47 | ys_true: 真实标签,格式同y_pred 48 | RETURN 49 | precision: 总命中标签数/总预测标签数 50 | recall: 总命中标签数/总真实标签数 51 | f1score: (precision * recall) / (precision + recall) 52 | """ 53 | assert len(ys_pred) == len(ys_true) 54 | ys_pred = self.label_binarizer.inverse_transform(ys_pred > 0.5) 55 | ys_true = self.label_binarizer.inverse_transform(ys_true) 56 | 57 | right_num, all_pred_num, all_true_num = 0, 0, 0 # 总命中标签数 总预测标注数 总真实标签数 58 | for y_pred, y_true in zip(ys_pred, ys_true): 59 | y_pred_set, y_true_set = set(y_pred), set(y_true) 60 | all_pred_num += len(y_pred_set) 61 | all_true_num += len(y_true_set) 62 | right_num += len(y_pred_set & y_true_set) # 命中标签数:交集大小 63 | 64 | precision = float(right_num) / all_pred_num 65 | recall = float(right_num) / all_true_num 66 | f1score = (precision * recall) / (precision + recall) 67 | return round(precision, 4), round(recall, 4), round(f1score, 4) 68 | 69 | 70 | def roc_auc(self, ys_pred, ys_true, n_label): 71 | """ 72 | ROC-AUC curve ???? 73 | ARGS 74 | ys_pred: 预测标签(的概率?),iterable of iterable,原始预测结果,shape=(n_sample, n_label) 75 | ys_true: 真实标签? 
shape同上 76 | n_label: 标签个数 77 | """ 78 | # 为每个label计算ROC curve和ROC area 79 | fpr, tpr = {}, {} 80 | roc_auc = {} 81 | for i in range(n_label): 82 | fpr[i], tpr[i], _ = roc_curve(ys_true[:, i], ys_pred[:, i]) 83 | roc_auc[i] = auc(fpr[i], tpr[i]) 84 | # 计算micro-average ROC curve and ROC area 85 | fpr['micro'], tpr['micro'], _ = roc_curve(ys_true.ravel(), ys_pred.ravel()) 86 | roc_auc['micro'] = auc(fpr['micro'], tpr['micro']) 87 | 88 | 89 | def multilabel_distribution_similarity(self, ys_pred, ys_true): 90 | """ 91 | 多标签分类特定Metrics: 各标签分布余弦相似度和KL散度 92 | ARGS同上 93 | RETURN 94 | similarity: 基于各标签数据分布,预测结果与真实结果的余弦相似度 越小越好 95 | relative_entropy: KL散度/相对熵 越小越好 96 | """ 97 | assert len(ys_pred) == len(ys_true) 98 | ys_pred = self.label_binarizer.inverse_transform(ys_pred > 0.5) 99 | ys_true = self.label_binarizer.inverse_transform(ys_true) 100 | 101 | ys_pred = Counter(reduce(lambda x, y: x + y, ys_pred)) 102 | ys_true = Counter(reduce(lambda x, y: x + y, ys_true)) 103 | keys = list(set(list(ys_pred.keys()) + list(ys_true.keys()))) 104 | vec_pred = [ys_pred[k] for k in keys] 105 | vec_true = [ys_true[k] for k in keys] 106 | 107 | sim_cosine = cosine_similarity([vec_pred], [vec_true])[0, 0] # 余弦相似度 108 | sim_entropy = entropy(vec_pred, vec_true) # KL散度/相对熵 109 | sim_eucliean = sum([(x - y) ** 2 for (x, y) in zip(vec_pred, vec_true)]) ** 0.5 110 | sim_manhattan = sum([abs(x - y) for (x, y) in zip(vec_pred, vec_true)]) 111 | sims = (round(sim_cosine, 4), round(sim_entropy, 4), round(sim_eucliean, 4), round(sim_manhattan, 4)) 112 | return (vec_pred, vec_true), sims 113 | 114 | 115 | 116 | class BasicStatModel(BasicModel): 117 | 118 | def __init__(self, n_fold=5, name='BasicStatModel', config=None): 119 | pass 120 | 121 | 122 | 123 | class BasicDeepModel(BasicModel): 124 | 125 | def __init__(self, config=None, name='BasicDeepModel', model_summary=True, model_plot=False, 126 | token_level=None, structured=None, bert_flag=None): 127 | # 基本信息 128 | if token_level: 129 | config.token_level = token_level 130 | if structured: 131 | config.structured = structured 132 | if bert_flag: 133 | config.bert_flag = bert_flag 134 | self.config = config 135 | stru_postfix = '_stru-' + config.structured if config.structured != 'none' else '' 136 | bert_postfix = '_bert' if config.bert_flag else '' 137 | self.name = name + '_level-' + config.token_level + stru_postfix + bert_postfix 138 | 139 | 140 | # 任务类型决定了类别数量、激活函数和损失函数 141 | if config.task == 'binary': # 单标签二分类 142 | self.n_classes = 1 143 | self.activation = 'sigmoid' 144 | self.loss = 'binary_crossentropy' 145 | self.metrics = ['accuracy'] 146 | elif config.task == 'categorical': # 单标签多分类 147 | self.n_classes = config.N_CLASSES 148 | self.activation = 'softmax' 149 | self.loss = 'categorical_crossentropy' 150 | self.metrics = ['accuracy'] # TODO ??? 151 | elif config.task == 'multilabel': # 多标签二分类(多标签多分类需转化为多标签二分类) 152 | self.n_classes = config.N_CLASSES 153 | self.activation = 'sigmoid' 154 | self.loss = 'binary_crossentropy' 155 | self.metrics = ['accuracy'] 156 | 157 | 158 | # TODO 能不能删除这些self.xxx,而直接使用self.config.xxx来代替!? 
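# --- Illustrative sketch (standalone; not part of BasicDeepModel) ----------------------
# A toy run of the set-based multilabel metrics defined earlier in this class, for the
# 'multilabel' task configured just above.  The label names below are made up.
# Note: the textbook F1 is the harmonic mean 2*P*R/(P+R); multilabel_precision_recall
# above computes P*R/(P+R), i.e. half of that value.
ys_pred_demo = [['price', 'battery'], ['screen'], ['battery', 'camera']]
ys_true_demo = [['price'], ['screen', 'camera'], ['battery']]
hit = sum(len(set(p) & set(t)) for p, t in zip(ys_pred_demo, ys_true_demo))  # 3 hits
n_pred = sum(len(set(p)) for p in ys_pred_demo)                              # 5 predicted labels
n_true = sum(len(set(t)) for t in ys_true_demo)                              # 4 true labels
p_demo, r_demo = hit / n_pred, hit / n_true                                  # 0.6, 0.75
f1_demo = 2 * p_demo * r_demo / (p_demo + r_demo)                            # ~0.667
# --- end sketch -------------------------------------------------------------------------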
159 | # word相关 160 | self.word_maxlen = config.WORD_MAXLEN 161 | self.word_vocab_size = config.WORD_VOCAB_SIZE 162 | self.word_embed_dim = config.WORD_EMBED_DIM 163 | self.word_embed_matrix = config.word_embed_matrix 164 | 165 | # char相关 166 | self.char_maxlen = config.CHAR_MAXLEN 167 | self.char_vocab_size = config.CHAR_VOCAB_SIZE 168 | self.char_embed_dim = config.CHAR_EMBED_DIM 169 | self.char_embed_matrix = config.char_embed_matrix 170 | 171 | # KFold相关 172 | self.n_folds = config.n_folds 173 | self.kfold = KFold(n_splits=config.n_folds, shuffle=True, random_state=10) 174 | 175 | # Model相关 176 | self.masking_value = config.PAD_IDX # TODO mask PAD 突然想到:与PyTorch中的packed_padding和padded_packing相同功能??? 177 | self.create_model(model_summary, model_plot) 178 | 179 | # Train相关 180 | self.n_epochs = 20 181 | self.batch_size = config.BATCH_SIZE 182 | self.init_lr = 0.001 183 | 184 | # Callback 185 | self.lr_schedule = None 186 | self.early_stopping = None 187 | self.snap_epochs = 10 # TODO ? 188 | self.snapshot = None 189 | self.checkpoint = None 190 | 191 | # Predict相关 192 | self.label_binarizer = config.label_binarizer 193 | 194 | 195 | def create_model(self, model_summary=True, model_plot=False): 196 | """调用当前类的build_layers生成通用layers,调用子类的build_model生成model""" 197 | self.build_layers() 198 | self.build_model() 199 | if self.config.n_gpus > 1: 200 | self.model = multi_gpu_model(self.model, gpus=self.config.n_gpus) 201 | if model_summary: 202 | self.model.summary() 203 | if model_plot: 204 | plot_model(self.model, to_file=self.name+'.png', show_shapes=True) 205 | 206 | 207 | def build_layers(self): 208 | """创建DeepModel通用的Layers: Input, Masking, Embedding""" 209 | if self.config.token_level == 'word': 210 | self.word_input = Input(shape=(self.word_maxlen, ), dtype='int32', name='word') 211 | self.word_masking = Masking(mask_value=self.masking_value) 212 | self.word_embedding = Embedding(self.word_vocab_size, self.word_embed_dim, weights=[self.word_embed_matrix], name='word_embedding') 213 | elif self.config.token_level == 'char': 214 | self.char_input = Input(shape=(self.char_maxlen, ), dtype='int32', name='char') 215 | self.char_masking = Masking(mask_value=self.masking_value) 216 | self.char_embedding = Embedding(self.char_vocab_size, self.char_embed_dim, weights=[self.char_embed_matrix], name='char_embedding') 217 | else: 218 | self.word_input = Input(shape=(self.word_maxlen, ), dtype='int32', name='word') 219 | self.char_input = Input(shape=(self.char_maxlen, ), dtype='int32', name='char') 220 | self.word_masking = Masking(mask_value=self.masking_value) 221 | self.char_masking = Masking(mask_value=self.masking_value) 222 | self.word_embedding = Embedding(self.word_vocab_size, self.word_embed_dim, weights=[self.word_embed_matrix], name='word_embedding') 223 | self.char_embedding = Embedding(self.char_vocab_size, self.char_embed_dim, weights=[self.char_embed_matrix], name='char_embedding') 224 | 225 | # 结构化特征 226 | word_structured = Input(shape=(self.config.word_svd_n_componets, ), dtype='float32', name='word_structured') 227 | char_structured = Input(shape=(self.config.char_svd_n_componets, ), dtype='float32', name='char_structured') 228 | if self.config.structured == 'word': 229 | # TODO 只支持LSA特征,暂不支持TFIDF特征,因为维度太大 230 | self.structured_input = [word_structured] # 放在[]中是方便添加到别的列表中,比如Input列表和Tensor列表 231 | elif self.config.structured == 'char': 232 | self.structured_input = [char_structured] 233 | elif self.config.structured == 'both': 234 | self.structured_input = [word_structured, 
char_structured] 235 | 236 | # Bert编码向量 237 | if self.config.bert_flag: 238 | self.word_input = Input(shape=(self.config.bert_maxlen, self.config.bert_dim, ), dtype='float32', name='word_bert') # 输入是2维! 239 | self.word_masking = Masking(mask_value=self.masking_value) 240 | self.word_embedding = None 241 | 242 | 243 | def lr_decay_poly(self, epoch, alpha=0.5, beta=12): 244 | """训练learning rate衰减schedular""" 245 | # TODO 哪种衰减??? 246 | init_lr = self.init_lr 247 | lr = init_lr * alpha * ((1 + epoch) // beta) 248 | print(f'Epoch: {1 + epoch}, lr: {lr}, wd: {self.wd}') 249 | return lr 250 | 251 | 252 | def plot_history(self, history, i_fold=None): 253 | """绘制训练loss和accuracy,并保存图片""" 254 | if not isinstance(history, dict): 255 | history = history.history 256 | epochs = np.arange(0, len(history['loss'])) 257 | plt.style.use('ggplot') 258 | plt.figure() 259 | plt.plot(epochs, history['loss'], label='train_loss') 260 | plt.plot(epochs, history['val_loss'], label='val_loss') 261 | plt.plot(epochs, history['acc'], label='train_acc') 262 | plt.plot(epochs, history['val_acc'], label='val_acc') 263 | plt.title(self.name + ' (mode=' + str(self.mode) + ')') 264 | plt.xlabel('Epoch #') 265 | plt.ylabel('Loss & Accuracy') 266 | plt.legend() 267 | os.makedirs('history', exist_ok=True) 268 | postfix = '-fold' + str(i_fold) if i_fold else '' 269 | plt.savefig('history/' + self.name + '-mode' + str(self.mode) + postfix + '.png') 270 | plt.close() 271 | 272 | 273 | def plot_histories(self, history1, history2, i_fold=None): 274 | """绘制两阶段训练的loss和accuracy,并保存图片""" 275 | history1, history2 = history1.history, history2.history 276 | history = {} 277 | history['loss'] = history1['loss'] + history2['loss'] 278 | history['val_loss'] = history1['val_loss'] + history2['val_loss'] 279 | history['acc'] = history1['acc'] + history2['acc'] 280 | history['val_acc'] = history1['val_acc'] + history2['val_acc'] 281 | self.plot_history(history, i_fold) 282 | 283 | 284 | def embedding_trainable(self, trainable=True): 285 | """是否解冻Embedding Layer""" 286 | if self.config.token_level == 'both': 287 | self.model.get_layer('char_embedding').trainable = trainable 288 | if not self.config.bert_flag: 289 | self.model.get_layer('word_embedding').trainable = trainable 290 | elif self.config.token_level == 'word': 291 | if not self.config.bert_flag: 292 | self.model.get_layer('word_embedding').trainable = trainable 293 | elif self.config.token_level == 'char': 294 | self.model.get_layer('char_embedding').trainable = trainable 295 | else: 296 | exit('Wrong Token Level') 297 | 298 | 299 | def _evaluate(self, x_test, y_test): 300 | """模型评估""" 301 | _, test_acc = self.model.evaluate(x_test, y_test) 302 | test_pred = self.model.predict(x_test, verbose=1) 303 | scores = self.multilabel_precision_recall(test_pred, y_test) 304 | vectors, sims = self.multilabel_distribution_similarity(test_pred, y_test) 305 | print('------------------ Final: Test Metrics: ------------------') 306 | print('Test Accuracy: ' + str(round(test_acc, 4))) 307 | print('Precision: ' + str(scores[0]) + ' Recall: ' + str(scores[1]) + ' F1score: ' + str(scores[2])) 308 | print('Cosine: ' + str(sims[0]) + ' Entropy: ' + str(sims[1]) + ' Euclidean: ' + str(round(sims[2], 1)) + ' Manhattan: ' + str(sims[3])) 309 | return test_acc, scores, sims, vectors, test_pred 310 | 311 | 312 | def train_evaluate(self, x_train, y_train, x_test, y_test, lr=1e-3, epochs=None): 313 | """ 314 | 模型训练和评估 315 | x_train/x_test是字典(key=Input创建时的name, value=Input对应的数据),能够支持多输入 316 | """ 317 | # 模型训练 318 | 
print('【' + self.name + '】') 319 | if self.config.bert_flag: # 以Bert编码向量作为输入的模型 320 | epochs = epochs if epochs else self.n_epochs 321 | print('---------------------------------------------------------------------') 322 | optimizer = Adam(lr=lr) 323 | self.model.compile(loss=self.loss, optimizer=optimizer, metrics=self.metrics) 324 | history = self.model.fit(x_train, y_train, 325 | batch_size=self.batch_size*self.config.n_gpus, 326 | epochs=epochs, 327 | validation_split=0.3) 328 | else: 329 | self.mode = 3 330 | epochs = epochs if epochs else (2, self.n_epochs) 331 | print('-------------------Step1: 前期冻结Embedding层,编译和训练模型-------------------') 332 | self.embedding_trainable(False) 333 | print('Embedding Trainable: ' + str(self.model.get_layer('word_embedding').trainable)) 334 | optimizer = Adam(lr=lr, clipvalue=2.4) # clipvalue不应该写死,或者使用默认值!下同 335 | self.model.compile(loss=self.loss, optimizer=optimizer, metrics=self.metrics) 336 | history1 = self.model.fit(x_train, y_train, 337 | batch_size=self.batch_size*self.config.n_gpus, 338 | epochs=epochs[0], 339 | validation_split=0.3) 340 | print('-------------Step2: 训练完参数后,解冻Embedding层,再次编译和训练模型-------------') 341 | self.embedding_trainable(True) 342 | print('Embedding Trainable: ' + str(self.model.get_layer('word_embedding').trainable)) 343 | optimizer = Adam(lr=lr, clipvalue=1.5) 344 | self.model.compile(loss=self.loss, optimizer=optimizer, metrics=self.metrics) 345 | #callbacks = [self.lr_schedule, self.checkpoint, ] # TODO self.checkpoint??? 346 | history2 = self.model.fit(x_train, y_train, 347 | batch_size=self.batch_size*self.config.n_gpus, 348 | epochs=epochs[1], 349 | validation_split=0.3, 350 | callbacks=None) 351 | self.plot_history(history2) 352 | history = (history1, history2) 353 | 354 | # 模型评估 355 | test_acc, scores, sims, vectors, test_pred = self._evaluate(x_test, y_test) 356 | pickle.dump(test_pred, open('./result/' + self.name + '_test_pred.pkl', 'wb')) 357 | return test_acc, scores, sims, vectors, history 358 | 359 | 360 | def model_compile_fit(self, data_fold, optimizer='adam', callbacks=None, epochs=None, model_file=None): 361 | """模型编译和训练Helper Function,支持各种配置""" 362 | x_train, y_train, x_val, y_val = data_fold 363 | epochs = epochs if epochs else self.n_epochs 364 | self.model.compile(loss=self.loss, optimizer=optimizer, metrics=self.metrics) # TODO 多标签时accuracy含义是什么? 
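        # (Re the TODO above: in Keras 2.x, with a sigmoid output and binary_crossentropy the
        # 'accuracy' string is resolved to binary_accuracy, i.e. each of the N_CLASSES outputs is
        # thresholded at 0.5 and the per-label hits are averaged -- not exact-match subset accuracy.)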
365 | history = self.model.fit(x_train, y_train, 366 | batch_size=self.batch_size*self.config.n_gpus, 367 | epochs=epochs, 368 | validation_data=(x_val, y_val), 369 | callbacks=callbacks) 370 | if model_file: 371 | self.model.save_weights(model_file) 372 | return history 373 | 374 | 375 | def train_evaluate_cv(self, x_train, y_train, x_test, mode=3): 376 | """ 377 | 使用KFold方式训练模型,应用于x_train和x_test 378 | x_train/x_test是字典(key=Input创建时的name, value=Input对应的数据),表示多输入 379 | model: 训练模式,包括各种Finetuning策略等 380 | """ 381 | self.mode = mode 382 | checkpoint_path = 'checkpoint-mode' + str(mode) + '/' + self.name + '/' 383 | os.makedirs(checkpoint_path, exist_ok=True) 384 | # 先保存训练前的原始模型(参数和状态处于初始状态),以便于后续KFold时每次加载的都是原始模型(line359),保证起点一致,各Fold之间互不影响 385 | init_model_file = checkpoint_path + 'init_weight.h5' 386 | self.model.save_weights(init_model_file) 387 | 388 | # KFold循环前准备 389 | test_pred = np.zeros((len(x_test['word']), self.n_classes)) # K次预测结果的平均值(要对x_test预测K次) 390 | train_pred = np.zeros((len(x_train['word']), self.n_classes)) # K次预测结果不重不漏地覆盖所有x_train 391 | scores_pre, scores_rec, scores_f1, scores_sim = [], [], [], [] 392 | 393 | for i_fold, (train_index, val_index) in enumerate(self.kfold.split(x_train['word'])): 394 | self.model.load_weights(init_model_file) # 每次KFold开始时加载的都是原始模型 395 | 396 | # 取数:X和Y 397 | x_train_fold, x_val_fold = {}, {} 398 | # TODO 改到__init__里,自动取舍各name! 399 | for key in ['word', 'word_left', 'word_right', 'word_structured', 'char', 'char_left', 'char_right', 'char_structured']: # 对应model创建时Input的name 400 | x_train_fold[key] = x_train[key][train_index] 401 | x_val_fold[key] = x_train[key](val_index) 402 | y_train_fold, y_val_fold = y_train[train_index], y_train[val_index] 403 | data_fold = (x_train_fold, y_train_fold, x_val_fold, y_val_fold) 404 | 405 | 406 | # 创建Callbacks: checkpoint, snapshot 407 | model_prefix = checkpoint_path + '/' + str(i_fold) 408 | os.makedirs(model_prefix, exist_ok=True) 409 | model_file = model_prefix + '/k' + str(i_fold) + '_model.h5' 410 | checkpoint = ModelCheckpoint(model_file, monitor='val_loss', verbose=1, save_best_only=True, mode='min') # TODO min??? 411 | snapshot = self.snapshot.get_callbacks(model_save_place=model_prefix) 412 | # TODO 创建callbacks不规范,有的在__init__中,有的在每次KFold内各mode前,有的在某mode内!最好统一规范一下! 
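# --- Illustrative sketch (editor's suggestion for the TODO above) -----------------------
# One way to build all per-fold callbacks in a single helper instead of scattering them
# across __init__ and the mode branches below.  The function name and default values are
# assumptions, not project code; it only uses classes already imported at the top of this
# file.  (Also note: the val-fold lookup a few lines above calls x_train[key](val_index);
# square-bracket indexing, x_train[key][val_index], is presumably what was intended.)
def make_fold_callbacks(model_file, patience=3, lr_factor=0.5, lr_patience=2):
    """Return a standard callback list for one KFold fold."""
    return [
        ModelCheckpoint(model_file, monitor='val_loss', save_best_only=True, mode='min', verbose=1),
        EarlyStopping(monitor='val_loss', patience=patience, verbose=1),
        ReduceLROnPlateau(monitor='val_loss', factor=lr_factor, patience=lr_patience, verbose=1),
    ]
# --- end sketch --------------------------------------------------------------------------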
413 | 414 | 415 | # 模型编译和训练 416 | # 支持6种模式 417 | # 1 = 一直冻结,一次编译和训练 418 | # 2,3 = 前期冻结,后期解冻,两次编译和训练 419 | # 4,5,6 = 一直解冻,一次编译和训练 420 | if mode == 1: 421 | # 一直冻结Embedding,使用snapshot方式训练模型 422 | self.embedding_trainable(False) 423 | optimizer = Adam(lr=1e-3, clipvalue=2.0) 424 | callbacks = [snapshot, ] 425 | history = self.model_compile_fit(data_fold, optimizer, callbacks, epochs=self.snap_epoch, model_file=None) 426 | 427 | elif mode == 2: 428 | # 前期冻结Embedding层,模型编译和训练 429 | self.embedding_trainable(False) 430 | optimizer = Adam(lr=1e-3, clipvalue=2.0) 431 | history1 = self.model_compile_fit(data_fold, optimizer, epochs=6) 432 | # 训练好参数后,解冻Embedding层,再次编译,使用snapshot方式训练模型 433 | self.embedding_trainable(True) 434 | optimizer = 'adam' 435 | callbacks = [snapshot, ] 436 | history2 = self.model_compile_fit(data_fold, optimizer, callbacks, epochs=self.snap_epoch, model_file=None) 437 | 438 | elif mode == 3: 439 | # 前期冻结Embedding层,模型编译和训练 440 | self.embedding_trainable(False) 441 | optimizer = Adam(lr=1e-3, clipvalue=2.4) 442 | history1 = self.model_compile_fit(data_fold, optimizer, epochs=2, model_file=None) 443 | # 训练好参数后,解冻Embedding层,再次编译,训练模型 444 | self.embedding_trainable(True) 445 | optimizer = Adam(lr=1e-3, clipvalue=1.5) 446 | callbacks = [self.lr_schedule, checkpoint, ] 447 | history2 = self.model_compile_fit(data_fold, optimizer, callbacks, epochs=10, model_file=None) 448 | self.plot_histories(history1, history2, i_fold) 449 | 450 | elif mode == 4: 451 | # 一直解冻Embedding层,编译和训练模型 452 | if self.config.n_gpus == 1: # TODO 为什么gpu=1时为True,=2时呢?为False??? 注意,默认为True 453 | self.embedding_trainable(True) 454 | optimizer = SGD(lr=self.init_lr, momentum=0.9, decay=1e-6) 455 | callbacks = [LearningRateScheduler(self.poly_decay), self.early_stopping, ] 456 | history = self.model_compile_fit(data_fold, optimizer, callbacks, model_file=model_file) 457 | self.plot_history(history, i_fold) 458 | 459 | elif mode == 5: 460 | # 一直解冻Embedding层,编译和训练模型 461 | optimizer = Adam(lr=1e-3, clipnorm=1.0) 462 | callbacks = [self.lr_schedule, checkpoint, ] 463 | history = self.model_compile_fit(data_fold, optimizer, callbacks, epochs=20, model_file=None) 464 | self.plot_history(history, i_fold) 465 | 466 | elif mode == 6: 467 | # 一直解冻Embedding层,编译,使用snapshot方式训练模型 468 | if self.config.n_gpus == 1: 469 | self.embedding_trainable(True) 470 | optimizer = Adam(lr=self.init_lr, decay=1e-6) 471 | callbacks = [snapshot, ] 472 | history = self.model_compile_fit(data_fold, optimizer, callbacks, model_file=None) 473 | self.plot_history(history, i_fold) 474 | 475 | else: 476 | exit('Wrong mode! 
mode must be in (1, 2, 3, 4, 5, 6)') 477 | 478 | 479 | # 模型评估 480 | h5models = [x for x in os.listdir(model_prefix) if '.h5' in x] 481 | print(h5models) 482 | test_pred_fold = np.zeros((len(x_test['word']), self.n_class)) # 预测test,按模型个数取平均值 483 | val_pred_fold = np.zeros((len(x_val_fold['word']), self.n_class)) # 预测val,按模型个数取平均值 484 | for h5file in h5models: 485 | self.model.load_weights(os.path.join(model_prefix, h5file)) 486 | test_pred_fold += self.model.predict(x_test, verbose=1) / len(h5models) 487 | val_pred_fold += self.model.predict(x_val_fold, batch_size=64*self.config.n_gpus) / len(h5models) 488 | 489 | test_pred += test_pred_fold / self.n_folds # 按KFold取平均值 490 | train_pred[val_index] = val_pred_fold 491 | 492 | precision, recall, f1score = self.multilabel_precision_recall(val_pred_fold, y_val_fold) 493 | vectors, sims = self.multilabel_distribution_similarity(val_pred_fold, y_val_fold) 494 | print('KFold CV precision = ' + str(precision)) 495 | print('KFold CV recall = ' + str(recall)) 496 | print('KFold CV f1score = ' + str(f1score)) 497 | print('KFold CV similarity = ' + str(sims[0])) 498 | scores_pre.append(precision) 499 | scores_rec.append(recall) 500 | scores_f1.append(f1score) 501 | scores_sim.append(sims[0]) 502 | 503 | 504 | # KFold结束后,保存预测结果 505 | print('Total precision = ' + str(np.mean(scores_pre))) 506 | print('Total recall = ' + str(np.mean(scores_rec))) 507 | print('Total f1score = ' + str(np.mean(scores_f1))) 508 | print('Total similarity = ' + str(np.mean(scores_sim))) 509 | result_prefix = './result/mode' + str(mode) + '_' 510 | result_postfix = 'f1_' + str(np.mean(scores_f1)) + 'pre_' + str(np.mean(scores_pre)) + 'rec_' + str(np.mean(scores_rec)) + '.pkl' 511 | #os.makedirs(result_prefix, exist_ok=True) 512 | pickle.dump(train_pred, open(result_prefix + self.name + '_oof_' + result_postfix, 'wb')) 513 | pickle.dump(test_pred, open(result_prefix + self.name + '_test_' + result_postfix, 'wb')) 514 | 515 | 516 | def load_model(self, model_file): 517 | """加载模型及权重""" 518 | self.model = load_model(model_file) 519 | 520 | 521 | def load_weights(self, weights_file): 522 | """加载模型的权重""" 523 | self.model.load_weights(weights_file) 524 | 525 | 526 | -------------------------------------------------------------------------------- /model/Bert/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | -------------------------------------------------------------------------------- /model/Bert/args.py: -------------------------------------------------------------------------------- 1 | import os 2 | from enum import Enum 3 | 4 | 5 | class PoolingStrategy(Enum): 6 | NONE = 0 7 | REDUCE_MAX = 1 8 | REDUCE_MEAN = 2 9 | REDUCE_MEAN_MAX = 3 10 | FIRST_TOKEN = 4 # corresponds to [CLS] for single sequences 11 | LAST_TOKEN = 5 # corresponds to [SEP] for single sequences 12 | CLS_TOKEN = 4 # corresponds to the first token for single seq. 13 | SEP_TOKEN = 5 # corresponds to the last token for single seq. 14 | 15 | def __str__(self): 16 | return self.name 17 | 18 | @staticmethod 19 | def from_string(s): 20 | try: 21 | return PoolingStrategy[s] 22 | except KeyError: 23 | raise ValueError() 24 | 25 | xla = True 26 | # list of int. this model has 12 layers, By default this program works on the second last layer. The last layer is too 27 | # closed to the target functions,If you question about this argument and want to use the last hidden layer anyway, please 28 | # feel free to set layer_indexes=[-1], so we use the second last layer 29 | layer_indexes = [-2] 30 | -------------------------------------------------------------------------------- /model/Bert/extract_feature.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import sys 3 | import os 4 | import tempfile 5 | import random 6 | import json 7 | import logging 8 | from termcolor import colored 9 | import contextlib 10 | from queue import Queue 11 | from threading import Thread 12 | import tensorflow as tf 13 | 14 | sys.path.append(os.path.join(os.path.dirname(__file__), '../')) 15 | from model.Bert import modeling 16 | from model.Bert import tokenization 17 | from model.Bert import args 18 | from model.Bert.args import PoolingStrategy 19 | 20 | 21 | def import_tf(device_id=-1, verbose=False): 22 | #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' if device_id < 0 else str(device_id) 23 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0' if verbose else '3' 24 | tf.logging.set_verbosity(tf.logging.DEBUG if verbose else tf.logging.ERROR) 25 | return tf 26 | 27 | tf = import_tf(0, True) 28 | 29 | 30 | def set_logger(context, verbose=False): 31 | logger = logging.getLogger(context) 32 | logger.setLevel(logging.DEBUG if verbose else logging.INFO) 33 | formatter = logging.Formatter( 34 | '%(levelname)-.1s:' + context + ':[%(filename).5s:%(funcName).3s:%(lineno)3d]:%(message)s', datefmt= 35 | '%m-%d %H:%M:%S') 36 | console_handler = logging.StreamHandler() 37 | console_handler.setLevel(logging.DEBUG if verbose else logging.INFO) 38 | console_handler.setFormatter(formatter) 39 | logger.handlers = [] 40 | logger.addHandler(console_handler) 41 | return logger 42 | 43 | 44 | class InputExample(object): 45 | 46 | def __init__(self, unique_id, text_a, text_b): 47 | self.unique_id = unique_id 48 | self.text_a = text_a 49 | self.text_b = text_b 50 | 51 | 52 | class InputFeatures(object): 53 | """A single set of features of data.""" 54 | 55 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): 56 | self.unique_id = unique_id 57 | self.tokens = tokens 58 | self.input_ids = input_ids 59 | self.input_mask = input_mask 60 | self.input_type_ids = input_type_ids 61 | 62 | 63 | def optimize_graph(config_name, 64 | ckpt_name, 65 | logger=None, 66 | verbose=False, 67 | pooling_strategy=PoolingStrategy.REDUCE_MEAN, 68 | max_seq_len=40, 69 | graph_tmpfile="./tmpxxx"): 70 | if not logger: 71 | 
logger = set_logger(colored('BERT_VEC', 'yellow'), verbose) 72 | try: 73 | # we don't need GPU for optimizing the graph 74 | tf = import_tf(device_id=0, verbose=verbose) 75 | from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference 76 | 77 | # allow_soft_placement:自动选择运行设备 78 | config = tf.ConfigProto(allow_soft_placement=True) 79 | config_fp = config_name 80 | init_checkpoint = ckpt_name 81 | logger.info('model config: %s' % config_fp) 82 | 83 | # 加载bert配置文件 84 | with tf.gfile.GFile(config_fp, 'r') as f: 85 | bert_config = modeling.BertConfig.from_dict(json.load(f)) 86 | 87 | logger.info('build graph...') 88 | # input placeholders, not sure if they are friendly to XLA 89 | input_ids = tf.placeholder(tf.int32, (None, max_seq_len), 'input_ids') 90 | input_mask = tf.placeholder(tf.int32, (None, max_seq_len), 'input_mask') 91 | input_type_ids = tf.placeholder(tf.int32, (None, max_seq_len), 'input_type_ids') 92 | 93 | # xla加速 94 | jit_scope = tf.contrib.compiler.jit.experimental_jit_scope if args.xla else contextlib.suppress 95 | 96 | with jit_scope(): 97 | input_tensors = [input_ids, input_mask, input_type_ids] 98 | 99 | model = modeling.BertModel( 100 | config=bert_config, 101 | is_training=False, 102 | input_ids=input_ids, 103 | input_mask=input_mask, 104 | token_type_ids=input_type_ids, 105 | use_one_hot_embeddings=False) 106 | 107 | # 获取所有要训练的变量 108 | tvars = tf.trainable_variables() 109 | 110 | (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, 111 | init_checkpoint) 112 | 113 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 114 | 115 | minus_mask = lambda x, m: x - tf.expand_dims(1.0 - m, axis=-1) * 1e30 116 | mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1) 117 | masked_reduce_max = lambda x, m: tf.reduce_max(minus_mask(x, m), axis=1) 118 | masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / ( 119 | tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10) 120 | 121 | # 共享卷积核 122 | with tf.variable_scope("pooling"): 123 | # 如果只有一层,就只取对应那一层的weight 124 | if len(args.layer_indexes) == 1: 125 | encoder_layer = model.all_encoder_layers[args.layer_indexes[0]] 126 | else: 127 | # 否则遍历需要取的层,把所有层的weight取出来并拼接起来shape:768*层数 128 | all_layers = [model.all_encoder_layers[l] for l in args.layer_indexes] 129 | encoder_layer = tf.concat(all_layers, -1) 130 | 131 | input_mask = tf.cast(input_mask, tf.float32) 132 | 133 | # 以下代码是句向量的生成方法,可以理解为做了一个卷积的操作,但是没有把结果相加, 卷积核是input_mask 134 | if pooling_strategy == PoolingStrategy.REDUCE_MEAN: 135 | pooled = masked_reduce_mean(encoder_layer, input_mask) 136 | elif pooling_strategy == PoolingStrategy.REDUCE_MAX: 137 | pooled = masked_reduce_max(encoder_layer, input_mask) 138 | elif pooling_strategy == PoolingStrategy.REDUCE_MEAN_MAX: 139 | pooled = tf.concat([masked_reduce_mean(encoder_layer, input_mask), 140 | masked_reduce_max(encoder_layer, input_mask)], axis=1) 141 | elif pooling_strategy == PoolingStrategy.FIRST_TOKEN or \ 142 | pooling_strategy == PoolingStrategy.CLS_TOKEN: 143 | pooled = tf.squeeze(encoder_layer[:, 0:1, :], axis=1) 144 | elif pooling_strategy == PoolingStrategy.LAST_TOKEN or \ 145 | pooling_strategy == PoolingStrategy.SEP_TOKEN: 146 | seq_len = tf.cast(tf.reduce_sum(input_mask, axis=1), tf.int32) 147 | rng = tf.range(0, tf.shape(seq_len)[0]) 148 | indexes = tf.stack([rng, seq_len - 1], 1) 149 | pooled = tf.gather_nd(encoder_layer, indexes) 150 | elif pooling_strategy == PoolingStrategy.NONE: 151 | pooled = mul_mask(encoder_layer, 
input_mask) 152 | else: 153 | raise NotImplementedError() 154 | 155 | pooled = tf.identity(pooled, 'final_encodes') 156 | 157 | output_tensors = [pooled] 158 | tmp_g = tf.get_default_graph().as_graph_def() 159 | 160 | with tf.Session(config=config) as sess: 161 | logger.info('load parameters from checkpoint...') 162 | sess.run(tf.global_variables_initializer()) 163 | logger.info('freeze...') 164 | tmp_g = tf.graph_util.convert_variables_to_constants(sess, tmp_g, [n.name[:-2] for n in output_tensors]) 165 | dtypes = [n.dtype for n in input_tensors] 166 | logger.info('optimize...') 167 | tmp_g = optimize_for_inference( 168 | tmp_g, 169 | [n.name[:-2] for n in input_tensors], 170 | [n.name[:-2] for n in output_tensors], 171 | [dtype.as_datatype_enum for dtype in dtypes], 172 | False) 173 | logger.info('write graph to a tmp file: %s' % graph_tmpfile) 174 | with tf.gfile.GFile(graph_tmpfile, 'wb') as f: 175 | f.write(tmp_g.SerializeToString()) 176 | return graph_tmpfile 177 | except Exception as e: 178 | logger.error('fail to optimize the graph!') 179 | logger.error(e) 180 | 181 | 182 | class BertVector: 183 | def __init__(self, batch_size=1, 184 | pooling_strategy="REDUCE_MEAN", 185 | max_seq_len=40, 186 | bert_model_path="./chinese_L-12_H-768_A-12/", 187 | graph_tmpfile="./tmpxxx"): 188 | """ 189 | init BertVector 190 | :param batch_size: Depending on your memory default is 32 191 | """ 192 | self.max_seq_length = max_seq_len 193 | self.layer_indexes = args.layer_indexes 194 | self.gpu_memory_fraction = 1 195 | 196 | self.file_path = os.path.dirname(__file__) 197 | 198 | self.model_dir = os.path.join(self.file_path, bert_model_path) 199 | self.config_name = os.path.join(self.model_dir, 'bert_config.json') 200 | self.ckpt_name = os.path.join(self.model_dir, 'bert_model.ckpt') 201 | self.vocab_file = os.path.join(self.model_dir, 'vocab.txt') 202 | 203 | if pooling_strategy == "NONE": 204 | pooling_strategy = PoolingStrategy.NONE 205 | elif pooling_strategy == "REDUCE_MAX": 206 | pooling_strategy = PoolingStrategy.REDUCE_MAX 207 | elif pooling_strategy == "REDUCE_MEAN": 208 | pooling_strategy = PoolingStrategy.REDUCE_MEAN 209 | elif pooling_strategy == "REDUCE_MEAN_MAX": 210 | pooling_strategy = PoolingStrategy.REDUCE_MEAN_MAX 211 | 212 | self.graph_path = optimize_graph(self.config_name, 213 | self.ckpt_name, 214 | pooling_strategy=pooling_strategy, 215 | max_seq_len=self.max_seq_length, 216 | graph_tmpfile=graph_tmpfile) 217 | 218 | self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_file, do_lower_case=True) 219 | self.batch_size = batch_size 220 | self.estimator = self.get_estimator() 221 | self.input_queue = Queue(maxsize=1) 222 | self.output_queue = Queue(maxsize=1) 223 | self.predict_thread = Thread(target=self.predict_from_queue, daemon=True) 224 | self.predict_thread.start() 225 | 226 | def get_estimator(self): 227 | from tensorflow.python.estimator.estimator import Estimator 228 | from tensorflow.python.estimator.run_config import RunConfig 229 | from tensorflow.python.estimator.model_fn import EstimatorSpec 230 | 231 | def model_fn(features, labels, mode, params): 232 | with tf.gfile.GFile(self.graph_path, 'rb') as f: 233 | graph_def = tf.GraphDef() 234 | graph_def.ParseFromString(f.read()) 235 | 236 | input_names = ['input_ids', 'input_mask', 'input_type_ids'] 237 | 238 | output = tf.import_graph_def(graph_def, 239 | input_map={k + ':0': features[k] for k in input_names}, 240 | return_elements=['final_encodes:0']) 241 | 242 | return EstimatorSpec(mode=mode, 
predictions={ 243 | 'encodes': output[0] 244 | }) 245 | 246 | config = tf.ConfigProto() 247 | config.gpu_options.allow_growth = True 248 | config.gpu_options.per_process_gpu_memory_fraction = self.gpu_memory_fraction 249 | config.log_device_placement = False 250 | config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 251 | 252 | return Estimator(model_fn=model_fn, config=RunConfig(session_config=config), 253 | params={'batch_size': self.batch_size}) 254 | 255 | def predict_from_queue(self): 256 | prediction = self.estimator.predict(input_fn=self.queue_predict_input_fn, yield_single_examples=False) 257 | for i in prediction: 258 | self.output_queue.put(i) 259 | 260 | def encode(self, sentence): 261 | self.input_queue.put(sentence) 262 | prediction = self.output_queue.get() 263 | return prediction 264 | 265 | def queue_predict_input_fn(self): 266 | 267 | return (tf.data.Dataset.from_generator( 268 | self.generate_from_queue, 269 | output_types={'unique_ids': tf.int32, 270 | 'input_ids': tf.int32, 271 | 'input_mask': tf.int32, 272 | 'input_type_ids': tf.int32}, 273 | output_shapes={ 274 | 'unique_ids': (1,), 275 | 'input_ids': (None, self.max_seq_length), 276 | 'input_mask': (None, self.max_seq_length), 277 | 'input_type_ids': (None, self.max_seq_length)})) 278 | 279 | def generate_from_queue(self): 280 | while True: 281 | features = list(self.convert_examples_to_features(seq_length=self.max_seq_length, tokenizer=self.tokenizer)) 282 | yield { 283 | 'unique_ids': [f.unique_id for f in features], 284 | 'input_ids': [f.input_ids for f in features], 285 | 'input_mask': [f.input_mask for f in features], 286 | 'input_type_ids': [f.input_type_ids for f in features] 287 | } 288 | 289 | def input_fn_builder(self, features, seq_length): 290 | """Creates an `input_fn` closure to be passed to Estimator.""" 291 | 292 | all_unique_ids = [] 293 | all_input_ids = [] 294 | all_input_mask = [] 295 | all_input_type_ids = [] 296 | 297 | for feature in features: 298 | all_unique_ids.append(feature.unique_id) 299 | all_input_ids.append(feature.input_ids) 300 | all_input_mask.append(feature.input_mask) 301 | all_input_type_ids.append(feature.input_type_ids) 302 | 303 | def input_fn(params): 304 | """The actual input function.""" 305 | batch_size = params["batch_size"] 306 | 307 | num_examples = len(features) 308 | 309 | # This is for demo purposes and does NOT scale to large data sets. We do 310 | # not use Dataset.from_generator() because that uses tf.py_func which is 311 | # not TPU compatible. The right way to load data is with TFRecordReader. 
312 | d = tf.data.Dataset.from_tensor_slices({ 313 | "unique_ids": 314 | tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32), 315 | "input_ids": 316 | tf.constant( 317 | all_input_ids, shape=[num_examples, seq_length], 318 | dtype=tf.int32), 319 | "input_mask": 320 | tf.constant( 321 | all_input_mask, 322 | shape=[num_examples, seq_length], 323 | dtype=tf.int32), 324 | "input_type_ids": 325 | tf.constant( 326 | all_input_type_ids, 327 | shape=[num_examples, seq_length], 328 | dtype=tf.int32), 329 | }) 330 | 331 | d = d.batch(batch_size=batch_size, drop_remainder=False) 332 | return d 333 | 334 | return input_fn 335 | 336 | def model_fn_builder(self, bert_config, init_checkpoint, layer_indexes): 337 | """Returns `model_fn` closure for TPUEstimator.""" 338 | 339 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 340 | """The `model_fn` for TPUEstimator.""" 341 | 342 | unique_ids = features["unique_ids"] 343 | input_ids = features["input_ids"] 344 | input_mask = features["input_mask"] 345 | input_type_ids = features["input_type_ids"] 346 | 347 | jit_scope = tf.contrib.compiler.jit.experimental_jit_scope 348 | 349 | with jit_scope(): 350 | model = modeling.BertModel( 351 | config=bert_config, 352 | is_training=False, 353 | input_ids=input_ids, 354 | input_mask=input_mask, 355 | token_type_ids=input_type_ids) 356 | 357 | if mode != tf.estimator.ModeKeys.PREDICT: 358 | raise ValueError("Only PREDICT modes are supported: %s" % (mode)) 359 | 360 | tvars = tf.trainable_variables() 361 | 362 | (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, 363 | init_checkpoint) 364 | 365 | tf.logging.info("**** Trainable Variables ****") 366 | for var in tvars: 367 | init_string = "" 368 | if var.name in initialized_variable_names: 369 | init_string = ", *INIT_FROM_CKPT*" 370 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 371 | init_string) 372 | 373 | all_layers = model.get_all_encoder_layers() 374 | 375 | predictions = { 376 | "unique_id": unique_ids, 377 | } 378 | 379 | for (i, layer_index) in enumerate(layer_indexes): 380 | predictions["layer_output_%d" % i] = all_layers[layer_index] 381 | 382 | from tensorflow.python.estimator.model_fn import EstimatorSpec 383 | 384 | output_spec = EstimatorSpec(mode=mode, predictions=predictions) 385 | return output_spec 386 | 387 | return model_fn 388 | 389 | def convert_examples_to_features(self, seq_length, tokenizer): 390 | """Loads a data file into a list of `InputBatch`s.""" 391 | 392 | features = [] 393 | input_masks = [] 394 | examples = self._to_example(self.input_queue.get()) 395 | for (ex_index, example) in enumerate(examples): 396 | tokens_a = tokenizer.tokenize(example.text_a) 397 | 398 | # if the sentences's length is more than seq_length, only use sentence's left part 399 | if len(tokens_a) > seq_length - 2: 400 | tokens_a = tokens_a[0:(seq_length - 2)] 401 | 402 | # The convention in BERT is: 403 | # (a) For sequence pairs: 404 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 405 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 406 | # (b) For single sequences: 407 | # tokens: [CLS] the dog is hairy . [SEP] 408 | # type_ids: 0 0 0 0 0 0 0 409 | # 410 | # Where "type_ids" are used to indicate whether this is the first 411 | # sequence or the second sequence. The embedding vectors for `type=0` and 412 | # `type=1` were learned during pre-training and are added to the wordpiece 413 | # embedding vector (and position vector). 
This is not *strictly* necessary 414 | # since the [SEP] token unambiguously separates the sequences, but it makes 415 | # it easier for the model to learn the concept of sequences. 416 | # 417 | # For classification tasks, the first vector (corresponding to [CLS]) is 418 | # used as as the "sentence vector". Note that this only makes sense because 419 | # the entire model is fine-tuned. 420 | tokens = [] 421 | input_type_ids = [] 422 | tokens.append("[CLS]") 423 | input_type_ids.append(0) 424 | for token in tokens_a: 425 | tokens.append(token) 426 | input_type_ids.append(0) 427 | tokens.append("[SEP]") 428 | input_type_ids.append(0) 429 | 430 | # Where "input_ids" are tokens's index in vocabulary 431 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 432 | 433 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 434 | # tokens are attended to. 435 | input_mask = [1] * len(input_ids) 436 | input_masks.append(input_mask) 437 | # Zero-pad up to the sequence length. 438 | while len(input_ids) < seq_length: 439 | input_ids.append(0) 440 | input_mask.append(0) 441 | input_type_ids.append(0) 442 | 443 | assert len(input_ids) == seq_length 444 | assert len(input_mask) == seq_length 445 | assert len(input_type_ids) == seq_length 446 | 447 | if ex_index < 5: 448 | tf.logging.info("*** Example ***") 449 | tf.logging.info("unique_id: %s" % (example.unique_id)) 450 | tf.logging.info("tokens: %s" % " ".join( 451 | [tokenization.printable_text(x) for x in tokens])) 452 | tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 453 | tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 454 | tf.logging.info( 455 | "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids])) 456 | 457 | yield InputFeatures( 458 | unique_id=example.unique_id, 459 | tokens=tokens, 460 | input_ids=input_ids, 461 | input_mask=input_mask, 462 | input_type_ids=input_type_ids) 463 | 464 | def _truncate_seq_pair(self, tokens_a, tokens_b, max_length): 465 | """Truncates a sequence pair in place to the maximum length.""" 466 | 467 | # This is a simple heuristic which will always truncate the longer sequence 468 | # one token at a time. This makes more sense than truncating an equal percent 469 | # of tokens from each, since if one sequence is very short then each token 470 | # that's truncated likely contains more information than a longer sequence. 
471 | while True: 472 | total_length = len(tokens_a) + len(tokens_b) 473 | if total_length <= max_length: 474 | break 475 | if len(tokens_a) > len(tokens_b): 476 | tokens_a.pop() 477 | else: 478 | tokens_b.pop() 479 | 480 | @staticmethod 481 | def _to_example(sentences): 482 | import re 483 | """ 484 | sentences to InputExample 485 | :param sentences: list of strings 486 | :return: list of InputExample 487 | """ 488 | unique_id = 0 489 | for ss in sentences: 490 | line = tokenization.convert_to_unicode(ss) 491 | if not line: 492 | continue 493 | line = line.strip() 494 | text_a = None 495 | text_b = None 496 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 497 | if m is None: 498 | text_a = line 499 | else: 500 | text_a = m.group(1) 501 | text_b = m.group(2) 502 | yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b) 503 | unique_id += 1 504 | 505 | 506 | if __name__ == "__main__": 507 | bc = BertVector(batch_size=1, 508 | pooling_strategy="REDUCE_MEAN", 509 | max_seq_len=20, 510 | bert_model_path="D:\\workspaces\\code\\tfhub\\keras_dssm\\chinese_L-12_H-768_A-12\\", 511 | graph_tmpfile="D:\\workspaces\\code\\tfhub\\keras_dssm\\tmpxxx") 512 | query = u"新浪移动" 513 | vectors = bc.encode([query]) 514 | print(str(vectors)) 515 | -------------------------------------------------------------------------------- /model/Bert/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 
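  # Worked example: with init_lr=5e-5 and num_warmup_steps=1000, the lr at step 100 is
  # 100/1000 * 5e-5 = 5e-6; once global_step passes 1000, the (linear) polynomial decay
  # above takes over and reaches 0.0 at num_train_steps.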
42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | new_global_step = global_step + 1 80 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 81 | return train_op 82 | 83 | 84 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 85 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 86 | 87 | def __init__(self, 88 | learning_rate, 89 | weight_decay_rate=0.0, 90 | beta_1=0.9, 91 | beta_2=0.999, 92 | epsilon=1e-6, 93 | exclude_from_weight_decay=None, 94 | name="AdamWeightDecayOptimizer"): 95 | """Constructs a AdamWeightDecayOptimizer.""" 96 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 97 | 98 | self.learning_rate = learning_rate 99 | self.weight_decay_rate = weight_decay_rate 100 | self.beta_1 = beta_1 101 | self.beta_2 = beta_2 102 | self.epsilon = epsilon 103 | self.exclude_from_weight_decay = exclude_from_weight_decay 104 | 105 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 106 | """See base class.""" 107 | assignments = [] 108 | for (grad, param) in grads_and_vars: 109 | if grad is None or param is None: 110 | continue 111 | 112 | param_name = self._get_variable_name(param.name) 113 | 114 | m = tf.get_variable( 115 | name=param_name + "/adam_m", 116 | shape=param.shape.as_list(), 117 | dtype=tf.float32, 118 | trainable=False, 119 | initializer=tf.zeros_initializer()) 120 | v = tf.get_variable( 121 | name=param_name + "/adam_v", 122 | shape=param.shape.as_list(), 123 | dtype=tf.float32, 124 | trainable=False, 125 | initializer=tf.zeros_initializer()) 126 | 127 | # Standard Adam update. 128 | next_m = ( 129 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 130 | next_v = ( 131 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 132 | tf.square(grad))) 133 | 134 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 135 | 136 | # Just adding the square of the weights to the loss function is *not* 137 | # the correct way of using L2 regularization/weight decay with Adam, 138 | # since that will interact with the m and v parameters in strange ways. 
139 | # 140 | # Instead we want ot decay the weights in a manner that doesn't interact 141 | # with the m/v parameters. This is equivalent to adding the square 142 | # of the weights to the loss with plain (non-momentum) SGD. 143 | if self._do_use_weight_decay(param_name): 144 | update += self.weight_decay_rate * param 145 | 146 | update_with_lr = self.learning_rate * update 147 | 148 | next_param = param - update_with_lr 149 | 150 | assignments.extend( 151 | [param.assign(next_param), 152 | m.assign(next_m), 153 | v.assign(next_v)]) 154 | return tf.group(*assignments, name=name) 155 | 156 | def _do_use_weight_decay(self, param_name): 157 | """Whether to use L2 weight decay for `param_name`.""" 158 | if not self.weight_decay_rate: 159 | return False 160 | if self.exclude_from_weight_decay: 161 | for r in self.exclude_from_weight_decay: 162 | if re.search(r, param_name) is not None: 163 | return False 164 | return True 165 | 166 | def _get_variable_name(self, param_name): 167 | """Get the variable name from the tensor name.""" 168 | m = re.match("^(.*):\\d+$", param_name) 169 | if m is not None: 170 | param_name = m.group(1) 171 | return param_name 172 | -------------------------------------------------------------------------------- /model/Bert/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import six 24 | import tensorflow as tf 25 | 26 | 27 | def convert_to_unicode(text): 28 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 29 | if six.PY3: 30 | if isinstance(text, str): 31 | return text 32 | elif isinstance(text, bytes): 33 | return text.decode("utf-8", "ignore") 34 | else: 35 | raise ValueError("Unsupported string type: %s" % (type(text))) 36 | elif six.PY2: 37 | if isinstance(text, str): 38 | return text.decode("utf-8", "ignore") 39 | elif isinstance(text, unicode): 40 | return text 41 | else: 42 | raise ValueError("Unsupported string type: %s" % (type(text))) 43 | else: 44 | raise ValueError("Not running on Python2 or Python 3?") 45 | 46 | 47 | def printable_text(text): 48 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 49 | 50 | # These functions want `str` for both Python2 and Python3, but in one case 51 | # it's a Unicode string and in the other it's a byte string. 
52 | if six.PY3: 53 | if isinstance(text, str): 54 | return text 55 | elif isinstance(text, bytes): 56 | return text.decode("utf-8", "ignore") 57 | else: 58 | raise ValueError("Unsupported string type: %s" % (type(text))) 59 | elif six.PY2: 60 | if isinstance(text, str): 61 | return text 62 | elif isinstance(text, unicode): 63 | return text.encode("utf-8") 64 | else: 65 | raise ValueError("Unsupported string type: %s" % (type(text))) 66 | else: 67 | raise ValueError("Not running on Python2 or Python 3?") 68 | 69 | 70 | def load_vocab(vocab_file): 71 | """Loads a vocabulary file into a dictionary.""" 72 | vocab = collections.OrderedDict() 73 | index = 0 74 | with tf.gfile.GFile(vocab_file, "r") as reader: 75 | while True: 76 | token = convert_to_unicode(reader.readline()) 77 | if not token: 78 | break 79 | token = token.strip() 80 | vocab[token] = index 81 | index += 1 82 | return vocab 83 | 84 | 85 | def convert_by_vocab(vocab, items): 86 | """Converts a sequence of [tokens|ids] using the vocab.""" 87 | output = [] 88 | for item in items: 89 | output.append(vocab[item]) 90 | return output 91 | 92 | 93 | def convert_tokens_to_ids(vocab, tokens): 94 | return convert_by_vocab(vocab, tokens) 95 | 96 | 97 | def convert_ids_to_tokens(inv_vocab, ids): 98 | return convert_by_vocab(inv_vocab, ids) 99 | 100 | 101 | def whitespace_tokenize(text): 102 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 103 | text = text.strip() 104 | if not text: 105 | return [] 106 | tokens = text.split() 107 | return tokens 108 | 109 | 110 | class FullTokenizer(object): 111 | """Runs end-to-end tokenziation.""" 112 | 113 | def __init__(self, vocab_file, do_lower_case=True): 114 | self.vocab = load_vocab(vocab_file) 115 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 116 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 117 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 118 | 119 | def tokenize(self, text): 120 | split_tokens = [] 121 | for token in self.basic_tokenizer.tokenize(text): 122 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 123 | split_tokens.append(sub_token) 124 | 125 | return split_tokens 126 | 127 | def convert_tokens_to_ids(self, tokens): 128 | return convert_by_vocab(self.vocab, tokens) 129 | 130 | def convert_ids_to_tokens(self, ids): 131 | return convert_by_vocab(self.inv_vocab, ids) 132 | 133 | 134 | class BasicTokenizer(object): 135 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 136 | 137 | def __init__(self, do_lower_case=True): 138 | """Constructs a BasicTokenizer. 139 | 140 | Args: 141 | do_lower_case: Whether to lower case the input. 142 | """ 143 | self.do_lower_case = do_lower_case 144 | 145 | def tokenize(self, text): 146 | """Tokenizes a piece of text.""" 147 | text = convert_to_unicode(text) 148 | text = self._clean_text(text) 149 | 150 | # This was added on November 1st, 2018 for the multilingual and Chinese 151 | # models. This is also applied to the English models now, but it doesn't 152 | # matter since the English models were not trained on any Chinese data 153 | # and generally don't have any Chinese data in them (there are Chinese 154 | # characters in the vocabulary because Wikipedia does have some Chinese 155 | # words in the English Wikipedia.). 
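    # Worked example: _tokenize_chinese_chars(u"我爱NLP") returns " 我  爱 NLP", so after the
    # whitespace split below each CJK character becomes its own token while "NLP" stays a
    # single token (lower-cased to "nlp" when do_lower_case=True).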
156 | text = self._tokenize_chinese_chars(text) 157 | 158 | orig_tokens = whitespace_tokenize(text) 159 | split_tokens = [] 160 | for token in orig_tokens: 161 | if self.do_lower_case: 162 | token = token.lower() 163 | token = self._run_strip_accents(token) 164 | split_tokens.extend(self._run_split_on_punc(token)) 165 | 166 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 167 | return output_tokens 168 | 169 | def _run_strip_accents(self, text): 170 | """Strips accents from a piece of text.""" 171 | text = unicodedata.normalize("NFD", text) 172 | output = [] 173 | for char in text: 174 | cat = unicodedata.category(char) 175 | if cat == "Mn": 176 | continue 177 | output.append(char) 178 | return "".join(output) 179 | 180 | def _run_split_on_punc(self, text): 181 | """Splits punctuation on a piece of text.""" 182 | chars = list(text) 183 | i = 0 184 | start_new_word = True 185 | output = [] 186 | while i < len(chars): 187 | char = chars[i] 188 | if _is_punctuation(char): 189 | output.append([char]) 190 | start_new_word = True 191 | else: 192 | if start_new_word: 193 | output.append([]) 194 | start_new_word = False 195 | output[-1].append(char) 196 | i += 1 197 | 198 | return ["".join(x) for x in output] 199 | 200 | def _tokenize_chinese_chars(self, text): 201 | """Adds whitespace around any CJK character.""" 202 | output = [] 203 | for char in text: 204 | cp = ord(char) 205 | if self._is_chinese_char(cp): 206 | output.append(" ") 207 | output.append(char) 208 | output.append(" ") 209 | else: 210 | output.append(char) 211 | return "".join(output) 212 | 213 | def _is_chinese_char(self, cp): 214 | """Checks whether CP is the codepoint of a CJK character.""" 215 | # This defines a "chinese character" as anything in the CJK Unicode block: 216 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 217 | # 218 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 219 | # despite its name. The modern Korean Hangul alphabet is a different block, 220 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 221 | # space-separated words, so they are not treated specially and handled 222 | # like the all of the other languages. 223 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 224 | (cp >= 0x3400 and cp <= 0x4DBF) or # 225 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 226 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 227 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 228 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 229 | (cp >= 0xF900 and cp <= 0xFAFF) or # 230 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 231 | return True 232 | 233 | return False 234 | 235 | def _clean_text(self, text): 236 | """Performs invalid character removal and whitespace cleanup on text.""" 237 | output = [] 238 | for char in text: 239 | cp = ord(char) 240 | if cp == 0 or cp == 0xfffd or _is_control(char): 241 | continue 242 | if _is_whitespace(char): 243 | output.append(" ") 244 | else: 245 | output.append(char) 246 | return "".join(output) 247 | 248 | 249 | class WordpieceTokenizer(object): 250 | """Runs WordPiece tokenziation.""" 251 | 252 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): 253 | self.vocab = vocab 254 | self.unk_token = unk_token 255 | self.max_input_chars_per_word = max_input_chars_per_word 256 | 257 | def tokenize(self, text): 258 | """Tokenizes a piece of text into its word pieces. 259 | 260 | This uses a greedy longest-match-first algorithm to perform tokenization 261 | using the given vocabulary. 
262 | 263 | For example: 264 | input = "unaffable" 265 | output = ["un", "##aff", "##able"] 266 | 267 | Args: 268 | text: A single token or whitespace separated tokens. This should have 269 | already been passed through `BasicTokenizer. 270 | 271 | Returns: 272 | A list of wordpiece tokens. 273 | """ 274 | 275 | text = convert_to_unicode(text) 276 | 277 | output_tokens = [] 278 | for token in whitespace_tokenize(text): 279 | chars = list(token) 280 | if len(chars) > self.max_input_chars_per_word: 281 | output_tokens.append(self.unk_token) 282 | continue 283 | 284 | is_bad = False 285 | start = 0 286 | sub_tokens = [] 287 | while start < len(chars): 288 | end = len(chars) 289 | cur_substr = None 290 | while start < end: 291 | substr = "".join(chars[start:end]) 292 | if start > 0: 293 | substr = "##" + substr 294 | if substr in self.vocab: 295 | cur_substr = substr 296 | break 297 | end -= 1 298 | if cur_substr is None: 299 | is_bad = True 300 | break 301 | sub_tokens.append(cur_substr) 302 | start = end 303 | 304 | if is_bad: 305 | output_tokens.append(self.unk_token) 306 | else: 307 | output_tokens.extend(sub_tokens) 308 | return output_tokens 309 | 310 | 311 | def _is_whitespace(char): 312 | """Checks whether `chars` is a whitespace character.""" 313 | # \t, \n, and \r are technically contorl characters but we treat them 314 | # as whitespace since they are generally considered as such. 315 | if char == " " or char == "\t" or char == "\n" or char == "\r": 316 | return True 317 | cat = unicodedata.category(char) 318 | if cat == "Zs": 319 | return True 320 | return False 321 | 322 | 323 | def _is_control(char): 324 | """Checks whether `chars` is a control character.""" 325 | # These are technically control characters but we count them as whitespace 326 | # characters. 327 | if char == "\t" or char == "\n" or char == "\r": 328 | return False 329 | cat = unicodedata.category(char) 330 | if cat.startswith("C"): 331 | return True 332 | return False 333 | 334 | 335 | def _is_punctuation(char): 336 | """Checks whether `chars` is a punctuation character.""" 337 | cp = ord(char) 338 | # We treat all non-letter/number ASCII as punctuation. 339 | # Characters such as "^", "$", and "`" are not in the Unicode 340 | # Punctuation class but we treat them as punctuation anyways, for 341 | # consistency. 
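  # Worked example: "$" (cp 36) and "`" (cp 96) fall inside the ASCII ranges checked below,
  # so they are treated as punctuation even though their Unicode categories are Sc and Sk.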
342 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 343 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 344 | return True 345 | cat = unicodedata.category(char) 346 | if cat.startswith("P"): 347 | return True 348 | return False 349 | -------------------------------------------------------------------------------- /model/Layers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-16 13:37:44 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from keras.layers import Layer, Activation 9 | from keras import initializers, regularizers, constraints 10 | from keras import backend as K 11 | 12 | 13 | 14 | class AttentionWeightedAverage(Layer): 15 | """ 16 | A weighted Average of different channels across timesteps 17 | Reference: 18 | """ 19 | def __init__(self, return_attention=False, **kwargs): 20 | self.supports_masking = True 21 | self.return_attention = return_attention 22 | super(AttentionWeightedAverage, self).__init__(**kwargs) 23 | 24 | 25 | def build(self, input_shape): 26 | """Define the weights""" 27 | assert len(input_shape) == 3 28 | self.W = self.add_weight(name=self.name+'_W', shape=(input_shape[2], 1), initializer='glorot_uniform') 29 | self.trainable_weights = [self.W] 30 | super(AttentionWeightedAverage, self).build(input_shape) 31 | 32 | 33 | def call(self, x, mask=None): 34 | """ 35 | Layer's logic: 36 | logit = W * x - max(W * x) # 相当于小神经网络: x -> logit 37 | attn = softmax(logit) = exp(logit) / (sum(exp(logit)) + epsilon) 38 | result = sum(attn * x) 39 | 简写:result=sum(p(x)*x) p(x)=softmax(Wx) 加性模型??? 40 | """ 41 | logit = K.dot(x, self.W) # (i0, i1, i2) dot (i2, 1) -> (i0, i1, 1) 42 | logit = K.reshape(logit, (K.shape(x)[0], K.shape(x)[1])) # -> (i0, i1) 43 | logit = logit - K.max(logit, axis=-1, keepdims=True) # (i0, i1) 44 | ai = K.exp(logit) # (i0, i1) 45 | 46 | # masked timesteps have 0 weight 47 | if mask: 48 | ai = ai * K.cast(mask, K.floatx()) # (i0, i1) 49 | 50 | attn = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon()) # (i0, i1) 51 | result = K.sum(x * K.expand_dims(attn), axis=1) # (i0, i1, i2) * (i0, i1, 1) -> (i0, i1, i2) -> (i0, i2) 52 | return [result, attn] if self.return_attention else result 53 | 54 | 55 | def compute_output_shape(self, input_shape): 56 | """The shape transformation logic""" 57 | if self.return_attention: 58 | return [(input_shape[0], input_shape[2]), (input_shape[0], input_shape[1])] 59 | return (input_shape[0], input_shape[2]) 60 | 61 | 62 | 63 | class Attention(Layer): 64 | """ 65 | Keras Layer that implements an Attention mechanism for temporal data. 66 | Supports Masking. 67 | Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756] 68 | # Input shape 69 | 3D tensor with shape: `(samples, steps, features)`. 70 | # Output shape 71 | 2D tensor with shape: `(samples, features)`. 72 | :param kwargs: 73 | Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True. 74 | The dimensions are inferred based on the output shape of the RNN. 75 | Example: 76 | hidden = LSTM(64, return_sequences=True)(words) 77 | sentence = Attention()(hidden) 78 | # next add a Dense layer (for classification/regression) or whatever... 
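    Computation (implemented in call() below):
        e = tanh(W·x + b),  a = softmax(e),  output = sum(a * x) over timesteps
    Note that step_dim must equal the number of timesteps of the 3D input.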
79 | 代码源自:https://github.com/ShawnyXiao/TextClassification-Keras/blob/master/model/HAN/attention.py 80 | """ 81 | def __init__(self, step_dim, W_regularizer=None, b_regularizer=None, W_constraint=None, b_constraint=None, bias=True, **kwargs): 82 | self.supports_masking = True 83 | self.init = initializers.get('glorot_uniform') 84 | self.W_regularizer = regularizers.get(W_regularizer) 85 | self.b_regularizer = regularizers.get(b_regularizer) 86 | self.W_constraint = constraints.get(W_constraint) 87 | self.b_constraint = constraints.get(b_constraint) 88 | self.bias = bias 89 | self.step_dim = step_dim 90 | self.features_dim = 0 91 | super(Attention, self).__init__(**kwargs) 92 | 93 | 94 | def build(self, input_shape): 95 | assert len(input_shape) == 3 96 | self.W = self.add_weight(shape=(input_shape[-1], ), name='{}_W'.format(self.name), 97 | initializer=self.init, regularizer=self.W_regularizer, constraint=self.W_constraint) 98 | self.features_dim = input_shape[-1] 99 | if self.bias: 100 | self.b = self.add_weight(shape=(input_shape[1], ), name='{}_b'.format(self.name), 101 | initializer='zero', regularizer=self.b_regularizer, constraint=self.b_constraint) 102 | else: 103 | self.b = None 104 | self.built = True 105 | 106 | 107 | def compute_mask(self, input, input_mask=None): 108 | # do not pass the mask to the next layers 109 | return None 110 | 111 | 112 | def call(self, x, mask=None): 113 | """简写:result=sum(p(x)*x) p(x)=softmax(tanh(Wx+b)) 加性模型""" 114 | features_dim = self.features_dim 115 | step_dim = self.step_dim 116 | e = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim)) # e = K.dot(x, self.W) 117 | if self.bias: 118 | e += self.b 119 | e = K.tanh(e) # e = tanh(Wx + b) 120 | a = K.exp(e) # a = exp(e) 121 | # apply mask after the exp. will be re-normalized next 122 | if mask is not None: 123 | a *= K.cast(mask, K.floatx()) # cast the mask to floatX to avoid float64 upcasting in theano 124 | # In some cases especially in the early stages of training, the sum may be almost zero and this results in NaN's. 125 | # A workaround is to add a very small positive number ε to the sum. 
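        # e.g. with mask [1, 1, 0] and exp(e) = [2.0, 6.0, 5.0], the masked weights become [2.0, 6.0, 0.0]
        # and normalize below to roughly [0.25, 0.75, 0.0], so padded timesteps receive zero attention.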
126 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) # a = softmax(e) = softmax(tanh(Wx + b)) = p(x) 表示一种权重 127 | a = K.expand_dims(a) 128 | result = K.sum(a * x, axis=1) # result = sum(p(x) * x) 对x加权求和 129 | return result 130 | 131 | 132 | def compute_output_shape(self, input_shape): 133 | return input_shape[0], self.features_dim 134 | 135 | 136 | 137 | class AttentionSelf(Layer): 138 | """ 139 | Self Attention, codes from: https://blog.csdn.net/xiaosongshine/article/details/90600028 140 | 代码源自:https://github.com/yongzhuo/Keras-TextClassification/blob/master/keras_textclassification/keras_layers/attention_self.py 141 | """ 142 | def __init__(self, output_dim, **kwargs): 143 | self.output_dim = output_dim 144 | super().__init__(**kwargs) 145 | 146 | 147 | def build(self, input_shape): 148 | # Q、K and V 149 | self.kernel = self.add_weight(name='QKV', shape=(3, input_shape[2], self.output_dim), trainable=True, 150 | initializer='uniform', regularizer=regularizers.L1L2(0.0000032)) 151 | super().build(input_shape) 152 | 153 | 154 | def call(self, x): 155 | '''简写:res=p(x)*Vx p(x)=softmax(Qx*Kx)''' 156 | QX = K.dot(x, self.kernel[0]) 157 | KX = K.dot(x, self.kernel[1]) 158 | VX = K.dot(x, self.kernel[2]) 159 | print("QX.shape", QX.shape) 160 | print("K.permute_dimensions(KX, [0, 2, 1]).shape", K.permute_dimensions(KX, [0, 2, 1]).shape) 161 | 162 | # batch_dot: 即batch-wise dot product,X与Y同一batch分别是Xi和Yi,则dot(Xi, Yi.T)为该batch的结果,遍历所有batch 163 | # 当axes!=None时另说 164 | QK = K.batch_dot(QX, K.permute_dimensions(KX, [0, 2, 1])) 165 | QK = QK / (64 ** 0.5) # TODO 64是不是应该改为self.output_dim更合适一些?!?因为KX's shape=(, input_shape[1], output_dim),KX的维度是output_dim 166 | QK = K.softmax(QK) 167 | print("QK.shape", QK.shape) 168 | 169 | res = K.batch_dot(QK, VX) 170 | return res 171 | 172 | 173 | def compute_output_shape(self, input_shape): 174 | return (input_shape[0], input_shape[1], self.output_dim) 175 | 176 | 177 | 178 | def squash(x, axis=-1): 179 | # s_squared_norm is really small 180 | # s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon() 181 | # scale = K.sqrt(s_squared_norm)/ (0.5 + s_squared_norm) 182 | # return scale * x 183 | s_squared_norm = K.sum(K.square(x), axis, keepdims=True) 184 | scale = K.sqrt(s_squared_norm + K.epsilon()) 185 | return x / scale 186 | 187 | 188 | 189 | class Capsule(Layer): 190 | """ 191 | Capsule TODO 待研究! 
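    Shapes: input (batch, input_n_capsule, input_dim_capsule) -> output (batch, n_capsule, dim_capsule).
    Rough sketch of the dynamic routing implemented in call() below:
        u_hat = W * x                        # prediction vectors for every (input capsule, output capsule) pair
        b = 0
        for r in range(routings):
            c = softmax(b)                   # coupling coefficients over output capsules
            v = squash(batch_dot(c, u_hat))  # candidate output capsules
            b = batch_dot(v, u_hat)          # agreement replaces b here (the paper accumulates); skipped on the last iteration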
192 | """ 193 | def __init__(self, n_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True, activation=None, **kwargs): 194 | super(Capsule, self).__init__(**kwargs) 195 | self.n_capsule = n_capsule 196 | self.dim_capsule = dim_capsule 197 | self.routings = routings 198 | self.kernel_size = kernel_size 199 | self.share_weights = share_weights 200 | self.activation = Activation(activation) if activation else squash 201 | 202 | 203 | def build(self, input_shape): 204 | super(Capsule, self).build(input_shape) 205 | input_n_capsule = 1 if self.share_weights else input_shape[-2] 206 | input_dim_capsule = input_shape[-1] 207 | self.W = self.add_weight(name='capsule_kernel', 208 | shape=(input_n_capsule, input_dim_capsule, self.n_capsule * self.dim_capsule), 209 | initializer='glorot_uniform', 210 | trainable=True) 211 | 212 | 213 | def call(self, x): 214 | if self.share_weights: 215 | u_hat_vecs = K.conv1d(x, self.W) 216 | else: 217 | u_hat_vecs = K.local_conv1d(x, self.W, [1], [1]) 218 | 219 | batch_size = K.shape(x)[0] 220 | input_n_capsule = K.shape(x)[1] 221 | u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_n_capsule, self.n_capsule, self.dim_capsule)) 222 | u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3)) 223 | # final u_hat_vecs.shape = [None, n_capsule, input_n_capsule, dim_capsule] 224 | 225 | b = K.zeros_like(u_hat_vecs[:, :, :, 0]) # shape = [None, n_capsule, input_n_capsule] 226 | for i in range(self.routings): 227 | b = K.permute_dimensions(b, (0, 2, 1)) # shape = [None, input_n_capsule, n_capsule] 228 | c = K.softmax(b) 229 | c = K.permute_dimensions(c, (0, 2, 1)) 230 | b = K.permute_dimensions(b, (0, 2, 1)) 231 | outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2])) 232 | if i < self.routings - 1: 233 | b = K.batch_dot(outputs, u_hat_vecs, [2, 3]) 234 | return outputs 235 | 236 | 237 | def compute_output_shape(self, input_shape): 238 | return (None, self.n_capsule, self.dim_capsule) 239 | -------------------------------------------------------------------------------- /model/TextBertCNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-14 13:35:44 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from keras.layers import BatchNormalization, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, Dense 9 | from keras.models import Model 10 | 11 | from model.BasicModel import BasicDeepModel 12 | 13 | 14 | class TextBertCNN(BasicDeepModel): 15 | """TextCNN模型,修改自TextCNN.py,支持Bert编码向量的输入,没有embedding""" 16 | 17 | def __init__(self, config=None, fsizes=(2, 5), n_filters=64, dropout_p=0.25, **kwargs): 18 | self.fsizes = fsizes 19 | self.n_filters = n_filters 20 | self.dropout_p = dropout_p 21 | name = 'TextBertCNN' 22 | config.bert_flag = True 23 | BasicDeepModel.__init__(self, config=config, name=name, **kwargs) 24 | 25 | 26 | def model_unit(self, inputs, masking, embedding=None, dropout_p=None, fsizes=None, n_filters=None): 27 | """模型主体Unit""" 28 | if dropout_p is None: 29 | dropout_p = self.dropout_p 30 | if fsizes is None: 31 | fsizes = self.fsizes 32 | if n_filters is None: 33 | n_filters = [self.n_filters] * (fsizes[1] - fsizes[0] + 1) 34 | 35 | X = masking(inputs) 36 | if embedding: # TODO 为了支持embedding为None的Bert编码向量,暂时还有问题line41 37 | X = embedding(X) 38 | X = BatchNormalization()(X) 39 | X = SpatialDropout1D(dropout_p)(X) 40 | Xs = [] 41 | for i, fsize in enumerate(range(fsizes[0], fsizes[1] + 1)): 42 | Xi = Conv1D(n_filters[i], 
fsize, activation='relu')(X) # TODO Layer conv1d_5 does not support masking, but was passed an input_mask 43 | Xi = GlobalMaxPooling1D()(Xi) 44 | Xs.append(Xi) 45 | return Xs 46 | 47 | 48 | def build_model(self): 49 | # 模型主体 50 | if self.config.token_level == 'word': 51 | Xs = self.model_unit(self.word_input, self.word_masking, self.word_embedding) 52 | inputs = [self.word_input] 53 | 54 | elif self.config.token_level == 'char': 55 | Xs = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 56 | inputs = [self.char_input] 57 | 58 | else: 59 | word_Xs = self.model_unit(self.word_input, self.word_masking, self.word_embedding) 60 | char_Xs = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 61 | Xs = word_Xs + char_Xs 62 | inputs = [self.word_input, self.char_input] 63 | 64 | 65 | # 结构化特征 66 | if self.config.structured in ['word', 'char', 'both']: 67 | Xs = Xs + self.structured_input 68 | inputs = inputs + self.structured_input 69 | 70 | 71 | # 模型结尾 72 | X = Concatenate()(Xs) if len(Xs) > 1 else Xs[0] 73 | X = BatchNormalization()(X) 74 | X = Dropout(0.5)(X) 75 | # X = Dense(self.hidden_units, activation='relu')(X) # TODO 不需要隐藏层!? 76 | out = Dense(self.n_classes, activation=self.activation)(X) 77 | 78 | self.model = Model(inputs=inputs, outputs=out) -------------------------------------------------------------------------------- /model/TextBertGRU.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-23 16:28:24 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from keras.layers import GRU, Dense, BatchNormalization 9 | from keras.optimizers import Adam 10 | from keras.models import Model 11 | 12 | from model.BasicModel import BasicDeepModel 13 | from model.Bert.extract_feature import BertVector 14 | 15 | 16 | class TextBertGRU(BasicDeepModel): 17 | """ 18 | Bert向量简单应用 19 | Bert(Tensorflow实现)预训练模型对原始文本进行向量化编码,输入至RNN模型(Keras实现)里微调 20 | 注意,Bert不参与模型搭建,更不参与训练!相当于提前训练好的Word Embedding那样使用 21 | """ 22 | 23 | def __init__(self, config=None, rnn_units=128, dense_units=128, **kwargs): 24 | self.rnn_units = rnn_units 25 | self.dense_units = dense_units 26 | name = 'TextBertGRU' 27 | config.bert_flag = True # 唯一与BERT关联的地方 28 | BasicDeepModel.__init__(self, config=config, name=name, **kwargs) 29 | 30 | 31 | def build_model(self): 32 | """模型结构与BERT没任何关系,只不过其输入是BERT编码的向量""" 33 | X = self.word_masking(self.word_input) # TODO 务必要有masking,否则loss和val_acc几乎一直不保持不变! 34 | X = GRU(self.rnn_units, dropout=0.25, recurrent_dropout=0.25)(X) 35 | X = Dense(self.dense_units, activation='relu')(X) 36 | X = BatchNormalization()(X) 37 | out = Dense(self.n_classes, activation=self.activation)(X) 38 | self.model = Model(inputs=self.word_input, outputs=out) 39 | 40 | 41 | # 模型创建、训练与评估,详见脚本ModelTrain.py中的example函数 42 | 43 | 44 | 45 | # TODO 以下待办,暂时不用看 设计成数据编码环节,可通用于其他所有模型! 
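    # The methods below sketch that pending step: BertVector (model/Bert/extract_feature.py) encodes each raw
    # sentence into a fixed-length sequence of BERT token vectors (pooling_strategy='NONE', length bert_maxlen),
    # and data_generator() feeds those batches to fit_generator, so the Keras model itself needs no Embedding layer.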
46 | def build_bert_model(self): 47 | self.bert_model = BertVector(pooling_strategy='NONE', 48 | max_seq_len=self.config.bert_maxlen, 49 | bert_model_path=self.config.bert_model_path, 50 | graph_tmpfile=self.config.bert_graph_tmpfile) 51 | 52 | 53 | def sentence_to_bert(self, sentence): 54 | """单个句子编码为向量""" 55 | return self.bert_model.encode([sentence])["encodes"][0] 56 | 57 | 58 | def sentences_to_bert(self, sentences): 59 | """多个句子编码为向量""" 60 | return [self.sentence_to_bert(sent.strip()) for sent in sentences] 61 | 62 | 63 | def data_generator(self, sentences, labels): 64 | """编码数据,生成器""" 65 | while True: 66 | for i in range(0, len(sentences), self.batch_size): 67 | X = self.sentences_to_bert(sentences[i: i + self.batch_size]) 68 | Y = labels[i: i + self.batch_size] 69 | yield (X, Y) 70 | 71 | 72 | def data_prepare(self): 73 | """准备train/test/val,未编码""" 74 | # TODO 待办! 75 | x_train, y_train = None, None 76 | x_val, y_val = None, None 77 | x_test, y_test = None, None 78 | return x_train, y_train, x_val, y_val, x_test, y_test 79 | 80 | 81 | def train_generator(self): 82 | x_train, y_train, x_val, y_val, x_test, y_test = self.data_prepare() 83 | self.model.compile(loss=self.loss, optimizer=Adam(lr=0.001), metrics=self.metrics) 84 | self.model.fit_generator(self.data_generator(x_train, y_train), 85 | steps_per_epoch=int(len(x_train)/self.batch_size)+1, 86 | epochs=10, 87 | verbose=1, 88 | validation_data=(x_val, y_val), 89 | validation_steps=None) 90 | -------------------------------------------------------------------------------- /model/TextCNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-14 13:35:44 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from keras.layers import BatchNormalization, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, Dense 9 | from keras.models import Model 10 | 11 | from model.BasicModel import BasicDeepModel # TODO model. ??? 12 | 13 | 14 | class TextCNN(BasicDeepModel): 15 | """TextCNN模型,支持char,word和both. 
both时char和word分别进行TextCNN,然后拼接结果""" 16 | 17 | def __init__(self, config=None, fsizes=(2, 5), n_filters=64, dropout_p=0.25, **kwargs): 18 | self.fsizes = fsizes 19 | self.n_filters = n_filters # TODO 是否是BasicDeepModel通用?通用的话放在BasicDeepModel那里 20 | self.dropout_p = dropout_p 21 | name = 'TextCNN_' 22 | BasicDeepModel.__init__(self, config=config, name=name, **kwargs) 23 | 24 | 25 | def model_unit(self, inputs, masking, embedding, dropout_p=None, fsizes=None, n_filters=None): 26 | """模型主体Unit""" 27 | if dropout_p is None: 28 | dropout_p = self.dropout_p 29 | if fsizes is None: 30 | fsizes = self.fsizes 31 | if n_filters is None: 32 | n_filters = [self.n_filters] * (fsizes[1] - fsizes[0] + 1) 33 | 34 | X = masking(inputs) 35 | X = embedding(X) 36 | X = BatchNormalization()(X) 37 | X = SpatialDropout1D(dropout_p)(X) 38 | Xs = [] 39 | for i, fsize in enumerate(range(fsizes[0], fsizes[1] + 1)): 40 | Xi = Conv1D(n_filters[i], fsize, activation='relu')(X) 41 | Xi = GlobalMaxPooling1D()(Xi) 42 | Xs.append(Xi) 43 | return Xs 44 | 45 | 46 | def build_model(self): 47 | # 模型主体 48 | if self.config.token_level == 'word': 49 | Xs = self.model_unit(self.word_input, self.word_masking, self.word_embedding) 50 | inputs = [self.word_input] 51 | 52 | elif self.config.token_level == 'char': 53 | Xs = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 54 | inputs = [self.char_input] 55 | 56 | else: 57 | word_Xs = self.model_unit(self.word_input, self.word_masking, self.word_embedding) 58 | char_Xs = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 59 | Xs = word_Xs + char_Xs 60 | inputs = [self.word_input, self.char_input] 61 | 62 | 63 | # 结构化特征 64 | if self.config.structured in ['word', 'char', 'both']: 65 | Xs = Xs + self.structured_input 66 | inputs = inputs + self.structured_input 67 | 68 | 69 | # 模型结尾 70 | X = Concatenate()(Xs) if len(Xs) > 1 else Xs[0] 71 | X = BatchNormalization()(X) 72 | X = Dropout(0.5)(X) 73 | # X = Dense(self.hidden_units, activation='relu')(X) # TODO 不需要隐藏层!? 74 | out = Dense(self.n_classes, activation=self.activation)(X) 75 | 76 | self.model = Model(inputs=inputs, outputs=out) -------------------------------------------------------------------------------- /model/TextCNN_BiGRU.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-15 20:08:42 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from keras.layers import BatchNormalization, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, \ 9 | Concatenate, Dropout, Dense, Bidirectional, GRU, GlobalAveragePooling1D 10 | from keras.models import Model 11 | 12 | from model.BasicModel import BasicDeepModel 13 | 14 | 15 | class TextCNN_BiGRU(BasicDeepModel): 16 | """TextCNN模型,支持char,word和both. 
both时char进行TextCNN,word进行RNN,然后拼接结果""" 17 | 18 | def __init__(self, config=None, fsizes=(2, 5), n_filters=64, rnn_units=64, dropout_p=0.25, **kwargs): 19 | self.fsizes = fsizes 20 | self.n_filters = n_filters # TODO 是否是BasicDeepModel通用?通用的话放在BasicDeepModel那里 21 | self.rnn_units = rnn_units 22 | self.dropout_p = dropout_p 23 | name = 'TextCNN_BiGRU_' + config.token_level 24 | BasicDeepModel.__init__(self, config=config, name=name, **kwargs) 25 | 26 | 27 | def model_unit(self, inputs, masking, embedding, dropout_p=None, fsizes=None, n_filters=None): 28 | """模型主体Unit""" 29 | if dropout_p is None: 30 | dropout_p = self.dropout_p 31 | if fsizes is None: 32 | fsizes = self.fsizes 33 | if n_filters is None: 34 | n_filters = [self.n_filters] * (fsizes[1] - fsizes[0] + 1) 35 | 36 | X = masking(inputs) 37 | X = embedding(X) 38 | X = BatchNormalization()(X) 39 | X = SpatialDropout1D(dropout_p)(X) 40 | Xs = [] 41 | for i, fsize in enumerate(range(fsizes[0], fsizes[1] + 1)): 42 | Xi = Conv1D(n_filters[i], fsize, activation='relu')(X) 43 | Xi = GlobalMaxPooling1D()(Xi) 44 | Xs.append(Xi) 45 | return Xs 46 | 47 | 48 | def build_model(self): 49 | # 模型主体 50 | if self.config.token_level == 'word': 51 | Xs = self.model_unit(self.word_input, self.word_masking, self.word_embedding) 52 | inputs = [self.word_input] 53 | 54 | elif self.config.token_level == 'char': 55 | Xs = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 56 | inputs = [self.char_input] 57 | 58 | else: 59 | # 对word进行特殊处理! word的BiGRU + char的TextCNN # TODO WHY??? 60 | word_X = self.word_embedding(self.word_input) 61 | word_X = BatchNormalization()(word_X) 62 | for _ in range(2): 63 | word_X = SpatialDropout1D(0.2)(word_X) 64 | word_X = Bidirectional(GRU(self.rnn_units // 2, return_sequences=True))(word_X) 65 | word_maxpool = GlobalMaxPooling1D()(word_X) 66 | word_avgpool = GlobalAveragePooling1D()(word_X) 67 | 68 | char_Xs = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 69 | Xs = [word_maxpool, word_avgpool] + char_Xs 70 | inputs = [self.word_input, self.char_input] 71 | 72 | 73 | # 结构化特征 74 | if self.config.structured in ['word', 'char', 'both']: 75 | Xs = Xs + self.structured_input 76 | inputs = inputs + self.structured_input 77 | 78 | 79 | # 模型结尾 80 | X = Concatenate()(Xs) if len(Xs) > 1 else Xs[0] 81 | X = Dropout(0.5)(X) 82 | out = Dense(self.n_classes, activation=self.activation)(X) 83 | 84 | self.model = Model(inputs=inputs, outputs=out) -------------------------------------------------------------------------------- /model/TextCapsule.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-20 23:14:02 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from keras.layers import BatchNormalization, Bidirectional, GRU, Flatten, Dropout, \ 9 | Concatenate, GlobalMaxPooling1D, GlobalAveragePooling1D, SpatialDropout1D, Dense 10 | from keras.models import Model 11 | 12 | from model.BasicModel import BasicDeepModel 13 | from model.Layers import Capsule 14 | 15 | 16 | class TextCapsule(BasicDeepModel): 17 | 18 | def __init__(self, config=None, rnn_units=30, dropout_p=0.2, n_capsule=10, dim_capsule=16, routings=5, share_weights=True, **kwargs): 19 | self.rnn_units = rnn_units 20 | self.dropout_p = dropout_p 21 | self.n_capsule = n_capsule 22 | self.dim_capsule = dim_capsule 23 | self.routings = routings 24 | self.share_weights = share_weights 25 | name = 'TextCapsule_' + config.token_level 26 | 
BasicDeepModel.__init__(self, config=config, name=name, **kwargs) 27 | 28 | 29 | def model_unit(self, inputs, masking, embedding, dropout_p=None, n_capsule=None, dim_capsule=None, routings=None, share_weights=None): 30 | """模型主体Unit""" 31 | if dropout_p is None: 32 | dropout_p = self.dropout_p 33 | if n_capsule is None: 34 | n_capsule = self.n_capsule 35 | if dim_capsule is None: 36 | dim_capsule = self.dim_capsule 37 | if routings is None: 38 | routings = self.routings 39 | if share_weights is None: 40 | share_weights = self.share_weights 41 | 42 | X = masking(inputs) 43 | X = embedding(X) 44 | X = BatchNormalization()(X) 45 | X = SpatialDropout1D(dropout_p)(X) 46 | X = Bidirectional(GRU(64, return_sequences=True))(X) 47 | capsule = Capsule(n_capsule=n_capsule, dim_capsule=dim_capsule, routings=routings, share_weights=share_weights)(X) 48 | X = Flatten()(capsule) 49 | return X 50 | 51 | 52 | def build_model(self): 53 | # 模型主体 54 | if self.config.token_level == 'word': 55 | X = self.model_unit(self.word_input, self.word_masking, self.word_embedding) 56 | inputs = [self.word_input] 57 | 58 | elif self.config.token_level == 'char': 59 | X = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 60 | inputs = [self.char_input] 61 | 62 | else: 63 | # 对word进行特殊处理! 64 | word_X = self.word_masking(self.word_input) 65 | word_X = self.word_embedding(word_X) 66 | word_X = SpatialDropout1D(0.25)(word_X) 67 | word_X = Bidirectional(GRU(self.rnn_units, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(word_X) # TODO ??? 68 | word_X = Bidirectional(GRU(self.rnn_units, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(word_X) 69 | word_maxpool = GlobalMaxPooling1D()(word_X) 70 | word_avgpool = GlobalAveragePooling1D()(word_X) 71 | 72 | char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 73 | X = Concatenate()([word_maxpool, word_avgpool, char_X]) 74 | inputs = [self.word_input, self.char_input] 75 | 76 | 77 | # 结构化特征 78 | if self.config.structured in ['word', 'char', 'both']: 79 | X = Concatenate()([X] + self.structured_input) 80 | inputs = inputs + self.structured_input 81 | 82 | 83 | # 模型结尾 84 | X = Dropout(0.5)(X) 85 | out = Dense(self.n_classes, activation=self.activation)(X) 86 | 87 | self.model = Model(inputs=inputs, outputs=out) -------------------------------------------------------------------------------- /model/TextConvLSTM2_Attn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-17 21:11:18 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from keras.layers import BatchNormalization, SpatialDropout1D, Conv1D, Bidirectional, LSTM, GRU, \ 9 | GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate, Dropout, Dense 10 | from keras.models import Model 11 | 12 | from model.BasicModel import BasicDeepModel 13 | from model.Layers import AttentionWeightedAverage 14 | 15 | 16 | class TextConvLSTM2_Attn(BasicDeepModel): 17 | 18 | def __init__(self, config=None, n_filters=128, rnn_units=64, dropout_p=0.25, with_attention=True, **kwargs): 19 | self.n_filters = n_filters 20 | self.rnn_units = rnn_units 21 | self.dropout_p = dropout_p 22 | self.with_attention = with_attention 23 | name = 'TextConvLSTM2_Attn_' + config.token_level 24 | BasicDeepModel.__init__(self, config=config, name=name, **kwargs) 25 | 26 | 27 | def model_unit(self, inputs, masking, embedding, n_filters=None, rnn_units=None, dropout_p=None): 28 | """模型主体Unit""" 29 | if n_filters 
is None: 30 | n_filters = self.n_filters 31 | if rnn_units is None: 32 | rnn_units = [self.rnn_units] * 2 33 | if isinstance(rnn_units, int): 34 | rnn_units = [rnn_units] * 2 35 | if dropout_p is None: 36 | dropout_p = [self.dropout_p] * 2 37 | if isinstance(dropout_p, float): 38 | dropout_p = [dropout_p] * 2 39 | 40 | X = masking(inputs) 41 | X = embedding(X) 42 | X = BatchNormalization()(X) 43 | X = SpatialDropout1D(dropout_p[0])(X) 44 | # TODO Conv1D没有activation ??? 45 | X = Conv1D(n_filters, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(X) # 相比LSTMGRUModel,此处多了个Conv1D 46 | X = Bidirectional(LSTM(rnn_units[0], return_sequences=True))(X) 47 | X = SpatialDropout1D(dropout_p[1])(X) 48 | X = Bidirectional(GRU(rnn_units[1], return_sequences=True))(X) 49 | 50 | maxpool = GlobalMaxPooling1D()(X) 51 | avgpool = GlobalAveragePooling1D()(X) 52 | if self.with_attention: 53 | attn = AttentionWeightedAverage()(X) 54 | X = Concatenate()([maxpool, avgpool, attn]) 55 | else: 56 | X = Concatenate()([maxpool, avgpool]) 57 | return X 58 | 59 | 60 | def build_model(self): 61 | # 模型主体 62 | if self.config.token_level == 'word': 63 | X = self.model_unit(self.word_input, self.word_masking, self.word_embedding) 64 | inputs = [self.word_input] 65 | 66 | elif self.config.token_level == 'char': 67 | X = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 68 | inputs = [self.char_input] 69 | 70 | else: 71 | # 与TextConvLSTMGRU对word进行特殊处理! # TODO WHY??? 与char相比,没有conv和attention 72 | word_X = self.word_masking(self.word_input) 73 | word_X = self.word_embedding(word_X) 74 | word_X = BatchNormalization()(word_X) 75 | word_X = SpatialDropout1D(0.2)(word_X) # TODO 0.2 下面0.1 ? 76 | word_X = Bidirectional(GRU(self.rnn_units // 2, return_sequences=True))(word_X) 77 | word_X = SpatialDropout1D(0.1)(word_X) 78 | word_X = Bidirectional(GRU(self.rnn_units // 2, return_sequences=True))(word_X) 79 | word_maxpool = GlobalMaxPooling1D()(word_X) 80 | word_avgpool = GlobalAveragePooling1D()(word_X) 81 | 82 | char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 83 | X = Concatenate()([word_maxpool, word_avgpool, char_X]) 84 | inputs = [self.word_input, self.char_input] 85 | 86 | 87 | # 结构化特征 88 | if self.config.structured in ['word', 'char', 'both']: 89 | X = Concatenate()([X] + self.structured_input) 90 | inputs = inputs + self.structured_input 91 | 92 | 93 | # 模型结尾 94 | X = Dropout(0.5)(X) 95 | out = Dense(self.n_classes, activation=self.activation)(X) 96 | 97 | self.model = Model(inputs=inputs, outputs=out) -------------------------------------------------------------------------------- /model/TextConvLSTM_Attn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-17 20:52:19 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from keras.layers import BatchNormalization, SpatialDropout1D, Conv1D, Bidirectional, LSTM, GRU, \ 9 | GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate, Dropout, Dense 10 | from keras.models import Model 11 | 12 | from model.BasicModel import BasicDeepModel 13 | from model.Layers import AttentionWeightedAverage 14 | 15 | 16 | class TextConvLSTM_Attn(BasicDeepModel): 17 | 18 | def __init__(self, config=None, n_filters=128, rnn_units=64, dropout_p=0.25, with_attention=True, **kwargs): 19 | self.n_filters = n_filters 20 | self.rnn_units = rnn_units 21 | self.dropout_p = dropout_p 22 | self.with_attention = with_attention 23 | name = 
'TextConvLSTM_Attn_' + config.token_level 24 | BasicDeepModel.__init__(self, config=config, name=name, **kwargs) 25 | 26 | 27 | def model_unit(self, inputs, masking, embedding, n_filters=None, rnn_units=None, dropout_p=None): 28 | """模型主体Unit""" 29 | if n_filters is None: 30 | n_filters = self.n_filters 31 | if rnn_units is None: 32 | rnn_units = [self.rnn_units] * 2 33 | if isinstance(rnn_units, int): 34 | rnn_units = [rnn_units] * 2 35 | if dropout_p is None: 36 | dropout_p = [self.dropout_p] * 2 37 | if isinstance(dropout_p, float): 38 | dropout_p = [dropout_p] * 2 39 | 40 | X = masking(inputs) 41 | X = embedding(X) 42 | X = BatchNormalization()(X) 43 | X = SpatialDropout1D(dropout_p[0])(X) 44 | X = Conv1D(n_filters, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(X) # 相比LSTMGRUModel,此处多了个Conv1D 45 | X = Bidirectional(LSTM(rnn_units[0], return_sequences=True))(X) 46 | X = SpatialDropout1D(dropout_p[1])(X) 47 | X = Bidirectional(GRU(rnn_units[1], return_sequences=True))(X) 48 | 49 | maxpool = GlobalMaxPooling1D()(X) 50 | avgpool = GlobalAveragePooling1D()(X) 51 | if self.with_attention: 52 | attn = AttentionWeightedAverage()(X) 53 | X = Concatenate()([maxpool, avgpool, attn]) 54 | else: 55 | X = Concatenate()([maxpool, avgpool]) 56 | return X 57 | 58 | 59 | def build_model(self): 60 | # 模型主体 61 | if self.config.token_level == 'word': 62 | X = self.model_unit(self.word_input, self.word_masking, self.word_embedding) 63 | inputs = [self.word_input] 64 | 65 | elif self.config.token_level == 'char': 66 | X = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 67 | inputs = [self.char_input] 68 | 69 | else: 70 | word_X = self.model_unit(self.word_input, self.word_masking, self.word_embedding) 71 | char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 72 | X = Concatenate()([word_X, char_X]) 73 | inputs = [self.word_input, self.char_input] 74 | 75 | 76 | # 结构化特征 77 | if self.config.structured in ['word', 'char', 'both']: 78 | X = Concatenate()([X] + self.structured_input) 79 | inputs = inputs + self.structured_input 80 | 81 | 82 | # 模型结尾 83 | X = Dropout(0.5)(X) 84 | out = Dense(self.n_classes, activation=self.activation)(X) 85 | 86 | self.model = Model(inputs=inputs, outputs=out) -------------------------------------------------------------------------------- /model/TextDPCNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-19 21:25:09 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from keras.layers import BatchNormalization, PReLU, Add, MaxPooling1D, Bidirectional, GRU, Dropout, \ 9 | Concatenate, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, SpatialDropout1D, Dense 10 | from keras import regularizers 11 | from keras import backend as K 12 | from keras.models import Model 13 | 14 | from model.BasicModel import BasicDeepModel 15 | 16 | 17 | class TextDPCNN(BasicDeepModel): 18 | 19 | def __init__(self, config=None, rnn_units=30, n_filters=64, filter_size=3, dp=7, dense_units=256, **kwargs): 20 | self.rnn_units = rnn_units 21 | self.n_filters = n_filters 22 | self.filter_size = filter_size 23 | self.dp = dp 24 | self.dense_units = dense_units 25 | name = 'TextDPCNN_' + config.token_level 26 | BasicDeepModel.__init__(self, config=config, name=name, **kwargs) 27 | 28 | 29 | def block(self, X, n_filters, filter_size, kernel_reg, bias_reg, first=False, last=False): 30 | """DPCNN网络结构中需要重复的block""" 31 | X1 = Conv1D(n_filters, 
kernel_size=filter_size, padding='same', kernel_regularizer=kernel_reg, bias_regularizer=bias_reg)(X) 32 | X1 = BatchNormalization()(X1) 33 | X1 = PReLU()(X1) 34 | X1 = Conv1D(n_filters, kernel_size=filter_size, padding='same', kernel_regularizer=kernel_reg, bias_regularizer=bias_reg)(X1) 35 | X1 = BatchNormalization()(X1) 36 | X1 = PReLU()(X1) # (, 57, 64) 37 | 38 | if first: 39 | X = Conv1D(n_filters, kernel_size=1, padding='same', kernel_regularizer=kernel_reg, bias_regularizer=bias_reg)(X) # (, 57, 64) 40 | 41 | X = Add()([X, X1]) # (, 57, 64) 42 | 43 | if last: 44 | X = GlobalMaxPooling1D()(X) 45 | else: 46 | X = MaxPooling1D(pool_size=3, strides=2)(X) # (, 28, 64) 47 | return X 48 | 49 | 50 | def model_unit(self, inputs, masking, embedding, n_filters=None, filter_size=None, dp=None, dense_units=None): 51 | """模型主体Unit""" 52 | kernel_reg=regularizers.l2(0.00001) 53 | bias_reg=regularizers.l2(0.00001) 54 | if n_filters is None: 55 | n_filters = self.n_filters 56 | if filter_size is None: 57 | filter_size = self.filter_size 58 | if dp is None: 59 | dp = self.dp 60 | if dense_units is None: 61 | dense_units = self.dense_units 62 | 63 | # Region Embedding 64 | X = masking(inputs) 65 | X = embedding(X) 66 | X = BatchNormalization()(X) # (, 57, 100) 67 | 68 | # 第1层 pre-activation 69 | X = self.block(X, n_filters, filter_size, kernel_reg, bias_reg, first=True) # (, 28, 64) 70 | 71 | # 重复dp次: 不含第1层 72 | flag_last = False 73 | for i in range(dp): 74 | if i + 1 == dp or flag_last: # 最后1层 75 | X = self.block(X, n_filters, filter_size, kernel_reg, bias_reg, last=True) 76 | break # 务必不要忘了break!!! 77 | else: # 中间层 78 | if K.int_shape(X)[1] // 2 < 8: # 此次block操作后没法继续MaxPooling1D,下一层变为最后1层(GlobalMaxPooling1D) 79 | flag_last = True 80 | X = self.block(X, n_filters, filter_size, kernel_reg, bias_reg) 81 | 82 | # 全连接层 83 | X = Dense(dense_units)(X) 84 | X = BatchNormalization()(X) 85 | X = PReLU()(X) 86 | return X 87 | 88 | 89 | def build_model(self): 90 | # 模型主体 91 | if self.config.token_level == 'word': 92 | X = self.model_unit(self.word_input, self.word_masking, self.word_embedding) 93 | inputs = [self.word_input] 94 | 95 | elif self.config.token_level == 'char': 96 | X = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 97 | inputs = [self.char_input] 98 | 99 | else: 100 | # 对word进行特殊处理! 
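            # Sketch of this branch: word tokens skip the DPCNN and instead go through embedding, alternating
            # SpatialDropout1D and BiGRU layers (twice), then global max/avg pooling, while chars (below) use the
            # full DPCNN unit; the two pooled word vectors and the char vector are concatenated afterwards.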
101 | word_X = self.word_embedding(self.word_input) 102 | word_X = SpatialDropout1D(0.25)(word_X) 103 | word_X = Bidirectional(GRU(self.rnn_units, return_sequences=True))(word_X) 104 | word_X = SpatialDropout1D(0.25)(word_X) 105 | word_X = Bidirectional(GRU(self.rnn_units, return_sequences=True))(word_X) 106 | word_maxpool = GlobalMaxPooling1D()(word_X) 107 | word_avgpool = GlobalAveragePooling1D()(word_X) 108 | 109 | char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 110 | X = Concatenate()([word_maxpool, word_avgpool, char_X]) 111 | inputs = [self.word_input, self.char_input] 112 | 113 | 114 | # 结构化特征 115 | if self.config.structured in ['word', 'char', 'both']: 116 | X = Concatenate()([X] + self.structured_input) 117 | inputs = inputs + self.structured_input 118 | 119 | 120 | # 模型结尾 121 | X = Dropout(0.5)(X) 122 | out = Dense(self.n_classes, activation=self.activation)(X) 123 | 124 | self.model = Model(inputs=inputs, outputs=out) -------------------------------------------------------------------------------- /model/TextGRU2_Attn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-17 19:20:59 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from keras.layers import BatchNormalization, Bidirectional, GRU, SpatialDropout1D, Lambda, \ 9 | GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate, Dropout, Dense 10 | from keras.models import Model 11 | 12 | from model.BasicModel import BasicDeepModel 13 | from model.Layers import AttentionWeightedAverage 14 | 15 | 16 | class TextGRU2_Attn(BasicDeepModel): 17 | 18 | def __init__(self, config=None, n_rnns=None, rnn_units=64, dropout_p=0.25, with_attention=True, **kwargs): 19 | if n_rnns is None: 20 | self.n_rnns = (2, 2) if config.token_level == 'both' else 2 21 | self.rnn_units = rnn_units 22 | self.dropout_p = dropout_p 23 | name = 'TextGRU2_Attn_' + config.token_level 24 | BasicDeepModel.__init__(self, config=config, name=name, **kwargs) 25 | 26 | 27 | def model_unit(self, inputs, masking, embedding, n_rnns=None, rnn_units=None, dropout_p=None, with_attention=None): 28 | """模型主体Unit""" 29 | if n_rnns is None: 30 | n_rnns = self.n_rnns 31 | if rnn_units is None: 32 | rnn_units = [self.rnn_units] * n_rnns 33 | if isinstance(rnn_units, int): 34 | rnn_units = [rnn_units] * n_rnns 35 | if dropout_p is None: 36 | dropout_p = [self.dropout_p] * n_rnns 37 | if isinstance(dropout_p, float): 38 | dropout_p = [dropout_p] * n_rnns 39 | if with_attention is None: 40 | with_attention = self.with_attention 41 | 42 | X = masking(inputs) 43 | X = embedding(X) 44 | X = BatchNormalization()(X) 45 | for i in range(n_rnns): 46 | X = Bidirectional(GRU(rnn_units[i], return_sequences=True))(X) 47 | X = SpatialDropout1D(dropout_p[i])(X) 48 | 49 | maxpool = GlobalMaxPooling1D()(X) 50 | avgpool = GlobalAveragePooling1D()(X) 51 | last = Lambda(lambda x: x[:, -1])(X) # TODO 注释掉!? 
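        # `last` is the hidden state of the final timestep, shape (batch, 2*rnn_units); the TODO above questions
        # whether it adds much beyond max/avg pooling and attention. Also note that __init__ stores rnn_units and
        # dropout_p but never sets self.with_attention, which the default fallback in model_unit relies on.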
52 | if with_attention: 53 | attn = AttentionWeightedAverage()(X) 54 | X = Concatenate()([maxpool, avgpool, last, attn]) 55 | else: 56 | X = Concatenate()([maxpool, avgpool, last]) 57 | return X 58 | 59 | 60 | def build_model(self): 61 | # 模型主体 62 | if self.config.token_level == 'word': 63 | X = self.model_unit(self.word_input, self.word_masking, self.word_embedding) 64 | inputs = [self.word_input] 65 | 66 | elif self.config.token_level == 'char': 67 | X = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 68 | inputs = [self.char_input] 69 | 70 | else: 71 | # 对word进行特殊处理! # TODO WHY??? 72 | # TODO 与TextGRU的唯一区别,后续TextAttention和TextAttention2可统一成一个 73 | word_X = self.model_unit(self.word_input, self.word_masking, self.word_embedding, self.n_rnns[0], with_attention=False) 74 | char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding, self.n_rnns[1]) 75 | X = Concatenate()([word_X, char_X]) 76 | inputs = [self.word_input, self.char_input] 77 | 78 | 79 | # 结构化特征 80 | if self.config.structured in ['word', 'char', 'both']: 81 | X = Concatenate()([X] + self.structured_input) 82 | inputs = inputs + self.structured_input 83 | 84 | 85 | # 模型结尾 86 | X = Dropout(0.5)(X) 87 | out = Dense(self.n_classes, activation=self.activation)(X) 88 | 89 | self.model = Model(inputs=inputs, outputs=out) -------------------------------------------------------------------------------- /model/TextGRU_Attn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-17 18:43:40 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from keras.layers import BatchNormalization, Bidirectional, GRU, SpatialDropout1D, Lambda, \ 9 | GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate, Dropout, Dense 10 | from keras.models import Model 11 | 12 | from model.BasicModel import BasicDeepModel 13 | from model.Layers import AttentionWeightedAverage 14 | 15 | 16 | class TextGRU_Attn(BasicDeepModel): 17 | 18 | def __init__(self, config=None, n_rnns=None, rnn_units=64, dropout_p=0.5, **kwargs): 19 | if n_rnns is None: 20 | self.n_rnns = (2, 2) if config.token_level == 'both' else 2 21 | self.rnn_units = rnn_units 22 | self.dropout_p = dropout_p 23 | name = 'TextGRU_Attn_' + config.token_level 24 | BasicDeepModel.__init__(self, config=config, name=name, **kwargs) 25 | 26 | 27 | def model_unit(self, inputs, masking, embedding, n_rnns=None, rnn_units=None, dropout_p=None): 28 | """模型主体Unit""" 29 | if n_rnns is None: 30 | n_rnns = self.n_rnns 31 | if rnn_units is None: 32 | rnn_units = [self.rnn_units] * n_rnns 33 | if isinstance(rnn_units, int): 34 | rnn_units = [rnn_units] * n_rnns 35 | if dropout_p is None: 36 | dropout_p = [self.dropout_p] * n_rnns 37 | if isinstance(dropout_p, float): 38 | dropout_p = [dropout_p] * n_rnns 39 | 40 | X = masking(inputs) 41 | X = embedding(X) 42 | X = BatchNormalization()(X) 43 | for i in range(n_rnns): 44 | X = Bidirectional(GRU(rnn_units[i], return_sequences=True))(X) 45 | X = SpatialDropout1D(dropout_p[i])(X) 46 | 47 | maxpool = GlobalMaxPooling1D()(X) 48 | avgpool = GlobalAveragePooling1D()(X) 49 | last = Lambda(lambda x: x[:, -1])(X) 50 | attn = AttentionWeightedAverage()(X) 51 | X = Concatenate()([maxpool, avgpool, last, attn]) 52 | return X 53 | 54 | 55 | def build_model(self): 56 | # 模型主体 57 | if self.config.token_level == 'word': 58 | X = self.model_unit(self.word_input, self.word_masking, self.word_embedding) 59 | inputs = [self.word_input] 60 | 61 | elif 
self.config.token_level == 'char': 62 | X = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 63 | inputs = [self.char_input] 64 | 65 | else: 66 | word_X = self.model_unit(self.word_input, self.word_masking, self.word_embedding, self.n_rnns[0]) 67 | char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding, self.n_rnns[1]) 68 | X = Concatenate()([word_X, char_X]) 69 | inputs = [self.word_input, self.char_input] 70 | 71 | 72 | # 结构化特征 73 | if self.config.structured in ['word', 'char', 'both']: 74 | X = Concatenate()([X] + self.structured_input) 75 | inputs = inputs + self.structured_input 76 | 77 | 78 | # 模型结尾 79 | X = Dropout(0.5)(X) 80 | out = Dense(self.n_classes, activation=self.activation)(X) 81 | 82 | self.model = Model(inputs=inputs, outputs=out) -------------------------------------------------------------------------------- /model/TextHAN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-20 22:58:52 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from keras.layers import Input, BatchNormalization, Bidirectional, LSTM, TimeDistributed, Dropout, Dense, GRU, Masking, Flatten 9 | from keras.models import Model 10 | from keras.optimizers import Adam 11 | 12 | from model.BasicModel import BasicDeepModel 13 | from model.Layers import Attention, AttentionSelf 14 | 15 | 16 | class TextHAN(BasicDeepModel): 17 | 18 | def __init__(self, config=None, rnn_units1=128, rnn_units2=128, **kwargs): 19 | self.rnn_units1 = rnn_units1 20 | self.rnn_units2 = rnn_units2 21 | self.sent_maxlen = config.SENT_MAXLEN 22 | self.word_maxlen = config.WORD_MAXLEN 23 | self.sent_input = Input(shape=(self.sent_maxlen, self.word_maxlen), dtype='int32', name='sentence1') # (, sent_maxlen, word_maxlen) 24 | name = 'TextHAN' 25 | BasicDeepModel.__init__(self, config=config, name=name, **kwargs) 26 | 27 | 28 | # 方法1:以下参考https://github.com/ShawnyXiao/TextClassification-Keras/blob/master/model/HAN/han.py 29 | # 脚本https://github.com/AlexYangLi/TextClassification/blob/master/models/keras_han_model.py与方法1其实是一样的,只是写法不同 30 | # 脚本https://github.com/richliao/textClassifier/blob/master/textClassifierHATT.py与方法1是一样的 31 | def build_model(self): 32 | # Sentence Part sent_input: (, sent_maxlen, word_maxlen) 33 | X = TimeDistributed(self.word_encoder(), name='word_encoder')(self.sent_input) # (, sent_maxlen, 2*rnn_units1) 34 | X = Masking()(X) # TODO 实验验证,加不加,影响不大。为什么?何时需要? 35 | X = BatchNormalization()(X) 36 | X = Bidirectional(LSTM(self.rnn_units2, return_sequences=True))(X) # (, sent_maxlen, 2*rnn_units2) 37 | X = Attention(self.sent_maxlen)(X) # (, 2*rnn_units2) 38 | 39 | X = Dropout(0.5)(X) 40 | out = Dense(self.n_classes, activation=self.activation)(X) # (, n_classes) 41 | self.model = Model(inputs=self.sent_input, outputs=out) # TODO 注意inputs是Sentence Part的inputs(而非Word Part)! 
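        # Shape flow of this hierarchy:
        #   sent_input (batch, SENT_MAXLEN, WORD_MAXLEN) --TimeDistributed(word_encoder)--> (batch, SENT_MAXLEN, 2*rnn_units1)
        #   --BiLSTM--> (batch, SENT_MAXLEN, 2*rnn_units2) --Attention--> (batch, 2*rnn_units2) --Dense--> (batch, n_classes)
        # so the x fed to this model must be 3D: (n_samples, SENT_MAXLEN, WORD_MAXLEN).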
42 | 43 | 44 | def word_encoder(self): 45 | # Word Part 模型,提供word level的编码功能 46 | word_X = self.word_masking(self.word_input) # (, word_maxlen) 47 | word_X = self.word_embedding(word_X) # (, word_maxlen, word_embed_dim) 48 | word_X = BatchNormalization()(word_X) 49 | word_X = Bidirectional(LSTM(self.rnn_units1, return_sequences=True))(word_X) # (, word_maxlen, 2*rnn_units1) 50 | word_out = Attention(self.word_maxlen)(word_X) # (, 2*rnn_units1) # TODO 能不能使用AttentionAverageWeighted 51 | return Model(inputs=self.word_input, outputs=word_out) 52 | 53 | 54 | def train_evaluate(self, x_train, y_train, x_test, y_test, lr=1e-4, epochs=None): 55 | """经测试,only Step1, only Step2, Step1+Step2, 这3种训练模式效果差不多,only Step2略微好一丁点""" 56 | # 模型训练 57 | print('【' + self.name + '】') 58 | self.mode = 3 59 | epochs = epochs if epochs else (2, self.n_epochs) 60 | 61 | def model_compile_fit(lr=1e-4, epochs=3): 62 | self.model.compile(loss=self.loss, optimizer=Adam(lr=lr), metrics=self.metrics) 63 | return self.model.fit(x_train, y_train, 64 | batch_size=self.batch_size*self.config.n_gpus, 65 | epochs=epochs, 66 | validation_split=0.3, 67 | callbacks=None) 68 | 69 | print('-------------------Step1: 前期冻结Word_Encoder层,编译和训练模型-------------------') 70 | self.model.get_layer('word_encoder').trainable = False # TODO word_encoder由很多层组成,如何只设置其中的Embedding?? 71 | history1 = model_compile_fit(1e-4, 3) 72 | history1 = model_compile_fit(1e-5, 3) 73 | history1 = model_compile_fit(1e-6, 3) 74 | history1 = model_compile_fit(1e-7, 3) 75 | 76 | print('-------------Step2: 训练完参数后,解冻Word_Encoder层,再次编译和训练模型------------') 77 | self.model.get_layer('word_encoder').trainable = True 78 | history2 = model_compile_fit(1e-4, 3) 79 | history2 = model_compile_fit(1e-5, 3) 80 | history2 = model_compile_fit(1e-6, 3) 81 | history2 = model_compile_fit(1e-7, 3) 82 | self.plot_history(history2) 83 | history = (history1, history2) 84 | 85 | # 模型评估 86 | test_acc, scores, sims, vectors, test_pred = self._evaluate(x_test, y_test) 87 | pickle.dump(test_pred, open('./result/' + self.name + '_test_pred.pkl', 'wb')) 88 | return test_acc, scores, sims, vectors, history 89 | 90 | 91 | # 方法2:以下参考https://github.com/yongzhuo/Keras-TextClassification/blob/master/keras_textclassification/m12_HAN/graph.py 92 | # 方法1使用了Attention机制,而方法2使用了Self-Attention即Transformer机制! 93 | # TODO 输入是self.word_embedding.input???待研究! 
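    # For contrast: AttentionSelf (model/Layers.py) is scaled dot-product self-attention,
    # res = softmax(Qx·(Kx)^T / sqrt(64))·Vx with three learned projections (the scale is hard-coded, see its TODO),
    # whereas the Attention used in build_model above is single-vector additive attention softmax(tanh(Wx + b)).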
94 | def build_model2(self): 95 | # Word Part 96 | word_X = self.word_embedding.output # (, word_maxlen, word_embed_dim) 97 | word_X = Bidirectional(GRU(units=self.rnn_units1, return_sequences=True, activation='relu'))(word_X) # (, word_maxlen, 2*rnn_units1) 98 | word_X = AttentionSelf(self.rnn_units*2)(word_X) # (, word_maxlen, 2*rnn_units) 99 | word_X = Dropout(0.5)(word_X) 100 | 101 | # Sentence Part 102 | X = Bidirectional(GRU(units=self.rnn_units2, return_sequences=True, activation='relu'))(word_X) # (, word_maxlen, 2*rnn_units2) 103 | X = AttentionSelf(self.word_embed_dim)(X) # (, word_maxlen, word_embed_dim) 104 | X = Dropout(0.5)(X) 105 | 106 | X = Flatten()(X) # (, word_maxlen * word_embed_dim) 107 | out = Dense(self.n_classes, activation=self.activation)(X) # (, n_classes) 108 | self.model = Model(inputs=self.word_embedding.input, outputs=out) 109 | 110 | 111 | 112 | if __name__ == '__main__': 113 | 114 | import pickle 115 | from Vocabulary import Vocabulary 116 | from Config import Config 117 | config = Config() 118 | 119 | # data和config准备 详情请参考脚本 ModelTrain.py 120 | config = pickle.load(open(config.config_file, 'rb')) 121 | x_train, y_train, x_test, y_test = pickle.load(open(config.data_encoded_file, 'rb')) 122 | 123 | # 根据实际情况修改,也可直接在Config.py里修改,推荐前者 124 | config.n_gpus = 1 125 | config.token_level = 'word' 126 | config.structured = 'none' 127 | config.bert_flag = False 128 | 129 | # 模型创建与训练 130 | texthan = TextHAN(config) 131 | test_acc, scores, sims, vectors, history = texthan.train_evaluate(x_train, y_train, x_test, y_test) 132 | 133 | texthan.model.save(config.model_file) 134 | -------------------------------------------------------------------------------- /model/TextLSTMGRU_Attn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-17 21:42:28 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from keras.layers import BatchNormalization, SpatialDropout1D, Bidirectional, LSTM, GRU, \ 9 | GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate, Dropout, Dense 10 | from keras.models import Model 11 | 12 | from model.BasicModel import BasicDeepModel 13 | from model.Layers import AttentionWeightedAverage 14 | 15 | 16 | class TextLSTMGRU_Attn(BasicDeepModel): 17 | 18 | def __init__(self, config=None, rnn_units=64, dropout_p=0.25, with_attention=False, **kwargs): 19 | self.rnn_units = rnn_units 20 | self.dropout_p = dropout_p 21 | self.with_attention = with_attention 22 | name = 'TextLSTMGRU_Attn_' + config.token_level 23 | BasicDeepModel.__init__(self, config=config, name=name, **kwargs) 24 | 25 | 26 | def model_unit(self, inputs, masking, embedding, dropout_p=None, rnn_units=None, with_attention=None): 27 | """模型主体Unit""" 28 | if dropout_p is None: 29 | dropout_p = (self.dropout_p, self.dropout_p) 30 | if isinstance(dropout_p, float): 31 | dropout_p = (dropout_p, dropout_p) 32 | if rnn_units is None: 33 | rnn_units = (self.rnn_units, self.rnn_units) 34 | if isinstance(rnn_units, int): 35 | rnn_units = (rnn_units, rnn_units) 36 | if with_attention is None: 37 | with_attention = self.with_attention 38 | 39 | X = masking(inputs) 40 | X = embedding(X) 41 | X = BatchNormalization()(X) 42 | X = SpatialDropout1D(dropout_p[0])(X) 43 | X = Bidirectional(LSTM(rnn_units[0], return_sequences=True))(X) 44 | X = SpatialDropout1D(dropout_p[1])(X) 45 | X = Bidirectional(GRU(rnn_units[1], return_sequences=True))(X) 46 | 47 | maxpool = GlobalMaxPooling1D()(X) 48 | avgpool = GlobalAveragePooling1D()(X) 49 
| if with_attention: 50 | attn = AttentionWeightedAverage()(X) 51 | X = Concatenate()([maxpool, avgpool, attn]) 52 | else: 53 | X = Concatenate()([maxpool, avgpool]) 54 | return X 55 | 56 | 57 | def build_model(self): 58 | # 模型主体 59 | if self.config.token_level == 'word': 60 | X = self.model_unit(self.word_input, self.word_masking, self.word_embedding, 0.33, 100) 61 | inputs = [self.word_input] 62 | 63 | elif self.config.token_level == 'char': 64 | X = self.model_unit(self.char_input, self.char_masking, self.char_embedding, 0.2, 100) 65 | inputs = [self.char_input] 66 | 67 | else: 68 | word_X = self.model_unit(self.word_input, self.word_masking, self.word_embedding, (0.5, 0.1), 30) 69 | char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding, 0.2, 100) 70 | X = Concatenate()([word_X, char_X]) 71 | inputs = [self.word_input, self.char_input] 72 | 73 | 74 | # 结构化特征 75 | if self.config.structured in ['word', 'char', 'both']: 76 | X = Concatenate()([X] + self.structured_input) 77 | inputs = inputs + self.structured_input 78 | 79 | 80 | # 模型结尾 81 | X = Dropout(0.5)(X) 82 | out = Dense(self.n_classes, activation=self.activation)(X) 83 | 84 | self.model = Model(inputs=inputs, outputs=out) -------------------------------------------------------------------------------- /model/TextLSTM_Attn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-16 15:32:58 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from keras.layers import BatchNormalization, Bidirectional, LSTM, Concatenate, Dropout, \ 9 | Flatten, Dense, Lambda 10 | from keras.models import Model 11 | from keras import backend as K 12 | 13 | from model.BasicModel import BasicDeepModel 14 | from model.Layers import AttentionWeightedAverage 15 | 16 | 17 | class TextLSTM_Attn(BasicDeepModel): 18 | """TextLSTM模型,支持char, word和both,支持Attention""" 19 | 20 | def __init__(self, config=None, n_rnns=None, rnn_units=64, with_sth='mean', **kwargs): 21 | if n_rnns is None: 22 | self.n_rnns = (1, 1) if config.token_level == 'both' else 1 23 | self.rnn_units = rnn_units 24 | assert with_sth in ('mean', 'flatten', 'attention') 25 | self.with_sth = with_sth 26 | name = 'TextLSTM_Attn_' + with_sth + '_' + config.token_level 27 | BasicDeepModel.__init__(self, config=config, name=name, **kwargs) 28 | 29 | 30 | def model_unit(self, inputs, masking, embedding, n_rnns=None, rnn_units=None, with_sth=None): 31 | """模型主体Unit""" 32 | if n_rnns is None: 33 | n_rnns = self.n_rnns 34 | if rnn_units is None: 35 | rnn_units = [self.rnn_units] * n_rnns 36 | if isinstance(rnn_units, int): 37 | rnn_units = [rnn_units] * n_rnns 38 | if with_sth is None: 39 | with_sth = self.with_sth 40 | 41 | X = masking(inputs) 42 | X = embedding(X) 43 | X = BatchNormalization()(X) 44 | for i in range(n_rnns): 45 | X = Bidirectional(LSTM(rnn_units[i], return_sequences=True))(X) # TODO LSTM VS CuDNNLSTM 128需要动态变化? 46 | X = Dropout(0.5)(X) # TODO TextAttention此处为SpatialDropout1D??? 47 | 48 | # X's shape = (None, word_maxlen, 2*rnn_units) # TODO shape要变成2维的,才能输入到输出层!!! 49 | if with_sth == 'mean': 50 | X = Lambda(lambda x: K.mean(x, axis=1))(X) # (None, 2*rnn_units) # TODO 不能写成 X=K.mean(X,axis=1),会报错! 
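        # Wrapping K.mean in a Lambda layer keeps the result the output of a Keras layer, so the functional Model
        # can trace it; applying the backend op to the tensor directly yields a plain tensor without layer metadata
        # and model building fails, which is what the TODO above refers to.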
51 | elif with_sth == 'flatten': 52 | X = Flatten()(X) # (None, word_maxlen*2*rnn_units) 53 | elif with_sth == 'attention': 54 | X = AttentionWeightedAverage()(X) # (None, 2*rnn_units) 55 | return X 56 | 57 | 58 | def build_model(self): 59 | # 模型主体 60 | if self.config.token_level == 'word': 61 | X = self.model_unit(self.word_input, self.word_masking, self.word_embedding) 62 | inputs = [self.word_input] 63 | 64 | elif self.config.token_level == 'char': 65 | X = self.model_unit(self.char_input, self.char_masking, self.char_embedding) 66 | inputs = [self.char_input] 67 | 68 | else: 69 | word_X = self.model_unit(self.word_input, self.word_masking, self.word_embedding, self.n_rnns[0]) 70 | char_X = self.model_unit(self.char_input, self.char_masking, self.char_embedding, self.n_rnns[1]) 71 | X = Concatenate()([word_X, char_X]) 72 | inputs = [self.word_input, self.char_input] 73 | 74 | 75 | # 结构化特征 76 | if self.config.structured in ['word', 'char', 'both']: 77 | X = Concatenate()([X] + self.structured_input) 78 | inputs = inputs + self.structured_input 79 | 80 | 81 | # 模型结尾 82 | X = Dropout(0.5)(X) 83 | out = Dense(self.n_classes, activation=self.activation)(X) 84 | 85 | self.model = Model(inputs=inputs, outputs=out) -------------------------------------------------------------------------------- /model/TextRCNN_Attn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created: 2019-08-18 14:52:38 4 | Author: liuyao8 5 | Descritipn: 6 | """ 7 | 8 | from keras.layers import Input, BatchNormalization, Bidirectional, GRU, Dropout, Lambda, \ 9 | Concatenate, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, Dense 10 | from keras import backend as K 11 | from keras.models import Model 12 | 13 | from model.BasicModel import BasicDeepModel 14 | from model.Layers import AttentionWeightedAverage 15 | 16 | 17 | class TextRCNN_Attn(BasicDeepModel): 18 | """简易版TextRCNN""" 19 | 20 | def __init__(self, config=None, rnn_units=64, n_filters=64, **kwargs): 21 | self.rnn_units = rnn_units 22 | self.n_filters = n_filters 23 | name = 'TextRCNN_Attn_' + config.token_level 24 | BasicDeepModel.__init__(self, config=config, name=name, **kwargs) 25 | 26 | 27 | def model_unit(self, inputs, left_inputs, right_inputs, masking, embedding, rnn_units=None, n_filters=None): 28 | """模型主体Unit""" 29 | if rnn_units is None: 30 | rnn_units = [self.rnn_units] * 3 31 | if isinstance(rnn_units, int): 32 | rnn_units = [rnn_units] * 3 33 | if n_filters is None: 34 | n_filters = self.n_filters 35 | 36 | X = masking(inputs) 37 | X = embedding(X) 38 | X = BatchNormalization()(X) 39 | X = Bidirectional(GRU(rnn_units[0], return_sequences=True))(X) 40 | 41 | left_X = masking(left_inputs) 42 | left_X = embedding(left_X) 43 | left_X = BatchNormalization()(left_X) 44 | left_X = Bidirectional(GRU(rnn_units[1], return_sequences=True))(left_X) 45 | 46 | right_X = masking(right_inputs) 47 | right_X = embedding(right_X) 48 | right_X = BatchNormalization()(right_X) 49 | right_X = Dropout(0.5)(right_X) 50 | right_X = Bidirectional(GRU(rnn_units[2], return_sequences=True, go_backwards=True))(right_X) 51 | right_X = Lambda(lambda x: K.reverse(x, axes=1))(right_X) 52 | 53 | concat = Concatenate()([X, left_X, right_X]) 54 | concat = Conv1D(n_filters, kernel_size=1, activation='relu')(concat) 55 | 56 | # TODO 为什么没有left_x与x交互的操作!!??right_x与x同理!!?? 57 | # 比如上一个left与上一个word共同生成当前left???(详见论文中的公式1和2!!!) 58 | # 另外,与论文相比或与别的实现相比,下面这些是多余的,应该直接到output=Dense那里 ??? 
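        # For reference, the RCNN paper (Lai et al., 2015) builds the contexts recurrently, roughly
        #   c_l(w_i) = f(W_l·c_l(w_{i-1}) + W_sl·e(w_{i-1}))   (and symmetrically for c_r),
        # then y_i = tanh(W·[c_l(w_i); e(w_i); c_r(w_i)] + b) followed by max-pooling only; here the left/right
        # sequences are encoded by independent BiGRUs instead, which is what the TODOs above are pointing at.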
59 | maxpool = GlobalMaxPooling1D()(concat) 60 | avgpool = GlobalAveragePooling1D()(concat) 61 | attn = AttentionWeightedAverage()(concat) 62 | X = Concatenate()([maxpool, avgpool, attn]) 63 | return X 64 | 65 | 66 | def build_model(self): 67 | # 额外的Input 68 | self.word_left_inputs = Input(shape=(self.word_maxlen, ), name='word_left') 69 | self.word_right_inputs = Input(shape=(self.word_maxlen, ), name='word_right') 70 | self.char_left_inputs = Input(shape=(self.char_maxlen, ), name='char_left') 71 | self.char_right_inputs = Input(shape=(self.char_maxlen, ), name='char_right') 72 | 73 | # 模型主体 74 | if self.config.token_level == 'word': 75 | X = self.model_unit(self.word_input, self.word_left_inputs, self.word_right_inputs, self.word_masking, self.word_embedding) 76 | inputs = [self.word_input, self.word_left_inputs, self.word_right_inputs] 77 | 78 | elif self.config.token_level == 'char': 79 | X = self.model_unit(self.char_input, self.char_left_inputs, self.char_right_inputs, self.char_masking, self.char_embedding) 80 | inputs = [self.char_input, self.char_left_inputs, self.char_right_inputs] 81 | 82 | else: 83 | word_X = self.model_unit(self.word_input, self.word_left_inputs, self.word_right_inputs, self.word_masking, self.word_embedding) 84 | char_X = self.model_unit(self.char_input, self.char_left_inputs, self.char_right_inputs, self.char_masking, self.char_embedding) 85 | X = Concatenate()([word_X, char_X]) 86 | inputs = [self.word_input, self.word_left_inputs, self.word_right_inputs, \ 87 | self.char_input, self.char_left_inputs, self.char_right_inputs] 88 | 89 | 90 | # 结构化特征 91 | if self.config.structured in ['word', 'char', 'both']: 92 | X = Concatenate()([X] + self.structured_input) 93 | inputs = inputs + self.structured_input 94 | 95 | 96 | # 模型结尾 97 | X = Dropout(0.5)(X) 98 | out = Dense(self.n_classes, activation=self.activation)(X) 99 | 100 | self.model = Model(inputs=inputs, outputs=out) --------------------------------------------------------------------------------