├── models ├── __init__.py ├── ranking.py ├── sentence_embedding.py ├── sim_bert.py ├── knowledge_distiilation.py └── model_base.py ├── tasks ├── __init__.py ├── classifier.py ├── embedding_task.py ├── Itr_pair_task.py ├── ner_task.py ├── distillation_task.py └── ranking_task.py ├── bert_service ├── __init__.py ├── docker_start.sh ├── model_saving_utils.py └── embedding_serving.py ├── model_configs ├── __init__.py ├── sentence_embedding.json ├── bert_ner.json ├── sim_bert.json ├── ranking.json ├── classifier.json └── distill_bert.json ├── data_processor ├── embedding_data_generator.py ├── __pycache__ │ ├── embedding.cpython-36.pyc │ ├── embedding.cpython-37.pyc │ ├── tokenizer.cpython-36.pyc │ ├── tokenizer.cpython-37.pyc │ ├── base_processor.cpython-36.pyc │ ├── base_processor.cpython-37.pyc │ ├── ner_data_generator.cpython-36.pyc │ ├── ner_data_generator.cpython-37.pyc │ ├── classifier_data_generator.cpython-36.pyc │ ├── classifier_data_generator.cpython-37.pyc │ ├── text_match_data_generator.cpython-36.pyc │ └── text_match_data_generator.cpython-37.pyc ├── embedding.py ├── base_processor.py ├── classifier_data_generator.py ├── ner_data_generator.py ├── text_match_data_generator_v2.py ├── tokenizer.py └── text_match_data_generator.py ├── requirements.txt ├── model └── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── sim_bert.cpython-37.pyc │ ├── model_base.cpython-37.pyc │ └── sentence_embedding.cpython-37.pyc ├── trainer ├── __pycache__ │ ├── train_base.cpython-36.pyc │ └── train_base.cpython-37.pyc └── train_base.py ├── predictor ├── __pycache__ │ └── predict_base.cpython-36.pyc ├── predict_base.py └── predict.py └── README.md /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bert_service/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model_configs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_processor/embedding_data_generator.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==2.4.0 2 | tf-models-official==2.4.0 3 | jieba 4 | gensim 5 | pandas -------------------------------------------------------------------------------- /model/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dextroushands/pretraind_model_for_nlp_tasks/HEAD/model/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /model/__pycache__/sim_bert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dextroushands/pretraind_model_for_nlp_tasks/HEAD/model/__pycache__/sim_bert.cpython-37.pyc 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pretraind_model_for_nlp_tasks 2 | 3 | Builds NLP tasks on top of pretrained models. The work is organized into four main tasks: 4 | 5 | 1. sentence embedding: sentence vector representation 6 | 7 | 2. classifier: text classification 8 | 9 | 3. text match: text matching 10 | 11 | 4. ner: named entity recognition 12 | -------------------------------------------------------------------------------- /model_configs/sentence_embedding.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "sentence_embedding", 3 | "seq_len": 100, 4 | "pooled_output_size": 256, 5 | "vocab_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/vocab.txt", 6 | "bert_config_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_config.json", 7 | "bert_model_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_model.ckpt-1", 8 | "output_path": 
"../output_path/sentence_embedding" 9 | } -------------------------------------------------------------------------------- /bert_service/docker_start.sh: -------------------------------------------------------------------------------- 1 | docker stop new_serve 2 | docker rm new_serve 3 | docker rmi my_img 4 | docker run -d --name serving_base tensorflow/serving:2.4.1 5 | 6 | docker cp /Users/donruo/Desktop/project/bert_tasks/chinese_wwm_ext_L-12_H-768_A-12/serve/versions/ serving_base:/models/my_model 7 | 8 | docker commit --change "ENV MODEL_NAME my_model" serving_base my_img 9 | docker stop serving_base 10 | docker rm serving_base 11 | 12 | docker run --name new_serve -p 8501:8501 -p 8500:8500 my_img 13 | -------------------------------------------------------------------------------- /model_configs/bert_ner.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "bert_ner", 3 | "learning_rate": 1e-3, 4 | "epoches": 10, 5 | "batch_size": 8, 6 | "optimizer": "adam", 7 | "multi_label": 0, 8 | "tag_categories": 9, 9 | "seq_len": 50, 10 | "dropout_rate": 0.2, 11 | "vocab_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/vocab.txt", 12 | "bert_config_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_config.json", 13 | "bert_model_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_model.ckpt-1", 14 | "output_path": "../output_path/ner", 15 | "ckpt_model_path": "output_path/ckpt_model/bert_ner", 16 | "export_model_path": "output_path/export_model", 17 | "data_path": "/Users/donruo/Desktop/project/nlp_models/corpus/pd2014/", 18 | "test_data": "/Users/donruo/Desktop/project/search_algorithm/query_understand/short_text_classify/data/news/test.tsv" 19 | } -------------------------------------------------------------------------------- /model_configs/sim_bert.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "simbert", 3 | "learning_rate": 1e-7, 4 | "epoches": 10, 5 | "batch_size": 8, 6 | "optimizer": "adam", 7 | "multi_label": 0, 8 | "neg_threshold": 0.4, 9 | "freq_filter": 1, 10 | "seq_len": 50, 11 | "dropout_rate": 0.2, 12 | "vocab_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/vocab.txt", 13 | "bert_config_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_config.json", 14 | "bert_model_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_model.ckpt-1", 15 | "output_path": "../output_path/sim", 16 | "ckpt_model_path": "output_path/ckpt_model/simbert", 17 | "export_model_path": "output_path/export_model", 18 | "data_path": "/Users/donruo/Desktop/project/text_match/data/train.csv", 19 | "test_data": "/Users/donruo/Desktop/project/search_algorithm/query_understand/short_text_classify/data/news/test.tsv" 20 | } -------------------------------------------------------------------------------- /model_configs/ranking.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "ranking", 3 | "learning_rate": 1e-5, 4 | "epoches": 10, 5 | "batch_size": 2, 6 | "optimizer": "adam", 7 | "multi_label": 0, 8 | "num_classes": 2, 9 | "embedding_size": 300, 10 | "seq_len": 128, 11 | "dropout_rate": 0.2, 12 | "num_samples": 7, 13 | "vocab_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/vocab.txt", 14 | "bert_config_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_config.json", 15 | "bert_model_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_model.ckpt-1", 16 | "output_path": "../output_path/ranking", 17 | "ckpt_model_path": 
"output_path/ckpt_model/ranking", 18 | "export_model_path": "output_path/export_model", 19 | "data_path": "/Users/donruo/Desktop/project/QA/data/标准FAQ.xlsx", 20 | "test_data": "/Users/donruo/Desktop/project/search_algorithm/query_understand/short_text_classify/data/news/test.tsv" 21 | } -------------------------------------------------------------------------------- /model_configs/classifier.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "bert_classifier", 3 | "learning_rate": 1e-5, 4 | "epoches": 10, 5 | "batch_size": 8, 6 | "optimizer": "adam", 7 | "multi_label": 0, 8 | "num_classes": 15, 9 | "embedding_size": 300, 10 | "seq_len": 128, 11 | "dropout_rate": 0.2, 12 | "vocab_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/vocab.txt", 13 | "bert_config_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_config.json", 14 | "bert_model_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_model.ckpt-1", 15 | "output_path": "../output_path", 16 | "ckpt_model_path": "output_path/ckpt_model/bert_classifier", 17 | "export_model_path": "output_path/export_model", 18 | "data_path": "/Users/donruo/Desktop/project/search_algorithm/query_understand/short_text_classify/data/news/train.tsv", 19 | "test_data": "/Users/donruo/Desktop/project/search_algorithm/query_understand/short_text_classify/data/news/test.tsv" 20 | } -------------------------------------------------------------------------------- /model_configs/distill_bert.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "distill_model", 3 | "learning_rate": 1e-7, 4 | "epoches": 10, 5 | "batch_size": 8, 6 | "optimizer": "adam", 7 | "multi_label": 0, 8 | "neg_threshold": 0.4, 9 | "freq_filter": 1, 10 | "seq_len": 50, 11 | "dropout_rate": 0.2, 12 | "use_word2vec": 0, 13 | "t": 2, 14 | "alpha": 0.4, 15 | "embedding_size": 128, 16 | "hidden_size": 64, 17 | "output_size": 128, 18 | "is_training": 1, 19 | "vocab_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/vocab.txt", 20 | "bert_config_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_config.json", 21 | "bert_model_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_model.ckpt-1", 22 | "output_path": "../output_path/distill", 23 | "ckpt_model_path": "output_path/ckpt_model/distill_bert", 24 | "export_model_path": "output_path/export_model", 25 | "data_path": "/Users/donruo/Desktop/project/text_match/data/train.csv", 26 | "test_data": "/Users/donruo/Desktop/project/search_algorithm/query_understand/short_text_classify/data/news/test.tsv" 27 | } -------------------------------------------------------------------------------- /models/ranking.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from official.nlp.modeling import layers 3 | from official.nlp.modeling import networks 4 | 5 | 6 | class Ranking(tf.keras.Model): 7 | ''' 8 | bert的排序模型 9 | ''' 10 | def __init__(self, config, network, **kwargs): 11 | self.config = config 12 | # 定义模型输入 13 | word_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids') 14 | mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask') 15 | type_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids') 16 | input = [word_ids, mask, type_ids] 17 | 18 | _output = network(input) 19 | classifier = networks.Classification( 20 | input_width=_output[1].shape[-1], 21 | num_classes=self.config['num_classes'], 22 | output='logits', 23 
| name='sentence_prediction') 24 | _logits = classifier(_output[1]) #[batch_size*samples_num, 1] 25 | logits = tf.split(_logits, num_or_size_splits=self.config['batch_size'], axis=0) 26 | _relations = tf.keras.layers.Activation(tf.nn.sigmoid)(logits) 27 | predictions = tf.reshape(tf.argmax(_relations), [-1]) 28 | outputs = dict(logits=logits, predictions=predictions) 29 | super(Ranking, self).__init__(inputs=input, outputs=outputs, **kwargs) 30 | -------------------------------------------------------------------------------- /predictor/predict_base.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import os 3 | import pickle 4 | import json 5 | from data_processor.tokenizer import tokenizer 6 | from models.model_base import BaseModel 7 | 8 | 9 | class BasePredictor(BaseModel): 10 | ''' 11 | 构建预测的基础对象 12 | ''' 13 | def __init__(self, config): 14 | self.tokenizer = tokenizer(config) 15 | super(BasePredictor, self).__init__(config) 16 | 17 | def load_ckpt_model(self, model, path, model_name): 18 | ''' 19 | 加载ckpt模型 20 | :param model_path: 21 | :return: 22 | ''' 23 | # models = self.create_model() 24 | path = '../'+os.path.join(path, model_name) 25 | model.load_weights(path) 26 | # ckpt = tf.train.Checkpoint(model=model) 27 | # init_checkpoint = path 28 | # 29 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 30 | 31 | return model 32 | 33 | def create_model(self): 34 | ''' 35 | 创建模型 36 | :return: 37 | ''' 38 | raise NotImplemented 39 | 40 | def load_vocab(self): 41 | ''' 42 | 加载词典 43 | :return: 44 | ''' 45 | raise NotImplemented 46 | 47 | def predict(self, sentence): 48 | ''' 49 | 预测句子结果 50 | :param sentence: 51 | :return: 52 | ''' 53 | raise NotImplemented 54 | -------------------------------------------------------------------------------- /models/sentence_embedding.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from official.nlp.keras_nlp.encoders.bert_encoder import BertEncoder 4 | 5 | class SentenceEmbedding(tf.keras.Model): 6 | ''' 7 | 句子向量 8 | ''' 9 | def __init__(self, 10 | encoder_network: tf.keras.Model, 11 | # sequence_length, 12 | config = None, 13 | **kwargs): 14 | # self.encoder_network = encoder_network 15 | self.config = config 16 | # self.sequence_length = sequence_length 17 | 18 | # sequence_length = tf.keras.Input(shape=(None,), dtype=tf.int32, name='seqence_length') 19 | sequence_length = self.config['seq_len'] 20 | inputs = encoder_network.inputs 21 | outputs = encoder_network(inputs) 22 | if isinstance(outputs, list): 23 | sequence_output = outputs[0][-1] 24 | cls_output = outputs[1] 25 | encoder_outputs = outputs[0] 26 | else: 27 | sequence_output = outputs['sequence_output'] 28 | cls_output = outputs['pooled_output'] 29 | encoder_outputs = outputs['encoder_outputs'] 30 | 31 | #取第一层和最后一层的均值作为句子embedding 32 | # if isinstance(sequence_length, int): 33 | first_layer_outputs = encoder_outputs[0][:, :sequence_length, :] 34 | last_layer_outputs = encoder_outputs[-1][:, :sequence_length, :] 35 | average = (first_layer_outputs + last_layer_outputs) / 2.0 36 | sentence_embedding = tf.reduce_mean(average, axis=1) 37 | # else: 38 | # sentence_embedding = [] 39 | # for i in range(self.config['batch_size']): 40 | # first_layer_outputs = encoder_outputs[0][:, :sequence_length[i], :] 41 | # last_layer_outputs = encoder_outputs[-1][:, :sequence_length[i], :] 42 | # average = (first_layer_outputs + last_layer_outputs) / 2.0 43 | 
# sentence_embedding.append(tf.reduce_mean(average, axis=1)) 44 | _pooler_layer = tf.keras.layers.Dense( 45 | units=self.config['pooled_output_size'], 46 | activation='tanh', 47 | kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), 48 | name='pooler_transform') 49 | outputs = _pooler_layer(sentence_embedding) 50 | 51 | super(SentenceEmbedding, self).__init__(inputs=inputs, outputs=outputs, **kwargs) -------------------------------------------------------------------------------- /bert_service/model_saving_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Utilities to save models.""" 16 | 17 | import os 18 | 19 | from absl import logging 20 | import tensorflow as tf 21 | import typing 22 | 23 | 24 | def export_bert_model(model_export_path: typing.Text, 25 | model: tf.keras.Model, 26 | checkpoint_dir: typing.Optional[typing.Text] = None, 27 | restore_model_using_load_weights: bool = False) -> None: 28 | """Export BERT model for serving which does not include the optimizer. 29 | 30 | Args: 31 | model_export_path: Path to which exported model will be saved. 32 | model: Keras model object to export. 33 | checkpoint_dir: Path from which model weights will be loaded, if 34 | specified. 35 | restore_model_using_load_weights: Whether to use checkpoint.restore() API 36 | for custom checkpoint or to use model.load_weights() API. There are 2 37 | different ways to save checkpoints. One is using tf.train.Checkpoint and 38 | another is using Keras model.save_weights(). Custom training loop 39 | implementation uses tf.train.Checkpoint API and Keras ModelCheckpoint 40 | callback internally uses model.save_weights() API. Since these two API's 41 | cannot be used toghether, model loading logic must be take into account 42 | how model checkpoint was saved. 43 | 44 | Raises: 45 | ValueError when either model_export_path or model is not specified. 46 | """ 47 | if not model_export_path: 48 | raise ValueError('model_export_path must be specified.') 49 | if not isinstance(model, tf.keras.Model): 50 | raise ValueError('model must be a tf.keras.Model object.') 51 | 52 | if checkpoint_dir: 53 | if restore_model_using_load_weights: 54 | model_weight_path = os.path.join(checkpoint_dir, 'checkpoint') 55 | assert tf.io.gfile.exists(model_weight_path) 56 | model.load_weights(model_weight_path) 57 | else: 58 | checkpoint = tf.train.Checkpoint(model=model) 59 | 60 | # Restores the model from latest checkpoint. 
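      # tf.train.latest_checkpoint() reads the "checkpoint" state file inside
      # checkpoint_dir and returns the prefix of the most recently written
      # checkpoint, or None if no checkpoint state is present.
      # Checkpoint.restore() matches variables through the object graph rooted
      # at `model`; assert_existing_objects_matched() raises if variables that
      # already exist in that graph were not found in the checkpoint, which
      # surfaces architecture/checkpoint mismatches early.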
61 | latest_checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) 62 | assert latest_checkpoint_file 63 | logging.info('Checkpoint file %s found and restoring from ' 64 | 'checkpoint', latest_checkpoint_file) 65 | checkpoint.restore( 66 | latest_checkpoint_file).assert_existing_objects_matched() 67 | 68 | model.save(model_export_path, include_optimizer=False, save_format='tf') 69 | -------------------------------------------------------------------------------- /trainer/train_base.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from models.model_base import BaseModel 4 | 5 | 6 | class TrainBase(BaseModel): 7 | ''' 8 | 模型训练基础 9 | ''' 10 | def __init__(self, train_config): 11 | self.epoches = train_config['epoches'] 12 | self.data_generator = None 13 | 14 | super(TrainBase, self).__init__(train_config) 15 | 16 | def train(self, model): 17 | ''' 18 | 训练过程 19 | :return: 20 | ''' 21 | model.summary() 22 | optimizer = self.get_optimizer() 23 | metrics = self.build_metrics() 24 | batch_num = 0 25 | valid_loss = 0 26 | best_acc = 0 27 | mean_acc = 0 28 | for i in range(self.epoches): 29 | print("------------start train epoch {}--------------------".format(i)) 30 | for train_batch in self.data_generator.gen_data(self.data_generator.train_data, self.data_generator.train_label): 31 | train_input = self.build_inputs(train_batch) 32 | train_loss = self.train_step(train_input, model, optimizer, metrics) 33 | print(train_loss) 34 | batch_num += 1 35 | 36 | if batch_num % 3 == 0: 37 | print("------------start validation epoch {}--------------".format(i)) 38 | count = 0 39 | sum_acc = 0 40 | for valid_batch in self.data_generator.gen_data(self.data_generator.eval_data, self.data_generator.eval_label): 41 | count += 1 42 | valid_input = self.build_inputs(valid_batch) 43 | valid_loss = self.validation_step(valid_input, model, metrics=metrics) 44 | print("accuracy: {}".format(metrics[0].result().numpy())+'\n') 45 | sum_acc += metrics[0].result().numpy() 46 | mean_acc = sum_acc/count 47 | if mean_acc > best_acc: 48 | best_acc = mean_acc 49 | # print('save models') 50 | self.save_ckpt_model(model) 51 | self.save_pb_model(model) 52 | 53 | def fit_train(self, model): 54 | ''' 55 | 使用fit训练模型 56 | :return: 57 | ''' 58 | optimizer = self.get_optimizer() 59 | metrics = self.build_metrics() 60 | 61 | model = self.compile_model(model, 62 | optimizer=optimizer, 63 | train_step=self.train_step, 64 | validation_step=self.validation_step, 65 | metrics=metrics) 66 | model.summary() 67 | dataset = self.data_generator.gen_data(self.data_generator.train_data, self.data_generator.train_label) 68 | valid_data = self.data_generator.gen_data(self.data_generator.eval_data, self.data_generator.eval_label) 69 | # dataset = dataset.repeat() 70 | # valid_data = valid_data.repeat() 71 | # dataset = self.build_inputs(data_) 72 | logs = model.fit(dataset, epochs=2, steps_per_epoch=3, validation_data=valid_data, validation_steps=1) 73 | # self.assertIn("loss", logs.history) 74 | # self.assertIn("accuracy", logs.history) 75 | 76 | -------------------------------------------------------------------------------- /models/sim_bert.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | # from __future__ import google_type_annotations 4 | from __future__ import print_function 5 | 6 | import tensorflow as tf 7 | from official.nlp.modeling import layers 8 | 
from official.nlp.modeling import networks 9 | 10 | 11 | class SimBert(tf.keras.Model): 12 | """ 13 | bert句子相似度模型 14 | """ 15 | 16 | def __init__(self, 17 | network, 18 | config, 19 | initializer='glorot_uniform', 20 | dropout_rate=0.1, 21 | ): 22 | self._self_setattr_tracking = False 23 | self._network = network 24 | self._config = { 25 | 'network': network, 26 | 'initializer': initializer, 27 | } 28 | self.config = config 29 | #定义两个句子的输入 30 | # 定义输入 31 | word_ids_a = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids_a') 32 | mask_a = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask_a') 33 | type_ids_a = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids_a') 34 | word_ids_b = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids_b') 35 | mask_b = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask_b') 36 | type_ids_b = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids_b') 37 | input_a = [word_ids_a, mask_a, type_ids_a] 38 | input_b = [word_ids_b, mask_b, type_ids_b] 39 | 40 | #计算encoder 41 | outputs_a = network.predict_step(input_a) 42 | outputs_b = network.predict_step(input_b) 43 | 44 | cls_output_a = outputs_a[1] 45 | query_embedding_output = tf.keras.layers.Dropout(rate=dropout_rate)(cls_output_a) 46 | 47 | cls_output_b = outputs_b[1] 48 | sim_query_embedding_output = tf.keras.layers.Dropout(rate=dropout_rate)(cls_output_b) 49 | 50 | # 余弦函数计算相似度 51 | # cos_similarity余弦相似度[batch_size, similarity] 52 | query_norm = tf.sqrt(tf.reduce_sum(tf.square(query_embedding_output), axis=-1), name='query_norm') 53 | sim_query_norm = tf.sqrt(tf.reduce_sum(tf.square(sim_query_embedding_output), axis=-1), name='sim_query_norm') 54 | 55 | dot = tf.reduce_sum(tf.multiply(query_embedding_output, sim_query_embedding_output), axis=-1) 56 | cos_similarity = tf.divide(dot, (query_norm * sim_query_norm), name='cos_similarity') 57 | self.similarity = cos_similarity 58 | 59 | # 预测为正例的概率 60 | cond = (self.similarity > self.config["neg_threshold"]) 61 | pos = tf.where(cond, tf.square(self.similarity), 1 - tf.square(self.similarity)) 62 | neg = tf.where(cond, 1 - tf.square(self.similarity), tf.square(self.similarity)) 63 | predictions = [[neg[i], pos[i]] for i in range(self.config['batch_size'])] 64 | 65 | self.logits = self.similarity 66 | outputs = dict(logits=self.logits, predictions=predictions) 67 | 68 | super(SimBert, self).__init__(inputs=[input_a, input_b], outputs=outputs) 69 | 70 | @property 71 | def checkpoint_items(self): 72 | return dict(encoder=self._network) 73 | 74 | def get_config(self): 75 | return self._config 76 | 77 | @classmethod 78 | def from_config(cls, config, custom_objects=None): 79 | return cls(**config) -------------------------------------------------------------------------------- /data_processor/embedding.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | import os 3 | 4 | from data_processor.tokenizer import tokenizer 5 | import numpy as np 6 | import h5py 7 | import logging 8 | from collections import Counter 9 | import pandas as pd 10 | from itertools import chain 11 | from gensim import corpora, models 12 | import gensim 13 | logger = logging.getLogger(__name__) 14 | 15 | class embedding(tokenizer): 16 | ''' 17 | 文本向量化 18 | ''' 19 | def __init__(self, embedding_config): 20 | self.config = embedding_config 21 | super(embedding, self).__init__(embedding_config) 22 | 23 | def 
load_word2vec_model(self): 24 | ''' 25 | 加载word2vec模型 26 | :return: 27 | ''' 28 | model_path = self.config.get('word2vec_path') 29 | if not os.path.exists(model_path): 30 | raise Exception("model_path did not exit, please check path") 31 | model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=False) 32 | return model 33 | 34 | def load_bert_base(self): 35 | ''' 36 | 加载bert_base模型 37 | ''' 38 | model_path = self.config['bert_model_path'] 39 | 40 | def get_word_vectors(self, tokens): 41 | ''' 42 | 获取词向量 43 | :param tokens: 44 | :return: 45 | ''' 46 | features = [] 47 | embedding_size = self.config['embedding_size'] 48 | word_vectors = np.zeros(embedding_size).tolist() 49 | model = self.load_word2vec_model() 50 | for word in tokens: 51 | if word in model.index2word: 52 | features.append(model.word_vec(word)) 53 | else: 54 | features.append(word_vectors) 55 | print("{} is not in vocabulary!".format(word)) 56 | # print(features) 57 | return features 58 | 59 | def save_vectors(self, vectors, name): 60 | ''' 61 | 保存向量到文件中 62 | :param vectors: 63 | :return: 64 | ''' 65 | file_path = os.path.join(self.config['output_path'], name + '.npy') 66 | np.save(file_path, vectors) 67 | 68 | @staticmethod 69 | def trans_to_tf_idf(inputs, dictionary, tf_idf_model): 70 | vocab_size = len(dictionary) 71 | input_ids = [] 72 | for question in inputs: 73 | # question_ids = [] 74 | # for question in questions: 75 | bow_vec = dictionary.doc2bow(question) 76 | tfidf_vec = tf_idf_model[bow_vec] 77 | vec = [0] * vocab_size 78 | for item in tfidf_vec: 79 | vec[item[0]] = item[1] 80 | # question_ids.append(vec) 81 | input_ids.append(vec) 82 | return input_ids 83 | 84 | @staticmethod 85 | def train_tf_idf(inputs): 86 | sentences = inputs 87 | dictionary = corpora.Dictionary(sentences) 88 | corpus = [dictionary.doc2bow(sentence) for sentence in sentences] 89 | tfidf_model = models.TfidfModel(corpus) 90 | return dictionary, tfidf_model 91 | 92 | def get_one_hot_vectors(self, tokens): 93 | ''' 94 | 获取one-hot向量 95 | :param tokens: 96 | :return: 97 | ''' 98 | raise NotImplemented 99 | 100 | def get_tf_idf_vectors(self, tokens): 101 | ''' 102 | 获取tf-idf向量 103 | :param tokens: 104 | :return: 105 | ''' 106 | raise NotImplemented 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /data_processor/base_processor.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 数据预处理的基础对象 3 | ''' 4 | import os 5 | import jieba 6 | from collections import Counter 7 | import jieba.posseg as pseg 8 | import pandas as pd 9 | 10 | class data_base(object): 11 | ''' 12 | 中文文本处理的基础组件 13 | ''' 14 | def __init__(self, data_config): 15 | self.config = data_config 16 | 17 | 18 | @staticmethod 19 | def read_data(path): 20 | ''' 21 | 读取数据集 22 | :param path: 23 | :return: text, label 24 | ''' 25 | texts = [] 26 | labels = [] 27 | with open(path, "rb", encoding='utf8') as f: 28 | for line in f.readlines(): 29 | text, label = line.strip().split(' ') 30 | texts.append(text.strip()) 31 | labels.append(label.strip()) 32 | return texts, labels 33 | 34 | 35 | @staticmethod 36 | def _read_data(path): 37 | """ 38 | 读取多标签数据 39 | :return: 返回分词后的文本内容和标签,inputs = [[]], labels = [[]] 40 | """ 41 | inputs = [] 42 | labels = [] 43 | train_data = pd.read_csv(path, error_bad_lines=False, sep='\t') 44 | print(train_data.columns) 45 | print(train_data.head(2)) 46 | inputs = train_data['text_a'].values.tolist()[:100] 47 | labels = 
train_data['label'].values.tolist()[:100] 48 | labels = [str(label) for label in labels] 49 | # inputs = [list(i) for i in inputs] 50 | 51 | return inputs, labels 52 | 53 | def get_all_words(self, tokens): 54 | ''' 55 | 对已经分词的数据直接获取所有词 56 | :param tokens: 57 | :return: 58 | ''' 59 | all_words = [] 60 | [all_words.extend(i) for i in tokens] 61 | return all_words 62 | 63 | def cut_words(self, texts): 64 | ''' 65 | 分词 66 | :param text: 67 | :return: 68 | ''' 69 | all_words = [] 70 | for text in texts: 71 | words = jieba.lcut(text) 72 | all_words.extend(words) 73 | return all_words 74 | 75 | def cut_chars(self, texts): 76 | ''' 77 | 将文本分割成字 78 | :param text: 79 | :return: 80 | ''' 81 | all_chars = [] 82 | for text in texts: 83 | chars = list(text) 84 | all_chars.extend(chars) 85 | return all_chars 86 | 87 | def word_pos_filter(self, pos_filter, text): 88 | ''' 89 | 根据词性过滤文本 90 | :param pos: ['nr'...] 91 | :param text: 92 | :return: 93 | ''' 94 | words = [] 95 | pos_text = pseg.lcut(text) 96 | for word, pos in pos_text: 97 | if pos not in pos_filter: 98 | words.append(word) 99 | return words 100 | 101 | def word_freq_filter(self, freq, all_words): 102 | ''' 103 | 词频过滤 104 | :param freq: 105 | :return: 106 | ''' 107 | print(all_words) 108 | word_count = Counter(all_words) # 统计词频 109 | sort_word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True) 110 | 111 | # 去除低频词 112 | words = [item[0] for item in sort_word_count if item[1] >= freq] 113 | return words 114 | 115 | def get_vocab(self, all_words): 116 | ''' 117 | 获取词列表 118 | :param all_words: 119 | :return: 120 | ''' 121 | word_count = Counter(all_words) # 统计词频 122 | sort_word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True) 123 | vocab = [item[0] for item in sort_word_count] 124 | 125 | return vocab 126 | 127 | def remove_stop_words(self, all_words): 128 | ''' 129 | 去除停用词 130 | :param all_words: 131 | :return: 132 | ''' 133 | stop_words = self.load_stop_words(self.config['stop_word_path']) 134 | words = [word for word in all_words if word not in stop_words] 135 | return words 136 | 137 | def load_stop_words(self, stop_word_path): 138 | ''' 139 | 加载停用词表 140 | :param stop_word_path: 141 | :return: 142 | ''' 143 | with open(stop_word_path, "r", encoding="utf8") as fr: 144 | stop_words = [line.strip() for line in fr.readlines()] 145 | return stop_words 146 | 147 | -------------------------------------------------------------------------------- /predictor/predict.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import os 3 | import json 4 | import pickle 5 | import sys 6 | sys.path.append(os.path.abspath(os.path.dirname(os.getcwd()))) 7 | 8 | import numpy as np 9 | from models.sentence_embedding import SentenceEmbedding 10 | from tasks.classifier import ClassifierTask 11 | from tasks.ner_task import NERTask 12 | from tasks.Itr_pair_task import ItrTask 13 | 14 | 15 | from predictor.predict_base import BasePredictor 16 | import pandas as pd 17 | 18 | class Predictor(BasePredictor): 19 | ''' 20 | 预测类 21 | ''' 22 | def __init__(self, config): 23 | 24 | self.config = config 25 | super(Predictor, self).__init__(config) 26 | 27 | self.word_to_index = None 28 | self.label_to_index = None 29 | self.word_vectors = None 30 | self.vocab_size = None 31 | 32 | self.load_vocab() 33 | #创建模型并加载参数 34 | self.model = self.create_model() 35 | self.model = self.load_ckpt_model(self.model, self.config['ckpt_model_path'], self.config['model_name']) 36 | 37 | 38 | def 
create_model(self): 39 | ''' 40 | 创建模型 41 | :return: 42 | ''' 43 | model = None 44 | if self.config['model_name'] == 'bert_classifier': 45 | model = ClassifierTask(self.config).build_model() 46 | 47 | if self.config['model_name'] == 'bert_ner': 48 | model = NERTask(self.config).build_model() 49 | 50 | if self.config['model_name'] == 'simbert': 51 | model = ItrTask(self.config).build_model() 52 | 53 | return model 54 | 55 | def load_vocab(self): 56 | ''' 57 | 加载词典 58 | :return: 59 | ''' 60 | # with open(os.path.join(self.config['output_path'], "word_to_index.pkl"), "rb") as f: 61 | # self.word_to_index = pickle.load(f) 62 | with open(os.path.join(self.config['output_path'], "label_to_index.pkl"), "rb") as f: 63 | self.label_to_index = pickle.load(f) 64 | 65 | # if self.config['use_word2vec']: 66 | # if os.path.exists(os.path.join(self.config['output_path'], "word_vectors.npy")): 67 | # print("load word_vectors") 68 | # self.word_vectors = np.load(os.path.join(self.config['output_path'], "word_vectors.npy"), 69 | # allow_pickle=True) 70 | 71 | def predict(self, sentence): 72 | ''' 73 | 句子预测 74 | :param sentence:list 75 | :return: 76 | ''' 77 | word_ids, segment_ids, word_mask, sequence_length = [], [], [], [] 78 | 79 | _word_ids, _segment_ids, _word_mask, _sequence_length = self.tokenizer.encode(sentence) 80 | word_ids.append(_word_ids) 81 | segment_ids.append(_segment_ids) 82 | word_mask.append(_word_mask) 83 | sequence_length.append(_sequence_length) 84 | inputs = dict( 85 | input_word_ids=word_ids, 86 | input_mask=word_mask, 87 | input_type_ids=segment_ids, 88 | ) 89 | 90 | infer_input = { 91 | "input_word_ids": tf.convert_to_tensor(inputs['input_word_ids']), 92 | "input_mask": tf.convert_to_tensor(inputs['input_mask']), 93 | "input_type_ids": tf.convert_to_tensor(inputs['input_type_ids']), 94 | } 95 | logits = self.model(infer_input, training=False) 96 | predictions = self.get_predictions(logits) 97 | label = self.tokenizer.ids_to_tokens(predictions, self.label_to_index) 98 | return label 99 | 100 | def sequence_predict(self, sentence): 101 | ''' 102 | 序列标注预测 103 | :param sentence: 104 | :return: 105 | ''' 106 | word_ids, segment_ids, word_mask, sequence_length = [], [], [], [] 107 | 108 | _word_ids, _segment_ids, _word_mask, _sequence_length = self.tokenizer.encode(sentence) 109 | word_ids.append(_word_ids) 110 | segment_ids.append(_segment_ids) 111 | word_mask.append(_word_mask) 112 | sequence_length.append(_sequence_length) 113 | inputs = dict( 114 | input_word_ids=word_ids, 115 | input_mask=word_mask, 116 | input_type_ids=segment_ids, 117 | ) 118 | 119 | infer_input = { 120 | "input_word_ids": tf.convert_to_tensor(inputs['input_word_ids']), 121 | "input_mask": tf.convert_to_tensor(inputs['input_mask']), 122 | "input_type_ids": tf.convert_to_tensor(inputs['input_type_ids']), 123 | } 124 | outputs = self.model(infer_input, training=False) 125 | # decode_results = outputs.numpy().tolist() 126 | predictions = self.get_predictions(outputs)[0][1:_sequence_length] 127 | 128 | label = self.tokenizer.ids_to_tokens(predictions, self.label_to_index) 129 | return label 130 | 131 | 132 | 133 | if __name__=='__main__': 134 | with open("../model_configs/bert_ner.json", 'r') as fr: 135 | config = json.load(fr) 136 | predictor = Predictor(config) 137 | test_data = pd.read_csv(config['test_data'], error_bad_lines=False, sep='\t') 138 | inputs = test_data['text_a'].values.tolist()[:10] 139 | labels = test_data['label'].values.tolist()[:10] 140 | labels = [str(label) for label in labels] 141 | 
predictions = [] 142 | count = 0 143 | 144 | for i,sentence in enumerate(inputs): 145 | prediction = predictor.sequence_predict(sentence) 146 | # prediction = predictor.predict(sentence) 147 | print(prediction) 148 | if prediction[0] == labels[i]: 149 | print(sentence) 150 | count += 1 151 | predictions.extend(prediction) 152 | # print(predictions) 153 | print(count/100) 154 | print(inputs[5]) 155 | -------------------------------------------------------------------------------- /tasks/classifier.py: -------------------------------------------------------------------------------- 1 | from official.nlp.bert import tokenization 2 | import tensorflow as tf 3 | from official.nlp.configs import bert 4 | from official.nlp.configs import encoders 5 | from official.nlp.data import pretrain_dataloader 6 | 7 | from official.nlp.tasks.tagging import TaggingTask 8 | from trainer.train_base import TrainBase 9 | from official.nlp.modeling.models import BertClassifier 10 | import os 11 | import json 12 | from data_processor.classifier_data_generator import ClassifierDataGenerator 13 | from official.nlp.modeling.networks import BertEncoder 14 | from official.modeling import tf_utils 15 | from official.nlp.bert import configs as bert_configs 16 | 17 | 18 | 19 | class ClassifierTask(TrainBase): 20 | ''' 21 | 基于bert的分类任务 22 | ''' 23 | def __init__(self, task_config): 24 | self.config = task_config 25 | self.loss = 'loss' 26 | super(ClassifierTask, self).__init__(task_config) 27 | self.data_generator = ClassifierDataGenerator(task_config) 28 | 29 | 30 | def build_model(self): 31 | ''' 32 | 构建模型 33 | ''' 34 | # encoder_network = encoders.build_encoder(encoders.EncoderConfig( 35 | # bert=encoders.BertEncoderConfig(vocab_size=21128))) 36 | encoder_network = self.build_encoder() 37 | 38 | 39 | 40 | model = BertClassifier(network=encoder_network, 41 | num_classes=self.config['num_classes']) 42 | # ckpt = tf.train.Checkpoint(models=models) 43 | 44 | # init_checkpoint = self.config['bert_model_path'] 45 | 46 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 47 | 48 | # models.load_weights(init_checkpoint).assert_existing_objects_matched() 49 | return model 50 | 51 | def build_encoder(self): 52 | bert_config = bert_configs.BertConfig.from_json_file(self.config['bert_config_path']) 53 | cfg = bert_config 54 | bert_encoder = BertEncoder( 55 | vocab_size=cfg.vocab_size, 56 | hidden_size=cfg.hidden_size, 57 | num_layers=cfg.num_hidden_layers, 58 | num_attention_heads=cfg.num_attention_heads, 59 | intermediate_size=cfg.intermediate_size, 60 | activation=tf_utils.get_activation(cfg.hidden_act), 61 | dropout_rate=cfg.hidden_dropout_prob, 62 | attention_dropout_rate=cfg.attention_probs_dropout_prob, 63 | max_sequence_length=cfg.max_position_embeddings, 64 | type_vocab_size=cfg.type_vocab_size, 65 | initializer=tf.keras.initializers.TruncatedNormal( 66 | stddev=cfg.initializer_range), 67 | embedding_width=cfg.embedding_size, 68 | return_all_encoder_outputs=True) 69 | # ckpt = tf.train.Checkpoint(model=bert_encoder) 70 | # init_checkpoint = self.config['bert_model_path'] 71 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 72 | # bert_encoder.load_weights(init_checkpoint) 73 | return bert_encoder 74 | 75 | def build_losses(self, labels, model_outputs, metrics, aux_losses=None) -> tf.Tensor: 76 | ''' 77 | 构建损失 78 | ''' 79 | if self.config['num_classes'] > 1: 80 | losses = tf.keras.losses.sparse_categorical_crossentropy(labels, 81 | tf.cast(model_outputs, tf.float32), 82 | from_logits=True) 83 
| else: 84 | losses = tf.keras.losses.categorical_crossentropy(labels, 85 | tf.cast(model_outputs, tf.float32), 86 | from_logits=True 87 | ) 88 | # metrics['losses'].update_state(losses) 89 | loss = tf.reduce_mean(losses) 90 | 91 | return loss 92 | 93 | def build_inputs(self, inputs): 94 | ''' 95 | 构建输入 96 | ''' 97 | train_input = { 98 | "input_word_ids": tf.convert_to_tensor(inputs['input_word_ids']), 99 | "input_mask": tf.convert_to_tensor(inputs['input_mask']), 100 | "input_type_ids": tf.convert_to_tensor(inputs['input_type_ids']), 101 | "labels": inputs['input_target_ids'] 102 | } 103 | return train_input 104 | 105 | def build_metrics(self, training=None): 106 | ''' 107 | 构建评价指标 108 | :param training: 109 | :return: 110 | ''' 111 | # del training 112 | metrics = [ 113 | tf.keras.metrics.SparseCategoricalAccuracy(name='classifier_metrics') 114 | ] 115 | 116 | # metrics = dict([(metric.name, metric) for metric in metrics]) 117 | 118 | return metrics 119 | 120 | def check_exist_model(self, model): 121 | ''' 122 | 检查是否存在模型文件 123 | :return: 124 | ''' 125 | # ckpt = tf.train.Checkpoint(models=models) 126 | init_checkpoint = os.path.join(self.config['ckpt_model_path'], self.config['model_name']) 127 | 128 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 129 | model.load_weights(init_checkpoint).assert_existing_objects_matched() 130 | 131 | 132 | if __name__=='__main__': 133 | with open("../model_configs/classifier.json", 'r') as fr: 134 | config = json.load(fr) 135 | print(config) 136 | classifier = ClassifierTask(config) 137 | 138 | model = classifier.build_model() 139 | bert_encoder = classifier.build_encoder() 140 | ckpt = tf.train.Checkpoint(model=bert_encoder) 141 | init_checkpoint = config['bert_model_path'] 142 | ckpt.restore(init_checkpoint).assert_existing_objects_matched() 143 | # config = models.get_config() 144 | classifier.train(model) 145 | 146 | 147 | -------------------------------------------------------------------------------- /tasks/embedding_task.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow_addons.layers import crf 3 | 4 | import json 5 | import os 6 | from trainer.train_base import TrainBase 7 | from data_processor.classifier_data_generator import ClassifierDataGenerator 8 | from data_processor.ner_data_generator import NERDataGenerator 9 | 10 | # from official.nlp.keras_nlp.encoders.bert_encoder import BertEncoder 11 | from official.nlp.modeling.networks import BertEncoder 12 | from official.modeling import tf_utils 13 | from official.nlp.bert import configs as bert_configs 14 | from data_processor.tokenizer import tokenizer 15 | from official.nlp.configs import encoders 16 | import dataclasses 17 | from official.modeling.hyperparams import base_config 18 | from official.core import base_task 19 | from official.core import config_definitions as cfg 20 | from official.core import task_factory 21 | from typing import List, Optional, Tuple 22 | 23 | from models.sentence_embedding import SentenceEmbedding 24 | import requests 25 | import numpy as np 26 | 27 | @dataclasses.dataclass 28 | class ModelConfig(base_config.Config): 29 | """A base span labeler configuration.""" 30 | encoder: encoders.EncoderConfig = encoders.EncoderConfig() 31 | head_dropout: float = 0.1 32 | head_initializer_range: float = 0.02 33 | 34 | 35 | @dataclasses.dataclass 36 | class embeddingConfig(cfg.TaskConfig): 37 | """The models config.""" 38 | # At most one of `init_checkpoint` and 
`hub_module_url` can be specified. 39 | init_checkpoint: str = '' 40 | hub_module_url: str = '' 41 | model: ModelConfig = ModelConfig() 42 | 43 | # The real class names, the order of which should match real label id. 44 | # Note that a word may be tokenized into multiple word_pieces tokens, and 45 | # we asssume the real label id (non-negative) is assigned to the first token 46 | # of the word, and a negative label id is assigned to the remaining tokens. 47 | # The negative label id will not contribute to loss and metrics. 48 | class_names: Optional[List[str]] = None 49 | train_data: cfg.DataConfig = cfg.DataConfig() 50 | validation_data: cfg.DataConfig = cfg.DataConfig() 51 | 52 | class EmbeddingTask(object): 53 | ''' 54 | 抽取句子向量任务 55 | ''' 56 | def __init__(self, config): 57 | self.config = config 58 | 59 | def build_model(self): 60 | ''' 61 | 构建模型 62 | ''' 63 | # encoder_network = encoders.build_encoder(encoders.EncoderConfig(bert=encoders.BertEncoderConfig(vocab_size=21128, 64 | # num_layers=1))) 65 | bert_config = bert_configs.BertConfig.from_json_file(self.config['bert_config_path']) 66 | cfg = bert_config 67 | bert_encoder = BertEncoder( 68 | vocab_size=cfg.vocab_size, 69 | hidden_size=cfg.hidden_size, 70 | num_layers=cfg.num_hidden_layers, 71 | num_attention_heads=cfg.num_attention_heads, 72 | intermediate_size=cfg.intermediate_size, 73 | activation=tf_utils.get_activation(cfg.hidden_act), 74 | dropout_rate=cfg.hidden_dropout_prob, 75 | attention_dropout_rate=cfg.attention_probs_dropout_prob, 76 | max_sequence_length=cfg.max_position_embeddings, 77 | type_vocab_size=cfg.type_vocab_size, 78 | initializer=tf.keras.initializers.TruncatedNormal( 79 | stddev=cfg.initializer_range), 80 | embedding_width=cfg.embedding_size, 81 | return_all_encoder_outputs=True) 82 | model = SentenceEmbedding(bert_encoder, self.config) 83 | ckpt = tf.train.Checkpoint(model=bert_encoder) 84 | init_checkpoint = self.config['bert_model_path'] 85 | 86 | ckpt.restore(init_checkpoint).assert_existing_objects_matched() 87 | 88 | return model 89 | 90 | def build_inputs(self, text): 91 | ''' 92 | 构建输入 93 | ''' 94 | tokenize = tokenizer(self.config) 95 | 96 | batch_token_ids, batch_segment_ids, batch_mask, batch_seq_len = [], [], [], [] 97 | word_ids, segment_ids, word_mask, seq_len = tokenize.encode(text) 98 | word_ids = np.array(word_ids, dtype="float32").tolist() 99 | segment_ids = np.array(segment_ids, dtype="float32").tolist() 100 | word_mask = np.array(word_mask, dtype="float32").tolist() 101 | batch_token_ids.append(word_ids) 102 | batch_segment_ids.append(segment_ids) 103 | batch_mask.append(word_mask) 104 | batch_seq_len.append(seq_len) 105 | inputs = dict( 106 | input_word_ids=word_ids, 107 | input_mask=word_mask, 108 | input_type_ids=segment_ids, 109 | ) 110 | 111 | infer_input = { 112 | "input_word_ids": tf.convert_to_tensor(inputs['input_word_ids']), 113 | "input_mask": tf.convert_to_tensor(inputs['input_mask']), 114 | "input_type_ids": tf.convert_to_tensor(inputs['input_type_ids']), 115 | } 116 | 117 | return inputs, infer_input, tf.reshape(tf.convert_to_tensor(batch_seq_len), shape=(-1,)) 118 | 119 | 120 | def inference_one(self, text): 121 | ''' 122 | 推理一条数据 123 | ''' 124 | inputs, infer_inputs, seq_len = self.build_inputs(text) 125 | # model = self.build_model() 126 | # outputs = model(infer_inputs) 127 | data = json.dumps({"signature_name": "serving_default", "inputs":inputs['input_word_ids'], 128 | }) 129 | headers = {"content-type": "application/json"} 130 | json_response = 
requests.post('http://localhost:8501/v1/models/my_model:predict', 131 | data=data, headers=headers) 132 | outputs = json.loads(json_response.text) 133 | print(outputs) 134 | return outputs 135 | 136 | if __name__=='__main__': 137 | with open("../model_configs/sentence_embedding.json", 'r') as fr: 138 | config = json.load(fr) 139 | print(config) 140 | embedding = EmbeddingTask(config) 141 | text = '你好' 142 | result = embedding.inference_one(text) 143 | print(result) 144 | -------------------------------------------------------------------------------- /data_processor/classifier_data_generator.py: -------------------------------------------------------------------------------- 1 | from data_processor.embedding import embedding 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | import os 6 | 7 | 8 | class ClassifierDataGenerator(embedding): 9 | ''' 10 | 生成训练数据 11 | ''' 12 | def __init__(self, config): 13 | super(ClassifierDataGenerator, self).__init__(config) 14 | self.config = config 15 | self.batch_size = config['batch_size'] 16 | self.load_data() 17 | self.train_data, self.train_label, self.eval_data, self.eval_label = self.train_eval_split(self.word_ids, 18 | self.segment_ids, 19 | self.word_mask, 20 | self.sequence_length, 21 | self.labels_idx, 0.2) 22 | 23 | def load_data(self): 24 | ''' 25 | 加载预处理好的数据 26 | :return: 27 | ''' 28 | 29 | if os.path.exists(os.path.join(self.config['output_path'], "train_tokens.pkl")) and \ 30 | os.path.exists(os.path.join(self.config['output_path'], "label_to_index.pkl")): 31 | print("load existed train data") 32 | # with open(os.path.join(self.config['output_path'], "word_to_index.pkl"), "rb") as f: 33 | # self.word_to_index = pickle.load(f) 34 | with open(os.path.join(self.config['output_path'], "label_to_index.pkl"), "rb") as f: 35 | self.label_to_index = pickle.load(f) 36 | with open(os.path.join(self.config['output_path'], "train_tokens.pkl"), "rb") as f: 37 | train_data = pickle.load(f) 38 | 39 | if os.path.exists(os.path.join(self.config['output_path'], "word_vectors.npy")): 40 | print("load word_vectors") 41 | self.word_vectors = np.load(os.path.join(self.config['output_path'], "word_vectors.npy"), 42 | allow_pickle=True) 43 | 44 | self.word_ids, self.segment_ids, self.word_mask, self.sequence_length, self.labels_idx = np.array(train_data["word_ids"]), \ 45 | np.array(train_data["segment_ids"]),\ 46 | np.array(train_data["word_mask"]),\ 47 | np.array(train_data["sequence_length"]),\ 48 | np.array(train_data["labels_idx"]) 49 | 50 | # self.vocab = self.word_to_index.keys() 51 | # self.vocab_size = len(self.vocab) 52 | else: 53 | # 1,读取原始数据 54 | inputs, labels = self._read_data(self.config['data_path']) 55 | print("read finished") 56 | 57 | # 选择分词方式 58 | # if self.config['embedding_type'] == 'char': 59 | # all_words = self.cut_chars(inputs) 60 | # else: 61 | # all_words = self.cut_words(inputs) 62 | # word_to_index = self.word_to_index(all_words) 63 | label_to_index = self.label_to_index(labels) 64 | 65 | word_ids, segment_ids, word_mask, sequence_length, label_ids = self.save_input_tokens(inputs, labels, label_to_index) 66 | print('text to tokens process finished') 67 | 68 | # # 2,得到去除低频词和停用词的词汇表 69 | # word_to_index, all_words = self.word_to_index(inputs) 70 | # print("word process finished") 71 | # 72 | # # 3,得到词汇表 73 | # label_to_index = self.label_to_index(labels) 74 | # print("vocab process finished") 75 | # 76 | # # 4,输入转索引 77 | # inputs_idx = [self.tokens_to_ids(text, word_to_index) for text in all_words] 78 | # print("index 
transform finished") 79 | # 80 | # # 5,对输入做padding 81 | # inputs_idx = self.padding(inputs_idx) 82 | # print("padding finished") 83 | # 84 | # # 6,标签转索引 85 | # labels_idx = self.tokens_to_ids(labels, label_to_index) 86 | # print("label index transform finished") 87 | 88 | # 7, 加载词向量 89 | # if self.config['word2vec_path']: 90 | # word_vectors = self.get_word_vectors(self.vocab) 91 | # self.word_vectors = word_vectors 92 | # 将本项目的词向量保存起来 93 | # self.save_vectors(self.word_vectors, 'word_vectors') 94 | 95 | # train_data = dict(inputs_idx=inputs_idx, labels_idx=labels_idx) 96 | # with open(os.path.join(self.config['output_path'], "train_data.pkl"), "wb") as fw: 97 | # pickle.dump(train_data, fw) 98 | # labels_idx = labels 99 | self.word_ids, self.segment_ids, self.word_mask, self.sequence_length, self.labels_idx = word_ids, segment_ids, word_mask, sequence_length, label_ids 100 | 101 | 102 | def train_eval_split(self, word_ids, segment_ids, word_mask, sequence_length, labels, rate): 103 | ''' 104 | 划分训练和验证集 105 | :param data: 106 | :param labels: 107 | :param rate: 108 | :return: 109 | ''' 110 | # np.random.shuffle(data) 111 | perm = int(len(word_ids) * rate) 112 | train_data = (word_ids[perm:], segment_ids[perm:], word_mask[perm:], sequence_length[perm:]) 113 | eval_data = (word_ids[:perm], segment_ids[:perm], word_mask[:perm], sequence_length[:perm]) 114 | train_label = labels[perm:] 115 | eval_label = labels[:perm] 116 | return train_data, train_label, eval_data, eval_label 117 | 118 | 119 | def gen_data(self, input_idx, labels_idx): 120 | ''' 121 | 生成批次数据 122 | :return: 123 | ''' 124 | word_ids, segment_ids, word_mask, sequence_length = input_idx[0], input_idx[1], input_idx[2], input_idx[3] 125 | batch_word_ids, batch_segment_ids, batch_word_mask, batch_sequence_length, batch_output_ids = [], [], [], [], [] 126 | 127 | for i in range(len(word_ids)): 128 | word_id = word_ids[i] 129 | segment_id = segment_ids[i] 130 | mask = word_mask[i] 131 | seq_len = sequence_length[i] 132 | target_ids = labels_idx[i] 133 | batch_word_ids.append(word_id) 134 | batch_segment_ids.append(segment_id) 135 | batch_word_mask.append(mask) 136 | batch_sequence_length.append(seq_len) 137 | batch_output_ids.extend(target_ids) 138 | 139 | if len(batch_word_ids) == self.batch_size: 140 | yield dict( 141 | input_word_ids=np.array(batch_word_ids, dtype="int64"), 142 | input_mask=np.array(batch_word_mask, dtype="int64"), 143 | input_type_ids=np.array(batch_segment_ids, dtype="int64"), 144 | sequence_length=np.array(batch_sequence_length, dtype="int64"), 145 | input_target_ids=np.array(batch_output_ids, dtype="float32") 146 | ) 147 | batch_word_ids, batch_segment_ids, batch_word_mask, batch_sequence_length, batch_output_ids = [], [], [], [], [] 148 | 149 | -------------------------------------------------------------------------------- /models/knowledge_distiilation.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from official.nlp.modeling import layers 3 | from official.nlp.modeling import networks 4 | 5 | 6 | class Distill_model(tf.keras.Model): 7 | ''' 8 | 使用dssm进行知识蒸馏 9 | ''' 10 | def __init__(self, 11 | config, 12 | teacher_network, 13 | vocab_size, 14 | word_vectors, 15 | **kwargs): 16 | self.config = config 17 | self.vocab_size = vocab_size 18 | self.word_vectors = word_vectors 19 | #冻结teacher network的参数 20 | for layer in teacher_network.layers: 21 | layer.trainable = False 22 | #定义学生模型输入 23 | query = tf.keras.layers.Input(shape=(None,), 
dtype=tf.int64, name='input_x_ids') 24 | sim_query = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='input_y_ids') 25 | #定义老师模型输入 26 | word_ids_a = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids_a') 27 | mask_a = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask_a') 28 | type_ids_a = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids_a') 29 | word_ids_b = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids_b') 30 | mask_b = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask_b') 31 | type_ids_b = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids_b') 32 | input_a = [word_ids_a, mask_a, type_ids_a] 33 | input_b = [word_ids_b, mask_b, type_ids_b] 34 | teacher_input = [input_a, input_b] 35 | 36 | #teacher_softlabel 37 | teacher_output = teacher_network(teacher_input) 38 | 39 | teacher_soft_label = softmax_t(self.config['t'], teacher_output['logits']) 40 | 41 | # embedding层 42 | # 利用词嵌入矩阵将输入数据转成词向量,shape=[batch_size, seq_len, embedding_size] 43 | class GatherLayer(tf.keras.layers.Layer): 44 | def __init__(self, config, vocab_size, word_vectors): 45 | super(GatherLayer, self).__init__() 46 | self.config = config 47 | 48 | self.vocab_size = vocab_size 49 | self.word_vectors = word_vectors 50 | 51 | def build(self, input_shape): 52 | with tf.name_scope('embedding'): 53 | if not self.config['use_word2vec']: 54 | self.embedding_w = tf.Variable(tf.keras.initializers.glorot_normal()( 55 | shape=[self.vocab_size, self.config['embedding_size']], 56 | dtype=tf.float32), trainable=True, name='embedding_w') 57 | else: 58 | self.embedding_w = tf.Variable(tf.cast(self.word_vectors, tf.float32), trainable=True, 59 | name='embedding_w') 60 | self.built = True 61 | 62 | def call(self, inputs, **kwargs): 63 | return tf.gather(self.embedding_w, inputs, name='embedded_words') 64 | 65 | def get_config(self): 66 | config = super(GatherLayer, self).get_config() 67 | 68 | return config 69 | 70 | 71 | shared_net = tf.keras.Sequential([GatherLayer(config, vocab_size, word_vectors), 72 | shared_lstm_layer(config)]) 73 | 74 | query_embedding_output = shared_net.predict_step(query) 75 | sim_query_embedding_output = shared_net.predict_step(sim_query) 76 | 77 | 78 | # 余弦函数计算相似度 79 | # cos_similarity余弦相似度[batch_size, similarity] 80 | query_norm = tf.sqrt(tf.reduce_sum(tf.square(query_embedding_output), axis=-1), name='query_norm') 81 | sim_query_norm = tf.sqrt(tf.reduce_sum(tf.square(sim_query_embedding_output), axis=-1), name='sim_query_norm') 82 | 83 | dot = tf.reduce_sum(tf.multiply(query_embedding_output, sim_query_embedding_output), axis=-1) 84 | cos_similarity = tf.divide(dot, (query_norm * sim_query_norm), name='cos_similarity') 85 | self.similarity = cos_similarity 86 | 87 | # 预测为正例的概率 88 | cond = (self.similarity > self.config["neg_threshold"]) 89 | pos = tf.where(cond, tf.square(self.similarity), 1 - tf.square(self.similarity)) 90 | neg = tf.where(cond, 1 - tf.square(self.similarity), tf.square(self.similarity)) 91 | predictions = [[neg[i], pos[i]] for i in range(self.config['batch_size'])] 92 | 93 | self.logits = self.similarity 94 | student_soft_label = softmax_t(self.config['t'], self.logits) 95 | student_hard_label = self.logits 96 | if self.config['is_training']: 97 | #训练时候蒸馏 98 | outputs = dict(student_soft_label=student_soft_label, student_hard_label=student_hard_label, teacher_soft_label=teacher_soft_label, predictions=predictions) 99 | super(Distill_model,
self).__init__(inputs=[query, sim_query, teacher_input], outputs=outputs, **kwargs) 100 | else: 101 | #预测时候只加载学生模型 102 | outputs = dict(predictions=predictions) 103 | super(Distill_model, self).__init__(inputs=[query, sim_query], outputs=outputs, **kwargs) 104 | 105 | 106 | 107 | def softmax_t(t, logits): 108 | ''' 109 | 带参数t的softmax 110 | ''' 111 | _sum = tf.reduce_sum(tf.exp(logits/t)) 112 | return tf.exp(logits/t) / _sum 113 | 114 | class shared_lstm_layer(tf.keras.layers.Layer): 115 | ''' 116 | 共享lstm层参数 117 | ''' 118 | def __init__(self, config): 119 | self.config = config 120 | super(shared_lstm_layer, self).__init__() 121 | 122 | def build(self, input_shape): 123 | forward_layer_1 = tf.keras.layers.LSTM(self.config['hidden_size'], dropout=self.config['dropout_rate'], 124 | return_sequences=True) 125 | backward_layer_1 = tf.keras.layers.LSTM(self.config['hidden_size'], dropout=self.config['dropout_rate'], 126 | return_sequences=True, go_backwards=True) 127 | forward_layer_2 = tf.keras.layers.LSTM(self.config['hidden_size'], dropout=self.config['dropout_rate'], 128 | return_sequences=True) 129 | backward_layer_2 = tf.keras.layers.LSTM(self.config['hidden_size'], dropout=self.config['dropout_rate'], 130 | return_sequences=True, go_backwards=True) 131 | self.bilstm_1 = tf.keras.layers.Bidirectional(forward_layer_1, backward_layer=backward_layer_1) 132 | self.bilstm_2 = tf.keras.layers.Bidirectional(forward_layer_2, backward_layer=backward_layer_2) 133 | self.layer_dropout = tf.keras.layers.Dropout(0.4) 134 | self.output_dense = tf.keras.layers.Dense(self.config['output_size']) 135 | 136 | super(shared_lstm_layer, self).build(input_shape) 137 | 138 | def get_config(self): 139 | config = {} 140 | return config 141 | 142 | def call(self, inputs, **kwargs): 143 | query_res_1 = self.bilstm_1(inputs) 144 | query_res_1 = self.layer_dropout(query_res_1) 145 | query_res_2 = self.bilstm_2(query_res_1) 146 | 147 | #取时间步的平均值,摊平[batch_size, forward_size+backward_size] 148 | avg_query_embedding = tf.reduce_mean(query_res_2, axis=1) 149 | tmp_query_embedding = tf.reshape(avg_query_embedding, [self.config['batch_size'], self.config['hidden_size']*2]) 150 | # 全连接层[batch_size, dense_dim] 151 | query_embedding_output = self.output_dense(tmp_query_embedding) 152 | query_embedding_output = tf.keras.activations.relu(query_embedding_output) 153 | return query_embedding_output -------------------------------------------------------------------------------- /data_processor/ner_data_generator.py: -------------------------------------------------------------------------------- 1 | from data_processor.embedding import embedding 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | import os 6 | 7 | 8 | class NERDataGenerator(embedding): 9 | ''' 10 | 生成训练数据 11 | ''' 12 | def __init__(self, config): 13 | super(NERDataGenerator, self).__init__(config) 14 | self.config = config 15 | self.batch_size = config['batch_size'] 16 | self.load_data() 17 | self.train_data, self.train_label, self.eval_data, self.eval_label = self.train_eval_split(self.word_ids, 18 | self.segment_ids, 19 | self.word_mask, 20 | self.sequence_length, 21 | self.labels_idx, 0.2) 22 | 23 | def read_data(self, path): 24 | inputs = [] 25 | labels = [] 26 | with open(os.path.join(path, 'source_BIO_2014_cropus.txt'), 'r', encoding='utf-8') as fr: 27 | for line in fr.readlines(): 28 | inputs.append(line.split(sep=' ')) 29 | with open(os.path.join(path, 'target_BIO_2014_cropus.txt'), 'r', encoding='utf-8') as fr: 30 | for line in 
fr.readlines(): 31 | labels.append(line.split(sep=' ')) 32 | return inputs[:100], labels[:100] 33 | 34 | def get_labels(self): 35 | return ['O', 'B_LOC', 'I_LOC', 'B_PER', 'I_PER', 'B_ORG', 'I_ORG', 'B_T', 'I_T'] 36 | 37 | def save_input_tokens(self, texts, labels, label_to_index): 38 | ''' 39 | 保存处理完成的输入tokens,方便后续加载 40 | :param texts: 41 | :return: 42 | ''' 43 | word_ids, segment_ids, word_mask, sequence_length = [], [], [], [] 44 | label_ids = [] 45 | for i, text in enumerate(texts): 46 | _word_ids, _segment_ids, _word_mask, _sequence_length = self.encode(text) 47 | word_ids.append(_word_ids) 48 | segment_ids.append(_segment_ids) 49 | word_mask.append(_word_mask) 50 | sequence_length.append(_sequence_length) 51 | label_id = self.seq_labels_to_ids(labels[i], label_to_index) 52 | label_ids.append(label_id) 53 | input_tokens = dict(word_ids=word_ids, segment_ids=segment_ids, word_mask=word_mask, 54 | sequence_length=sequence_length, labels_idx=label_ids) 55 | if not os.path.exists(self.config['output_path']): 56 | os.mkdir(self.config['output_path']) 57 | # 保存准备训练的tokens数据 58 | with open(os.path.join(self.config['output_path'], 'train_tokens.pkl'), "wb") as fw: 59 | pickle.dump(input_tokens, fw) 60 | # 保存预处理的word_to_index数据 61 | # with open(os.path.join(self.config['output_path'], 'word_to_index.pkl'), "wb") as fw: 62 | # pickle.dump(word_to_index, fw) 63 | # 保存预处理的word_to_index数据 64 | with open(os.path.join(self.config['output_path'], 'label_to_index.pkl'), "wb") as fw: 65 | pickle.dump(label_to_index, fw) 66 | return word_ids, segment_ids, word_mask, sequence_length, label_ids 67 | 68 | def load_data(self): 69 | ''' 70 | 加载预处理好的数据 71 | :return: 72 | ''' 73 | 74 | if os.path.exists(os.path.join(self.config['output_path'], "train_tokens.pkl")) and \ 75 | os.path.exists(os.path.join(self.config['output_path'], "label_to_index.pkl")): 76 | print("load existed train data") 77 | # with open(os.path.join(self.config['output_path'], "word_to_index.pkl"), "rb") as f: 78 | # self.word_to_index = pickle.load(f) 79 | with open(os.path.join(self.config['output_path'], "label_to_index.pkl"), "rb") as f: 80 | self.label_to_index = pickle.load(f) 81 | with open(os.path.join(self.config['output_path'], "train_tokens.pkl"), "rb") as f: 82 | train_data = pickle.load(f) 83 | 84 | if os.path.exists(os.path.join(self.config['output_path'], "word_vectors.npy")): 85 | print("load word_vectors") 86 | self.word_vectors = np.load(os.path.join(self.config['output_path'], "word_vectors.npy"), 87 | allow_pickle=True) 88 | 89 | self.word_ids, self.segment_ids, self.word_mask, self.sequence_length, self.labels_idx = np.array(train_data["word_ids"]), \ 90 | np.array(train_data["segment_ids"]), \ 91 | np.array(train_data["word_mask"]), \ 92 | np.array(train_data["sequence_length"]), \ 93 | np.array(train_data["labels_idx"]) 94 | 95 | # self.vocab = self.word_to_index.keys() 96 | # self.vocab_size = len(self.vocab) 97 | else: 98 | # 1,读取原始数据 99 | inputs, labels = self.read_data(self.config['data_path']) 100 | print("read finished") 101 | targets = self.get_labels() 102 | label_to_index = self.label_to_index(targets) 103 | 104 | word_ids, segment_ids, word_mask, sequence_length, label_ids = self.save_input_tokens(inputs, labels, 105 | label_to_index) 106 | print('text to tokens process finished') 107 | 108 | 109 | self.word_ids, self.segment_ids, self.word_mask, self.sequence_length, self.labels_idx = word_ids, segment_ids, word_mask, sequence_length, label_ids 110 | 111 | def train_eval_split(self, word_ids, segment_ids, 
word_mask, sequence_length, labels, rate): 112 | ''' 113 | 划分训练和验证集 114 | :param data: 115 | :param labels: 116 | :param rate: 117 | :return: 118 | ''' 119 | # np.random.shuffle(data) 120 | perm = int(len(word_ids) * rate) 121 | train_data = (word_ids[perm:], segment_ids[perm:], word_mask[perm:], sequence_length[perm:]) 122 | eval_data = (word_ids[:perm], segment_ids[:perm], word_mask[:perm], sequence_length[:perm]) 123 | train_label = labels[perm:] 124 | eval_label = labels[:perm] 125 | return train_data, train_label, eval_data, eval_label 126 | 127 | 128 | def gen_data(self, input_idx, labels_idx): 129 | ''' 130 | 生成批次数据 131 | :return: 132 | ''' 133 | word_ids, segment_ids, word_mask, sequence_length = input_idx[0], input_idx[1], input_idx[2], input_idx[3] 134 | batch_word_ids, batch_segment_ids, batch_word_mask, batch_sequence_length, batch_output_ids = [], [], [], [], [] 135 | 136 | for i in range(len(word_ids)): 137 | word_id = word_ids[i] 138 | segment_id = segment_ids[i] 139 | mask = word_mask[i] 140 | seq_len = sequence_length[i] 141 | target_ids = labels_idx[i] 142 | batch_word_ids.append(word_id) 143 | batch_segment_ids.append(segment_id) 144 | batch_word_mask.append(mask) 145 | batch_sequence_length.append(seq_len) 146 | batch_output_ids.append(target_ids) 147 | 148 | if len(batch_word_ids) == self.batch_size: 149 | yield dict( 150 | input_word_ids=np.array(batch_word_ids, dtype="int64"), 151 | input_mask=np.array(batch_word_mask, dtype="int64"), 152 | input_type_ids=np.array(batch_segment_ids, dtype="int64"), 153 | sequence_length=np.array(batch_sequence_length, dtype="int64"), 154 | input_target_ids=np.array(batch_output_ids, dtype="float32") 155 | ) 156 | batch_word_ids, batch_segment_ids, batch_word_mask, batch_sequence_length, batch_output_ids = [], [], [], [], [] 157 | 158 | -------------------------------------------------------------------------------- /data_processor/text_match_data_generator_v2.py: -------------------------------------------------------------------------------- 1 | from data_processor.embedding import embedding 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | import os 6 | from random import shuffle 7 | import random 8 | import copy 9 | from itertools import chain 10 | 11 | class TextMatchDataGeneratorV2(embedding): 12 | ''' 13 | 生成训练数据 14 | ''' 15 | def __init__(self, config): 16 | super(TextMatchDataGeneratorV2, self).__init__(config) 17 | self.config = config 18 | self.batch_size = config['batch_size'] 19 | self.load_data() 20 | self.train_data, self.train_label, self.eval_data, self.eval_label = self.train_eval_split(self.word_idx, self.segment_idx, self.word_mask, self.sequence_length,self.labels_idx, 0.2) 21 | 22 | def read_data(self, file_path): 23 | ''' 24 | 加载训练数据 25 | ''' 26 | # df = pd.read_csv(file_path) 27 | # query = [jieba.lcut(i) for i in df['sentence1'].values[0:data_size]] 28 | # sim = [jieba.lcut(i) for i in df['sentence2'].values[0:data_size]] 29 | # query = [list(i) for i in df['sentence1'].values] 30 | # sim = [list(i) for i in df['sentence2'].values] 31 | # import pandas as pd 32 | work_data = pd.read_excel(file_path) 33 | std_query_list = work_data['standard_questions'].tolist() 34 | sim_query_list = work_data['sim_questions'].tolist() 35 | # std_answer_list = work_data['standard_answers'].tolist() 36 | sim = [] 37 | 38 | for i in range(len(std_query_list)): 39 | _sim = sim_query_list[i].split('||') 40 | sim.append(_sim) 41 | 42 | 43 | return std_query_list, sim 44 | 45 | def negative_sampling(self, queries, 
sim): 46 | ''' 47 | 随机负采样 48 | ''' 49 | new_queries = [] 50 | labels = [] 51 | for i, item in enumerate(queries): 52 | copy_questions = copy.copy(queries) 53 | copy_questions.remove(item) 54 | neg_samples = random.sample(copy_questions, 5) 55 | pos_samples = random.sample(sim[i], 2) 56 | new_queries.append([item] + pos_samples + neg_samples) 57 | labels.append([1]*2 + [0]*5) 58 | return new_queries, labels 59 | 60 | def save_ranking_tokens(self, queries, sim): 61 | ''' 62 | 保存处理完成的输入tokens,方便后续加载 63 | :param texts: 64 | :return: 65 | ''' 66 | 67 | word_ids, segment_ids, word_mask, sequence_length = [], [], [], [] 68 | word_ids_list, segment_ids_list, word_mask_list, sequence_length_list = [], [], [], [] 69 | new_queries, label_ids = self.negative_sampling(queries, sim) 70 | 71 | for j, questions in enumerate(new_queries): 72 | for i, query in enumerate(questions[1:]): 73 | 74 | _word_ids, _segment_ids, _word_mask, _sequence_length = self.encode_v2(query[0], query) 75 | 76 | word_ids.append(_word_ids) 77 | segment_ids.append(_segment_ids) 78 | word_mask.append(_word_mask) 79 | sequence_length.append(_sequence_length) 80 | 81 | word_ids_list.append(word_ids) 82 | segment_ids_list.append(segment_ids) 83 | word_mask_list.append(word_mask) 84 | sequence_length_list.append(sequence_length) 85 | word_ids, segment_ids, word_mask, sequence_length = [], [], [], [] 86 | 87 | 88 | # label_id = self.labels_to_ids([labels[i]], label_to_index) 89 | # label_ids_list.append(label_ids) 90 | input_tokens = dict(word_ids=word_ids_list, query_segment_ids=segment_ids_list, query_word_mask=word_mask_list, 91 | sequence_length=sequence_length_list,labels_idx=label_ids) 92 | if not os.path.exists(self.config['output_path']): 93 | os.mkdir(self.config['output_path']) 94 | #保存准备训练的tokens数据 95 | with open(os.path.join(self.config['output_path'], 'train_tokens.pkl'), "wb") as fw: 96 | pickle.dump(input_tokens, fw) 97 | # 保存预处理的label_to_index数据 98 | # with open(os.path.join(self.config['output_path'], 'label_to_index.pkl'), "wb") as fw: 99 | # pickle.dump(label_to_index, fw) 100 | return word_ids_list, segment_ids_list, word_mask_list, sequence_length_list, label_ids 101 | 102 | def load_data(self): 103 | ''' 104 | 加载预处理好的数据 105 | :return: 106 | ''' 107 | 108 | if os.path.exists(os.path.join(self.config['output_path'], "train_tokens.pkl")) or \ 109 | os.path.exists(os.path.join(self.config['output_path'], "label_to_index.pkl")): 110 | print("load existed train data") 111 | # with open(os.path.join(self.config['output_path'], "word_to_index.pkl"), "rb") as f: 112 | # self.word_to_index = pickle.load(f) 113 | # with open(os.path.join(self.config['output_path'], "label_to_index.pkl"), "rb") as f: 114 | # self.label_to_index = pickle.load(f) 115 | with open(os.path.join(self.config['output_path'], "train_tokens.pkl"), "rb") as f: 116 | train_data = pickle.load(f) 117 | 118 | self.word_idx, self.segment_idx, self.word_mask, self.sequence_length, \ 119 | self.labels_idx = np.array(train_data["word_ids"]), \ 120 | np.array(train_data["query_segment_ids"]), \ 121 | np.array(train_data["query_word_mask"]), \ 122 | np.array(train_data["sequence_length"]), \ 123 | np.array(train_data["labels_idx"]) 124 | else: 125 | # 1,读取原始数据 126 | query, sim = self.read_data(self.config['data_path']) 127 | print("read finished") 128 | 129 | # label_to_index = self.label_to_index(labels) 130 | 131 | word_ids, segment_ids, word_mask, sequence_length, label_ids = self.save_ranking_tokens(query, sim) 132 | print('text to tokens process finished') 
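# NOTE: save_ranking_tokens() pairs every standard question with 2 positive and 5
# negatively-sampled candidates (see negative_sampling above) and caches the encoded
# pairs in train_tokens.pkl, so the next run takes the load branch at the top of load_data().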
133 | 134 | # train_data = dict(inputs_idx=inputs_idx, labels_idx=labels_idx) 135 | # with open(os.path.join(self.config['output_path'], "train_data.pkl"), "wb") as fw: 136 | # pickle.dump(train_data, fw) 137 | # labels_idx = labels 138 | self.word_idx, self.segment_idx, self.word_mask, self.sequence_length, \ 139 | self.labels_idx = word_ids, segment_ids, word_mask, sequence_length, label_ids 140 | 141 | def train_eval_split(self, word_ids, segment_ids, word_mask, sequence_length, 142 | labels, rate): 143 | 144 | split_index = int(len(word_ids) * rate) 145 | train_data = (word_ids[split_index:], segment_ids[split_index:], word_mask[split_index:], 146 | sequence_length[split_index:]) 147 | train_label = labels[split_index:] 148 | eval_data = (word_ids[:split_index], segment_ids[:split_index], word_mask[:split_index], 149 | sequence_length[:split_index]) 150 | eval_label = labels[:split_index] 151 | 152 | return train_data, train_label, eval_data, eval_label 153 | 154 | def gen_data(self, inputs_idx, labels_idx): 155 | ''' 156 | 生成批次数据 157 | :return: 158 | ''' 159 | word_ids, segment_ids, word_mask, sequence_length = inputs_idx[0], inputs_idx[1],inputs_idx[2],inputs_idx[3] 160 | batch_word_ids, batch_segment_ids, batch_word_mask, batch_sequence_length, batch_output_ids= [], [], [], [], [] 161 | 162 | for i in range(len(word_ids)): 163 | batch_word_ids.append(word_ids[i]) 164 | batch_segment_ids.append(segment_ids[i]) 165 | batch_word_mask.append(word_mask[i]) 166 | batch_sequence_length.append(sequence_length[i]) 167 | 168 | batch_output_ids.append(labels_idx[i]) 169 | 170 | 171 | if len(batch_output_ids) == self.batch_size: 172 | yield dict( 173 | input_word_ids=np.array(list(chain(*batch_word_ids)), dtype="int32"), 174 | input_mask=np.array(list(chain(*batch_word_mask)), dtype="int32"), 175 | input_type_ids=np.array(list(chain(*batch_segment_ids)), dtype="int32"), 176 | input_target_ids=np.array(list(chain(*batch_output_ids)), dtype="float32") 177 | ) 178 | batch_word_ids, batch_segment_ids, batch_word_mask, batch_sequence_length, batch_output_ids = [], [], [], [], [] 179 | 180 | -------------------------------------------------------------------------------- /tasks/Itr_pair_task.py: -------------------------------------------------------------------------------- 1 | from official.nlp.bert import tokenization 2 | import tensorflow as tf 3 | from official.nlp.configs import bert 4 | from official.nlp.configs import encoders 5 | from official.nlp.data import pretrain_dataloader 6 | 7 | from official.nlp.tasks.tagging import TaggingTask 8 | from trainer.train_base import TrainBase 9 | from official.nlp.modeling.models import BertClassifier 10 | import os 11 | import json 12 | from data_processor.text_match_data_generator import TextMatchDataGenerator 13 | from official.nlp.modeling.networks import BertEncoder 14 | from official.modeling import tf_utils 15 | from official.nlp.bert import configs as bert_configs 16 | from models.sim_bert import SimBert 17 | 18 | 19 | 20 | class ItrTask(TrainBase): 21 | ''' 22 | 基于bert的分类任务 23 | ''' 24 | def __init__(self, task_config): 25 | self.config = task_config 26 | self.loss = 'loss' 27 | super(ItrTask, self).__init__(task_config) 28 | self.data_generator = TextMatchDataGenerator(task_config) 29 | 30 | 31 | def build_model(self): 32 | ''' 33 | 构建模型 34 | ''' 35 | # encoder_network = encoders.build_encoder(encoders.EncoderConfig( 36 | # bert=encoders.BertEncoderConfig(vocab_size=21128))) 37 | encoder_network = self.build_encoder() 38 | model = 
SimBert(network=encoder_network, config=self.config) 39 | 40 | return model 41 | 42 | def build_encoder(self): 43 | bert_config = bert_configs.BertConfig.from_json_file(self.config['bert_config_path']) 44 | cfg = bert_config 45 | bert_encoder = BertEncoder( 46 | vocab_size=cfg.vocab_size, 47 | hidden_size=cfg.hidden_size, 48 | num_layers=cfg.num_hidden_layers, 49 | num_attention_heads=cfg.num_attention_heads, 50 | intermediate_size=cfg.intermediate_size, 51 | activation=tf_utils.get_activation(cfg.hidden_act), 52 | dropout_rate=cfg.hidden_dropout_prob, 53 | attention_dropout_rate=cfg.attention_probs_dropout_prob, 54 | max_sequence_length=cfg.max_position_embeddings, 55 | type_vocab_size=cfg.type_vocab_size, 56 | initializer=tf.keras.initializers.TruncatedNormal( 57 | stddev=cfg.initializer_range), 58 | embedding_width=cfg.embedding_size, 59 | return_all_encoder_outputs=True) 60 | # ckpt = tf.train.Checkpoint(model=bert_encoder) 61 | # init_checkpoint = self.config['bert_model_path'] 62 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 63 | # bert_encoder.load_weights(init_checkpoint) 64 | return bert_encoder 65 | 66 | def build_losses(self, labels, model_outputs, metrics, aux_losses=None) -> tf.Tensor: 67 | ''' 68 | 构建损失 69 | ''' 70 | with tf.name_scope('TextMatchTask/losses'): 71 | if self.config['model_name'] == 'simbert': 72 | # 构建对比损失 73 | y = tf.reshape(labels, (-1,)) 74 | similarity = model_outputs['logits'] 75 | cond = (similarity < self.config["neg_threshold"]) 76 | zeros = tf.zeros_like(similarity, dtype=tf.float32) 77 | ones = tf.ones_like(similarity, dtype=tf.float32) 78 | squre_similarity = tf.square(similarity) 79 | neg_similarity = tf.where(cond, squre_similarity, zeros) 80 | 81 | pos_loss = y * (tf.square(ones - similarity) / 4) 82 | neg_loss = (ones - y) * neg_similarity 83 | losses = pos_loss + neg_loss 84 | loss = tf.reduce_mean(losses) 85 | return loss 86 | 87 | metrics = dict([(metric.name, metric) for metric in metrics]) 88 | losses = tf.keras.losses.sparse_categorical_crossentropy(labels, 89 | tf.cast(model_outputs['predictions'], tf.float32), 90 | from_logits=True) 91 | 92 | loss = tf.reduce_mean(losses) 93 | 94 | return loss 95 | 96 | def build_inputs(self, inputs): 97 | ''' 98 | 构建输入 99 | ''' 100 | train_input = { 101 | "input_word_ids_a": tf.convert_to_tensor(inputs['input_word_ids_a']), 102 | "input_mask_a": tf.convert_to_tensor(inputs['input_mask_a']), 103 | "input_type_ids_a": tf.convert_to_tensor(inputs['input_type_ids_a']), 104 | "input_word_ids_b": tf.convert_to_tensor(inputs['input_word_ids_b']), 105 | "input_mask_b": tf.convert_to_tensor(inputs['input_mask_b']), 106 | "input_type_ids_b": tf.convert_to_tensor(inputs['input_type_ids_b']), 107 | "labels": inputs['input_target_ids'] 108 | } 109 | return train_input 110 | 111 | def train_step(self, 112 | inputs, 113 | model: tf.keras.Model, 114 | optimizer: tf.keras.optimizers.Optimizer, 115 | metrics=None): 116 | ''' 117 | 进行训练,前向和后向计算 118 | :param inputs: 119 | :param model: 120 | :param optimizer: 121 | :param metrics: 122 | :return: 123 | ''' 124 | 125 | with tf.GradientTape() as tape: 126 | outputs = model(inputs, training=True) 127 | loss = self.build_losses(inputs["labels"], outputs, metrics, aux_losses=None) 128 | 129 | tvars = model.trainable_variables 130 | grads = tape.gradient(loss, tvars) 131 | grads, _ = tf.clip_by_global_norm(grads, clip_norm=5.0) 132 | optimizer.apply_gradients(list(zip(grads, tvars))) 133 | labels = inputs['labels'] 134 | logs = {self.loss: loss} 135 | if 
metrics: 136 | self.process_metrics(metrics, labels, outputs['predictions']) 137 | logs.update({m.name: m.result() for m in model.metrics}) 138 | if model.compiled_metrics: 139 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs['predictions']) 140 | logs.update({m.name: m.result() for m in metrics or []}) 141 | logs.update({m.name: m.result() for m in model.metrics}) 142 | return logs 143 | 144 | def validation_step(self, inputs, model: tf.keras.Model, metrics=None): 145 | ''' 146 | 验证集验证模型 147 | :param input: 148 | :param model: 149 | :return: 150 | ''' 151 | labels = inputs['labels'] 152 | outputs = self.inference_step(inputs, model) 153 | loss = self.build_losses(labels, outputs, metrics, aux_losses=model.losses) 154 | 155 | logs = {self.loss: loss} 156 | if metrics: 157 | self.process_metrics(metrics, labels, outputs['predictions']) 158 | if model.compiled_metrics: 159 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs['predictions']) 160 | logs.update({m.name: m.result() for m in metrics or []}) 161 | logs.update({m.name: m.result() for m in model.metrics}) 162 | return logs 163 | 164 | def build_metrics(self, training=None): 165 | ''' 166 | 构建评价指标 167 | :param training: 168 | :return: 169 | ''' 170 | # del training 171 | metrics = [ 172 | tf.keras.metrics.SparseCategoricalAccuracy(name='text_match_metrics') 173 | ] 174 | 175 | return metrics 176 | 177 | def check_exist_model(self, model): 178 | ''' 179 | 检查是否存在模型文件 180 | :return: 181 | ''' 182 | # ckpt = tf.train.Checkpoint(models=models) 183 | init_checkpoint = os.path.join(self.config['ckpt_model_path'], self.config['model_name']) 184 | 185 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 186 | model.load_weights(init_checkpoint).assert_existing_objects_matched() 187 | 188 | 189 | if __name__=='__main__': 190 | with open("../model_configs/sim_bert.json", 'r') as fr: 191 | config = json.load(fr) 192 | print(config) 193 | Itr_pair = ItrTask(config) 194 | 195 | model = Itr_pair.build_model() 196 | bert_encoder = Itr_pair.build_encoder() 197 | ckpt = tf.train.Checkpoint(model=bert_encoder) 198 | init_checkpoint = config['bert_model_path'] 199 | ckpt.restore(init_checkpoint).assert_existing_objects_matched() 200 | # config = models.get_config() 201 | # Itr_pair.train(model) 202 | print(model.layers) 203 | 204 | 205 | -------------------------------------------------------------------------------- /bert_service/embedding_serving.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Examples of SavedModel export for tf-serving.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | import tensorflow as tf 20 | 21 | from official.nlp.bert import bert_models 22 | from official.nlp.bert import configs 23 | from tasks.embedding_task import EmbeddingTask 24 | from official.nlp.modeling.networks import BertEncoder 25 | from official.modeling import tf_utils 26 | from official.nlp.bert import configs as bert_configs 27 | from data_processor.tokenizer import tokenizer 28 | from models.sentence_embedding import SentenceEmbedding 29 | import json 30 | 31 | 32 | root_path = '/Users/donruo/Desktop/project/bert_tasks/' 33 | 34 | flags.DEFINE_string("bert_config_file", root_path+'chinese_wwm_ext_L-12_H-768_A-12/v2/bert_config.json', 35 | "Bert configuration file to define core bert layers.") 36 | flags.DEFINE_string("model_checkpoint_path", root_path+'chinese_wwm_ext_L-12_H-768_A-12/v2/bert_model.ckpt-1', 37 | "File path to TF model checkpoint.") 38 | flags.DEFINE_string("export_path", root_path+'chinese_wwm_ext_L-12_H-768_A-12/serve/versions/1', 39 | "Destination folder to export the serving SavedModel.") 40 | flags.DEFINE_string("config_path", root_path+'model_configs/sentence_embedding.json', "embedding model configurations") 41 | 42 | FLAGS = flags.FLAGS 43 | 44 | 45 | class BertServing(tf.keras.Model): 46 | """Bert transformer encoder model for serving.""" 47 | 48 | def __init__(self, config, bert_config, name_to_features, name="serving_model"): 49 | super(BertServing, self).__init__(name=name) 50 | 51 | cfg = bert_config 52 | self.bert_encoder = BertEncoder( 53 | vocab_size=cfg.vocab_size, 54 | hidden_size=cfg.hidden_size, 55 | num_layers=cfg.num_hidden_layers, 56 | num_attention_heads=cfg.num_attention_heads, 57 | intermediate_size=cfg.intermediate_size, 58 | activation=tf_utils.get_activation(cfg.hidden_act), 59 | dropout_rate=cfg.hidden_dropout_prob, 60 | attention_dropout_rate=cfg.attention_probs_dropout_prob, 61 | max_sequence_length=cfg.max_position_embeddings, 62 | type_vocab_size=cfg.type_vocab_size, 63 | initializer=tf.keras.initializers.TruncatedNormal( 64 | stddev=cfg.initializer_range), 65 | embedding_width=cfg.embedding_size, 66 | return_all_encoder_outputs=True) 67 | self.model = SentenceEmbedding(self.bert_encoder, config) 68 | # ckpt = tf.train.Checkpoint(model=self.bert_encoder) 69 | # init_checkpoint = self.config['bert_model_path'] 70 | # 71 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 72 | self.name_to_features = name_to_features 73 | 74 | def call(self, inputs): 75 | input_word_ids = inputs["input_word_ids"] 76 | input_mask = inputs["input_mask"] 77 | input_type_ids = inputs["input_type_ids"] 78 | infer_input = { 79 | "input_word_ids": input_word_ids, 80 | "input_mask": input_mask, 81 | "input_type_ids": input_type_ids, 82 | } 83 | encoder_outputs = self.model( 84 | infer_input) 85 | return encoder_outputs 86 | 87 | def serve_body(self, input_ids, input_mask=None, segment_ids=None): 88 | if segment_ids is None: 89 | # Requires CLS token is the first token of inputs. 90 | segment_ids = tf.zeros_like(input_ids) 91 | if input_mask is None: 92 | # The mask has 1 for real tokens and 0 for padding tokens. 
93 | input_mask = tf.where( 94 | tf.equal(input_ids, 0), tf.zeros_like(input_ids), 95 | tf.ones_like(input_ids)) 96 | 97 | inputs = dict( 98 | input_word_ids=input_ids, input_mask=input_mask, input_type_ids=segment_ids) 99 | return self.call(inputs) 100 | 101 | @tf.function 102 | def serve(self, input_ids, input_mask=None, segment_ids=None): 103 | outputs = self.serve_body(input_ids, input_mask, segment_ids) 104 | # Returns a dictionary to control SignatureDef output signature. 105 | return {"outputs": outputs} 106 | 107 | @tf.function 108 | def serve_examples(self, inputs): 109 | features = tf.io.parse_example(inputs, self.name_to_features) 110 | for key in list(features.keys()): 111 | t = features[key] 112 | if t.dtype == tf.int64: 113 | t = tf.cast(t, tf.int32) 114 | features[key] = t 115 | return self.serve( 116 | features["input_word_ids"], 117 | input_mask=features["input_mask"] if "input_mask" in features else None, 118 | segment_ids=features["input_type_ids"] 119 | if "input_type_ids" in features else None) 120 | 121 | @classmethod 122 | def export(cls, model, export_dir): 123 | if not isinstance(model, cls): 124 | raise ValueError("Invalid model instance: %s, it should be a %s" % 125 | (model, cls)) 126 | 127 | signatures = { 128 | "serving_default": 129 | model.serve.get_concrete_function( 130 | input_ids=tf.TensorSpec( 131 | shape=[None, None], dtype=tf.float32, name="inputs")), 132 | } 133 | if model.name_to_features: 134 | signatures[ 135 | "serving_examples"] = model.serve_examples.get_concrete_function( 136 | tf.TensorSpec(shape=[None], dtype=tf.string, name="examples")) 137 | tf.saved_model.save(model.model, export_dir=export_dir, signatures=signatures) 138 | 139 | 140 | def main(_): 141 | config_path = FLAGS.config_path 142 | with open(config_path, 'r') as fr: 143 | config = json.load(fr) 144 | sequence_length = config['seq_len'] 145 | if sequence_length is not None and sequence_length > 0: 146 | name_to_features = { 147 | "input_word_ids": tf.io.FixedLenFeature([sequence_length], tf.int64), 148 | "input_mask": tf.io.FixedLenFeature([sequence_length], tf.int64), 149 | "input_type_ids": tf.io.FixedLenFeature([sequence_length], tf.int64), 150 | } 151 | else: 152 | name_to_features = None 153 | bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file) 154 | serving_model = BertServing( 155 | config=config, bert_config=bert_config, name_to_features=name_to_features) 156 | checkpoint = tf.train.Checkpoint(model=serving_model.bert_encoder) 157 | checkpoint.restore(FLAGS.model_checkpoint_path 158 | ).assert_existing_objects_matched() 159 | '''.run_restore_ops()''' 160 | BertServing.export(serving_model, FLAGS.export_path) 161 | 162 | def get_serving_predict(self, sentence): 163 | ''' 164 | 使用tf-serving加载模型 165 | :param sentence: 166 | :return: 167 | ''' 168 | # docker 169 | # run - t - -rm - p 8500: 8500 \ 170 | # - v "/Users/donruo/Desktop/project/search_algorithm/ranking/tf_ranking/examples/output/export/latest_exporter/1614153823/" \ 171 | # - e MODEL_NAME = saved_model \ 172 | # tensorflow / serving: 1.15.0 & 173 | sentence = list(sentence) 174 | sentence_ids = self.sentence_to_idx(sentence) 175 | # print(sentence_ids) 176 | embedded_words = [] 177 | [embedded_words.append(self.word_vectors[i].tolist()) for i in sentence_ids] 178 | # print(len(embedded_words)) 179 | # tf.contrib.util.make_tensor_proto(padding_sentence, 180 | # dtype=tf.int64, 181 | # shape=[1, 50]).SerializeToString()) 182 | 183 | data = json.dumps({"signature_name": "classifier", 
"instances": [{"inputs": sentence_ids, "keep_prob": 1.0}]}) 184 | headers = {"content-type": "application/json"} 185 | json_response = requests.post('http://localhost:8501/v1/models/savedModel:predict', 186 | data=data, headers=headers) 187 | prediction = json.loads(json_response.text) 188 | print(prediction) 189 | 190 | return prediction 191 | 192 | if __name__ == "__main__": 193 | flags.mark_flag_as_required("bert_config_file") 194 | flags.mark_flag_as_required("model_checkpoint_path") 195 | flags.mark_flag_as_required("export_path") 196 | flags.mark_flag_as_required('config_path') 197 | app.run(main) 198 | -------------------------------------------------------------------------------- /tasks/ner_task.py: -------------------------------------------------------------------------------- 1 | from official.nlp.bert import tokenization 2 | import tensorflow as tf 3 | from official.nlp.configs import bert 4 | from official.nlp.configs import encoders 5 | from official.nlp.data import pretrain_dataloader 6 | 7 | from official.nlp.tasks.tagging import TaggingTask 8 | from trainer.train_base import TrainBase 9 | from official.nlp.modeling.models import BertTokenClassifier 10 | import os 11 | import json 12 | from data_processor.ner_data_generator import NERDataGenerator 13 | from official.nlp.modeling.networks import BertEncoder 14 | from official.modeling import tf_utils 15 | from official.nlp.bert import configs as bert_configs 16 | 17 | def _masked_labels_and_weights(y_true): 18 | """Masks negative values from token level labels. 19 | 20 | Args: 21 | y_true: Token labels, typically shape (batch_size, seq_len), where tokens 22 | with negative labels should be ignored during loss/accuracy calculation. 23 | 24 | Returns: 25 | (masked_y_true, masked_weights) where `masked_y_true` is the input 26 | with each negative label replaced with zero and `masked_weights` is 0.0 27 | where negative labels were replaced and 1.0 for original labels. 28 | """ 29 | # Ignore the classes of tokens with negative values. 30 | mask = tf.greater_equal(y_true, 0) 31 | # Replace negative labels, which are out of bounds for some loss functions, 32 | # with zero. 
33 | masked_y_true = tf.where(mask, y_true, 0) 34 | return masked_y_true, tf.cast(mask, tf.float32) 35 | 36 | class NERTask(TrainBase): 37 | ''' 38 | 基于bert的分类任务 39 | ''' 40 | def __init__(self, task_config): 41 | self.config = task_config 42 | self.loss = 'loss' 43 | super(NERTask, self).__init__(task_config) 44 | self.data_generator = NERDataGenerator(task_config) 45 | 46 | 47 | def build_model(self): 48 | ''' 49 | 构建模型 50 | ''' 51 | # encoder_network = encoders.build_encoder(encoders.EncoderConfig( 52 | # bert=encoders.BertEncoderConfig(vocab_size=21128))) 53 | encoder_network = self.build_encoder() 54 | 55 | 56 | 57 | model = BertTokenClassifier(network=encoder_network, 58 | num_classes=self.config['tag_categories'], 59 | dropout_rate=self.config['dropout_rate'], 60 | output='logits') 61 | # ckpt = tf.train.Checkpoint(models=models) 62 | 63 | # init_checkpoint = self.config['bert_model_path'] 64 | 65 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 66 | 67 | # models.load_weights(init_checkpoint).assert_existing_objects_matched() 68 | return model 69 | 70 | def build_encoder(self): 71 | bert_config = bert_configs.BertConfig.from_json_file(self.config['bert_config_path']) 72 | cfg = bert_config 73 | bert_encoder = BertEncoder( 74 | vocab_size=cfg.vocab_size, 75 | hidden_size=cfg.hidden_size, 76 | num_layers=cfg.num_hidden_layers, 77 | num_attention_heads=cfg.num_attention_heads, 78 | intermediate_size=cfg.intermediate_size, 79 | activation=tf_utils.get_activation(cfg.hidden_act), 80 | dropout_rate=cfg.hidden_dropout_prob, 81 | attention_dropout_rate=cfg.attention_probs_dropout_prob, 82 | max_sequence_length=cfg.max_position_embeddings, 83 | type_vocab_size=cfg.type_vocab_size, 84 | initializer=tf.keras.initializers.TruncatedNormal( 85 | stddev=cfg.initializer_range), 86 | embedding_width=cfg.embedding_size, 87 | return_all_encoder_outputs=False) 88 | # ckpt = tf.train.Checkpoint(model=bert_encoder) 89 | # init_checkpoint = self.config['bert_model_path'] 90 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 91 | # bert_encoder.load_weights(init_checkpoint) 92 | return bert_encoder 93 | 94 | def build_losses(self, labels, model_outputs, metrics, aux_losses=None) -> tf.Tensor: 95 | ''' 96 | 构建损失 97 | ''' 98 | masked_labels, masked_weights = _masked_labels_and_weights(labels) 99 | metrics = dict([(metric.name, metric) for metric in metrics]) 100 | losses = tf.keras.losses.sparse_categorical_crossentropy(masked_labels, 101 | tf.cast(model_outputs, tf.float32), 102 | from_logits=True) 103 | # metrics['losses'].update_state(losses) 104 | loss = losses 105 | numerator_loss = tf.reduce_sum(loss * masked_weights) 106 | denominator_loss = tf.reduce_sum(masked_weights) 107 | loss = tf.math.divide_no_nan(numerator_loss, denominator_loss) 108 | 109 | return loss 110 | 111 | def train_step(self, 112 | inputs, 113 | model:tf.keras.Model, 114 | optimizer: tf.keras.optimizers.Optimizer, 115 | metrics=None): 116 | ''' 117 | 进行训练,前向和后向计算 118 | :param inputs: 119 | :param model: 120 | :param optimizer: 121 | :param metrics: 122 | :return: 123 | ''' 124 | with tf.GradientTape() as tape: 125 | outputs = model(inputs, training=True) 126 | outputs = outputs[:, 1:self.config['seq_len'] + 1, :] 127 | loss = self.build_losses(labels=inputs['labels'], model_outputs=outputs, metrics=metrics, aux_losses=None) 128 | tvars = model.trainable_variables 129 | grads = tape.gradient(loss, tvars) 130 | grads, _ = tf.clip_by_global_norm(grads, clip_norm=5.0) 131 | 
optimizer.apply_gradients(list(zip(grads, tvars))) 132 | labels = inputs['labels'] 133 | logs = {self.loss: loss} 134 | if metrics: 135 | self.process_metrics(metrics, labels, outputs) 136 | logs.update({m.name: m.result() for m in model.metrics}) 137 | if model.compiled_metrics: 138 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs) 139 | logs.update({m.name: m.result() for m in metrics or []}) 140 | logs.update({m.name: m.result() for m in model.metrics}) 141 | return logs 142 | 143 | def validation_step(self, inputs, model:tf.keras.Model, metrics=None): 144 | ''' 145 | 验证集验证模型 146 | :param input: 147 | :param model: 148 | :return: 149 | ''' 150 | labels = inputs['labels'] 151 | outputs = self.inference_step(inputs, model) 152 | outputs = outputs[:, 1:self.config['seq_len'] + 1, :] 153 | loss = self.build_losses(labels, outputs, metrics, aux_losses=model.losses) 154 | 155 | logs = {self.loss: loss} 156 | if metrics: 157 | self.process_metrics(metrics, labels, outputs) 158 | if model.compiled_metrics: 159 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs) 160 | logs.update({m.name: m.result() for m in metrics or []}) 161 | logs.update({m.name: m.result() for m in model.metrics}) 162 | return logs 163 | 164 | def build_inputs(self, inputs): 165 | ''' 166 | 构建输入 167 | ''' 168 | train_input = { 169 | "input_word_ids": tf.convert_to_tensor(inputs['input_word_ids']), 170 | "input_mask": tf.convert_to_tensor(inputs['input_mask']), 171 | "input_type_ids": tf.convert_to_tensor(inputs['input_type_ids']), 172 | "labels": inputs['input_target_ids'] 173 | } 174 | return train_input 175 | 176 | def build_metrics(self, training=None): 177 | ''' 178 | 构建评价指标 179 | :param training: 180 | :return: 181 | ''' 182 | # del training 183 | metrics = [ 184 | tf.keras.metrics.SparseCategoricalAccuracy(name='ner_metrics') 185 | ] 186 | 187 | # metrics = dict([(metric.name, metric) for metric in metrics]) 188 | 189 | return metrics 190 | 191 | def check_exist_model(self, model): 192 | ''' 193 | 检查是否存在模型文件 194 | :return: 195 | ''' 196 | # ckpt = tf.train.Checkpoint(models=models) 197 | init_checkpoint = os.path.join(self.config['ckpt_model_path'], self.config['model_name']) 198 | 199 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 200 | model.load_weights(init_checkpoint).assert_existing_objects_matched() 201 | 202 | 203 | if __name__=='__main__': 204 | with open("../model_configs/bert_ner.json", 'r') as fr: 205 | config = json.load(fr) 206 | print(config) 207 | ner = NERTask(config) 208 | 209 | model = ner.build_model() 210 | bert_encoder = ner.build_encoder() 211 | ckpt = tf.train.Checkpoint(model=bert_encoder) 212 | init_checkpoint = config['bert_model_path'] 213 | ckpt.restore(init_checkpoint).assert_existing_objects_matched() 214 | # config = models.get_config() 215 | ner.train(model) 216 | 217 | 218 | -------------------------------------------------------------------------------- /data_processor/tokenizer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 文本转化成tokens 3 | ''' 4 | from data_processor.base_processor import data_base 5 | from itertools import chain 6 | import numpy as np 7 | import pickle 8 | import os 9 | from official.nlp.bert import tokenization 10 | 11 | 12 | class tokenizer(data_base): 13 | ''' 14 | 文本转tokens 15 | ''' 16 | def __init__(self, token_configs): 17 | self.token_configs = token_configs 18 | super(tokenizer, self).__init__(token_configs) 19 | 20 | def tokens_to_ids(self, 
tokens, tokens_to_index): 21 | ''' 22 | token转索引 23 | :param tokens: 24 | :return: 25 | ''' 26 | ids = [tokens_to_index.get(token, 1) for token in tokens] 27 | return ids 28 | 29 | def labels_to_ids(self, labels, labels_to_index): 30 | ''' 31 | token转索引 32 | :param tokens: 33 | :return: 34 | ''' 35 | ids = [labels_to_index.get(token) for token in labels] 36 | return ids 37 | 38 | def seq_labels_to_ids(self, labels, labels_to_index): 39 | ''' 40 | token转索引 41 | :param tokens: 42 | :return: 43 | ''' 44 | if len(labels) < self.config['seq_len']: 45 | labels += ['O'] * (self.config['seq_len'] - len(labels)) 46 | else: 47 | labels = labels[:self.config['seq_len']] 48 | nan_id = labels_to_index.get('O') 49 | ids = [labels_to_index.get(token, nan_id) for token in labels] 50 | return ids 51 | 52 | def seq2seq_label_process(self, labels): 53 | ''' 54 | seq2seq任务处理label数据,在头尾添加, 55 | :param labels: 56 | :return: 57 | ''' 58 | res = [] 59 | for line in labels: 60 | line.insert(0, "") 61 | line.insert(-1, "") 62 | res.append(line) 63 | return res 64 | 65 | def ids_to_tokens(self, ids, tokens_to_index): 66 | ''' 67 | 索引转成token 68 | :param ids: 69 | :return: 70 | ''' 71 | tokens = [list(tokens_to_index.keys())[id] for id in ids] 72 | return tokens 73 | 74 | def multi_label_to_index(self, labels, label_to_index): 75 | ''' 76 | 多标签数据转索引 77 | :param labels: 78 | :return: 79 | ''' 80 | label_idxs = np.zeros((len(labels), len(label_to_index))) 81 | 82 | for i, label in enumerate(labels): 83 | for l in label: 84 | id = label_to_index.get(l) 85 | label_idxs[i, id] = 1 86 | return label_idxs 87 | 88 | def word_to_index(self, all_words): 89 | ''' 90 | 生成词汇-索引字典 91 | :param texts: 92 | :return: 93 | ''' 94 | 95 | #是否过滤低频词 96 | if self.config['freq_filter']: 97 | vocab = self.word_freq_filter(self.config['freq_filter'], all_words) 98 | else: 99 | vocab = self.get_vocab(all_words) 100 | #设置词典大小 101 | vocab = ["", ""] + vocab 102 | self.vocab_size = self.config['vocab_size'] 103 | if len(vocab) < self.vocab_size: 104 | self.vocab_size = len(vocab) 105 | self.vocab = vocab[:self.vocab_size] 106 | #构建词典索引 107 | word_to_index = dict(zip(vocab, list(range(len(vocab))))) 108 | 109 | return word_to_index 110 | 111 | 112 | def label_to_index(self, labels): 113 | ''' 114 | 标签索引字典 115 | :param labels: 116 | :return: 117 | ''' 118 | if not self.config['multi_label']: 119 | unique_labels = list(set(labels)) # 单标签转换 120 | else: 121 | unique_labels = list(set(chain(*labels)))#多标签转换 122 | label_to_index = dict(zip(unique_labels, list(range(len(unique_labels))))) 123 | return label_to_index 124 | 125 | def padding(self, tokens): 126 | ''' 127 | 将输入序列做定长处理 128 | :param tokens: 129 | :return: 130 | ''' 131 | if len(tokens) < self.config['seq_len']: 132 | tokens += [0] * (self.config['seq_len'] - len(tokens)) 133 | else: 134 | tokens = tokens[:self.config['seq_len']] 135 | return tokens 136 | 137 | def encode(self, text): 138 | ''' 139 | 句子转成token 140 | :param file_path: 141 | :return: 142 | ''' 143 | _tokenizer = tokenization.FullTokenizer(self.config['vocab_path'], do_lower_case=True) 144 | if isinstance(text, str): 145 | 146 | split_tokens = _tokenizer.tokenize(text) 147 | else: 148 | split_tokens = text 149 | if len(split_tokens) > self.config['seq_len']: 150 | split_tokens = split_tokens[:self.config['seq_len']] 151 | sequence_length = self.config['seq_len'] 152 | else: 153 | sequence_length = len(split_tokens) 154 | while (len(split_tokens) < self.config['seq_len']): 155 | split_tokens.append("[PAD]") 156 | # word_mask = 
[[1]*(maxlen+2) for i in range(data_len)] 157 | 158 | tokens = [] 159 | tokens.append("[CLS]") 160 | for i in split_tokens: 161 | if i not in _tokenizer.vocab: 162 | tokens.append("[UNK]") 163 | print(i) 164 | continue 165 | tokens.append(i) 166 | tokens.append("[SEP]") 167 | word_ids = _tokenizer.convert_tokens_to_ids(tokens) 168 | word_mask = [] 169 | for token in tokens: # 1 = real token/[CLS]/[SEP], 0 = [PAD] 170 | if token == "[PAD]": 171 | word_mask.append(0) 172 | else: 173 | word_mask.append(1) 174 | segment_ids = [0] * len(word_ids) 175 | return word_ids, segment_ids, word_mask, sequence_length 176 | 177 | def encode_v2(self, text_1, text_2): 178 | ''' 179 | 交互式文本匹配编码 180 | ''' 181 | _tokenizer = tokenization.FullTokenizer(self.config['vocab_path'], do_lower_case=True) 182 | if isinstance(text_1, str): 183 | split_tokens_1 = _tokenizer.tokenize(text_1) 184 | else: 185 | split_tokens_1 = text_1 186 | if isinstance(text_2, str): 187 | split_tokens_2 = _tokenizer.tokenize(text_2) 188 | else: 189 | split_tokens_2 = text_2 190 | 191 | if len(split_tokens_1) + len(split_tokens_2) > self.config['seq_len']: 192 | split_tokens_2 = split_tokens_2[:self.config['seq_len'] - len(split_tokens_1)] 193 | sequence_length = self.config['seq_len'] 194 | else: 195 | sequence_length = len(split_tokens_1) + len(split_tokens_2) 196 | while (len(split_tokens_1) + len(split_tokens_2) < self.config['seq_len']): 197 | split_tokens_2.append("[PAD]") 198 | 199 | tokens = [] 200 | segment_ids = [] 201 | tokens.append("[CLS]") 202 | segment_ids.append(0) 203 | for i in split_tokens_1: 204 | if i not in _tokenizer.vocab: 205 | tokens.append("[UNK]") 206 | print(i) 207 | continue 208 | tokens.append(i) 209 | segment_ids.append(0) 210 | tokens.append("[SEP]") 211 | segment_ids.append(0) 212 | for i in split_tokens_2: 213 | if i not in _tokenizer.vocab: 214 | tokens.append("[UNK]") 215 | print(i) 216 | continue 217 | tokens.append(i) 218 | segment_ids.append(1) 219 | tokens.append("[SEP]") 220 | segment_ids.append(1) 221 | word_ids = _tokenizer.convert_tokens_to_ids(tokens) 222 | word_mask = [] 223 | for token in tokens: # 1 = real token/[CLS]/[SEP], 0 = [PAD] 224 | if token == "[PAD]": 225 | word_mask.append(0) 226 | else: 227 | word_mask.append(1) 228 | return word_ids, segment_ids, word_mask, sequence_length 229 | 230 | def save_input_tokens(self, texts, labels, label_to_index): 231 | ''' 232 | 保存处理完成的输入tokens,方便后续加载 233 | :param texts: 234 | :return: 235 | ''' 236 | 237 | word_ids, segment_ids, word_mask, sequence_length = [], [], [], [] 238 | label_ids = [] 239 | for i,text in enumerate(texts): 240 | 241 | _word_ids, _segment_ids, _word_mask, _sequence_length = self.encode(text) 242 | word_ids.append(_word_ids) 243 | segment_ids.append(_segment_ids) 244 | word_mask.append(_word_mask) 245 | sequence_length.append(_sequence_length) 246 | label_id = self.labels_to_ids([labels[i]], label_to_index) 247 | label_ids.append(label_id) 248 | 249 | 250 | input_tokens = dict(word_ids=word_ids, segment_ids=segment_ids, word_mask=word_mask, sequence_length=sequence_length, labels_idx=label_ids) 251 | if not os.path.exists(self.config['output_path']): 252 | os.mkdir(self.config['output_path']) 253 | #保存准备训练的tokens数据 254 | with open(os.path.join(self.config['output_path'], 'train_tokens.pkl'), "wb") as fw: 255 | pickle.dump(input_tokens, fw) 256 | # 保存预处理的word_to_index数据 257 | # with open(os.path.join(self.config['output_path'], 'word_to_index.pkl'), "wb") as fw: 258 | # pickle.dump(word_to_index, fw) 259 | # 保存预处理的label_to_index数据 260 | with open(os.path.join(self.config['output_path'], 'label_to_index.pkl'), "wb")
as fw: 261 | pickle.dump(label_to_index, fw) 262 | return word_ids, segment_ids, word_mask, sequence_length, label_ids -------------------------------------------------------------------------------- /models/model_base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import functools 4 | from typing import Any, Callable, Optional 5 | import abc 6 | 7 | 8 | class BaseModel(tf.Module, metaclass=abc.ABCMeta): 9 | ''' 10 | 模型的基类 11 | ''' 12 | def __init__(self, config): 13 | self.config = config 14 | super(BaseModel, self).__init__() 15 | 16 | def build_model(self): 17 | ''' 18 | 创建模型 19 | :return: 20 | ''' 21 | raise NotImplementedError 22 | 23 | def build_inputs(self, inputs): 24 | ''' 25 | 创建输入 26 | :return: 27 | ''' 28 | raise NotImplementedError 29 | 30 | def build_losses(self, labels, model_outputs, metrics, aux_losses) -> tf.Tensor: 31 | ''' 32 | 计算loss值 33 | :param labels: 34 | :param model_outputs: 35 | :param metrics: 36 | :return: 37 | ''' 38 | raise NotImplementedError 39 | 40 | def build_metrics(self, training: bool = True): 41 | """ 42 | 获取模型训练/验证的评价指标 43 | :param training: 44 | :return: 45 | """ 46 | del training 47 | return [] 48 | 49 | def compile_model(self, 50 | model: tf.keras.Model, 51 | optimizer: tf.keras.optimizers.Optimizer, 52 | loss=None, 53 | train_step: Optional[Callable[..., Any]] = None, 54 | validation_step: Optional[Callable[..., Any]] = None, 55 | **kwargs) -> tf.keras.Model: 56 | """Compiles the model with objects created by the task. 57 | 58 | The method should not be used in any customized training implementation. 59 | 60 | Args: 61 | model: a keras.Model. 62 | optimizer: the keras optimizer. 63 | loss: a callable/list of losses. 64 | train_step: optional train step function defined by the task. 65 | validation_step: optional validation_step step function defined by the 66 | task. 67 | **kwargs: other kwargs consumed by keras.Model compile(). 68 | 69 | Returns: 70 | a compiled keras.Model.
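Note: exactly one of `loss` and `train_step` should be provided; the check
below raises a ValueError when both or neither are given.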
71 | """ 72 | if bool(loss is None) == bool(train_step is None): 73 | raise ValueError("`loss` and `train_step` should be exclusive to " 74 | "each other.") 75 | model.compile(optimizer=optimizer, loss=loss, **kwargs) 76 | 77 | if train_step: 78 | model.train_step = functools.partial( 79 | train_step, model=model, optimizer=model.optimizer) 80 | if validation_step: 81 | model.test_step = functools.partial(validation_step, model=model) 82 | return model 83 | 84 | def process_metrics(self, metrics, labels, model_outputs): 85 | ''' 86 | 处理并更新评价指标 87 | :param metrics: 88 | :param labels: 89 | :param model_outputs: 90 | :return: 91 | ''' 92 | for metric in metrics: 93 | metric.update_state(labels, model_outputs) 94 | 95 | def process_compiled_metrics(self, compiled_metrics, labels, model_outputs): 96 | ''' 97 | 处理并更新compiled metrics 98 | :param compiled_metrics: 99 | :param labels: 100 | :param model_outputs: 101 | :return: 102 | ''' 103 | compiled_metrics.update_state(labels, model_outputs) 104 | 105 | def get_optimizer(self): 106 | ''' 107 | 选择优化算法 108 | :return: 109 | ''' 110 | option = self.config['optimizer'] 111 | optimizer = None 112 | learning_rate = self.config['learning_rate'] 113 | if option == 'adam': 114 | optimizer = tf.keras.optimizers.Adam(learning_rate) 115 | if option == 'rmsprop': 116 | optimizer = tf.keras.optimizers.RMSprop(learning_rate) 117 | if option == 'sgd': 118 | optimizer = tf.keras.optimizers.SGD(learning_rate) 119 | return optimizer 120 | 121 | def train_step(self, 122 | inputs, 123 | model:tf.keras.Model, 124 | optimizer: tf.keras.optimizers.Optimizer, 125 | metrics=None): 126 | ''' 127 | 进行训练,前向和后向计算 128 | :param inputs: 129 | :param model: 130 | :param optimizer: 131 | :param metrics: 132 | :return: 133 | ''' 134 | with tf.GradientTape() as tape: 135 | outputs = model(inputs, training=True) 136 | loss = self.build_losses(labels=inputs['labels'], model_outputs=outputs, metrics=metrics, aux_losses=None) 137 | tvars = model.trainable_variables 138 | grads = tape.gradient(loss, tvars) 139 | grads, _ = tf.clip_by_global_norm(grads, clip_norm=5.0) 140 | optimizer.apply_gradients(list(zip(grads, tvars))) 141 | labels = inputs['labels'] 142 | logs = {self.loss: loss} 143 | if metrics: 144 | self.process_metrics(metrics, labels, outputs) 145 | logs.update({m.name: m.result() for m in model.metrics}) 146 | if model.compiled_metrics: 147 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs) 148 | logs.update({m.name: m.result() for m in metrics or []}) 149 | logs.update({m.name: m.result() for m in model.metrics}) 150 | return logs 151 | 152 | 153 | def validation_step(self, inputs, model:tf.keras.Model, metrics=None): 154 | ''' 155 | 验证集验证模型 156 | :param input: 157 | :param model: 158 | :return: 159 | ''' 160 | labels = inputs['labels'] 161 | outputs = self.inference_step(inputs, model) 162 | loss = self.build_losses(labels, outputs, metrics, aux_losses=model.losses) 163 | 164 | logs = {self.loss: loss} 165 | if metrics: 166 | self.process_metrics(metrics, labels, outputs) 167 | if model.compiled_metrics: 168 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs) 169 | logs.update({m.name: m.result() for m in metrics or []}) 170 | logs.update({m.name: m.result() for m in model.metrics}) 171 | return logs 172 | 173 | def inference_step(self, inputs, model:tf.keras.Model): 174 | ''' 175 | 模型推理 176 | :param inputs: 177 | :param model: 178 | :return: 179 | ''' 180 | return model(inputs, training=False) 181 | 182 | def 
get_predictions(self, logits): 183 | ''' 184 | 模型预测结果 185 | :param input: 186 | :param models: 187 | :return: 188 | ''' 189 | 190 | predictions = tf.keras.layers.Activation( 191 | tf.nn.log_softmax, dtype=tf.float32)(logits).numpy() 192 | predictions = tf.argmax(predictions, axis=-1, name='predictions') 193 | 194 | return predictions 195 | 196 | def save_ckpt_model(self, model:tf.keras.Model): 197 | ''' 198 | 将模型保存成ckpt格式 199 | :param model: 200 | :return: 201 | ''' 202 | save_path = os.path.join(os.path.abspath(os.path.dirname(os.getcwd())), 203 | self.config["ckpt_model_path"]) 204 | if not os.path.exists(save_path): 205 | os.makedirs(save_path) 206 | model_save_path = os.path.join(save_path, self.config["model_name"]) 207 | 208 | # checkpoint = tf.train.Checkpoint(models) 209 | # checkpoint.save(model_save_path + '/models.ckpt') 210 | model.save_weights(model_save_path) 211 | 212 | def save_pb_model(self, model:tf.keras.Model, checkpoint_dir=None, restore_model_using_load_weights=True): 213 | ''' 214 | 将模型保存成pb格式 215 | :param model: 216 | :return: 217 | ''' 218 | save_path = os.path.join(os.path.abspath(os.path.dirname(os.getcwd())), 219 | self.config["export_model_path"]) 220 | if not os.path.exists(save_path): 221 | os.makedirs(save_path) 222 | model_export_path = os.path.join(save_path, self.config["model_name"]) 223 | 224 | if checkpoint_dir: 225 | # Keras compile/fit() was used to save checkpoint using 226 | # models.save_weights(). 227 | if restore_model_using_load_weights: 228 | model_weight_path = os.path.join(checkpoint_dir, 'checkpoint') 229 | assert tf.io.gfile.exists(model_weight_path) 230 | model.load_weights(model_weight_path) 231 | 232 | # tf.train.Checkpoint API was used via custom training loop logic. 233 | else: 234 | checkpoint = tf.train.Checkpoint(model=model) 235 | 236 | # Restores the models from latest checkpoint. 
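# tf.train.latest_checkpoint() returns the path prefix of the newest checkpoint in
# checkpoint_dir, or None if the directory contains no checkpoint, hence the assert below.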
237 | latest_checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) 238 | assert latest_checkpoint_file 239 | 240 | checkpoint.restore( 241 | latest_checkpoint_file).assert_existing_objects_matched() 242 | 243 | model.save(model_export_path, include_optimizer=False, save_format='tf') 244 | 245 | def load_ckpt_model(self, model, path, model_name): 246 | ''' 247 | 加载ckpt模型 248 | :param model_path: 249 | :return: 250 | ''' 251 | # models = self.create_model() 252 | path = os.path.join(path, model_name) 253 | model.load_weights(path) 254 | return model 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | -------------------------------------------------------------------------------- /tasks/distillation_task.py: -------------------------------------------------------------------------------- 1 | from official.nlp.bert import tokenization 2 | import tensorflow as tf 3 | from official.nlp.configs import bert 4 | from official.nlp.configs import encoders 5 | from official.nlp.data import pretrain_dataloader 6 | 7 | from official.nlp.tasks.tagging import TaggingTask 8 | from trainer.train_base import TrainBase 9 | from official.nlp.modeling.models import BertClassifier 10 | import os 11 | import json 12 | from data_processor.text_match_data_generator import TextMatchDataGenerator 13 | from official.nlp.modeling.networks import BertEncoder 14 | from official.modeling import tf_utils 15 | from official.nlp.bert import configs as bert_configs 16 | from models.knowledge_distiilation import Distill_model 17 | from models.sim_bert import SimBert 18 | 19 | 20 | 21 | class DistillTask(TrainBase): 22 | ''' 23 | 基于bert的知识蒸馏任务 24 | ''' 25 | def __init__(self, task_config): 26 | self.config = task_config 27 | self.loss = 'loss' 28 | super(DistillTask, self).__init__(task_config) 29 | self.data_generator = TextMatchDataGenerator(task_config) 30 | 31 | 32 | def build_model(self): 33 | ''' 34 | 构建模型 35 | ''' 36 | # encoder_network = encoders.build_encoder(encoders.EncoderConfig( 37 | # bert=encoders.BertEncoderConfig(vocab_size=21128))) 38 | bert_config = bert_configs.BertConfig.from_json_file(self.config['bert_config_path']) 39 | encoder_network = self.build_encoder() 40 | teacher_network = SimBert(network=encoder_network, config=self.config) 41 | 42 | model = Distill_model(teacher_network=teacher_network, config=self.config, vocab_size=bert_config.vocab_size, word_vectors=None) 43 | 44 | return model 45 | 46 | def build_encoder(self): 47 | bert_config = bert_configs.BertConfig.from_json_file(self.config['bert_config_path']) 48 | cfg = bert_config 49 | bert_encoder = BertEncoder( 50 | vocab_size=cfg.vocab_size, 51 | hidden_size=cfg.hidden_size, 52 | num_layers=cfg.num_hidden_layers, 53 | num_attention_heads=cfg.num_attention_heads, 54 | intermediate_size=cfg.intermediate_size, 55 | activation=tf_utils.get_activation(cfg.hidden_act), 56 | dropout_rate=cfg.hidden_dropout_prob, 57 | attention_dropout_rate=cfg.attention_probs_dropout_prob, 58 | max_sequence_length=cfg.max_position_embeddings, 59 | type_vocab_size=cfg.type_vocab_size, 60 | initializer=tf.keras.initializers.TruncatedNormal( 61 | stddev=cfg.initializer_range), 62 | embedding_width=cfg.embedding_size, 63 | return_all_encoder_outputs=True) 64 | # ckpt = tf.train.Checkpoint(model=bert_encoder) 65 | # init_checkpoint = self.config['bert_model_path'] 66 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 67 | # bert_encoder.load_weights(init_checkpoint) 68 | return bert_encoder 69 | 70 | def build_losses(self, labels, 
model_outputs, metrics, aux_losses=None) -> tf.Tensor: 71 | ''' 72 | 构建损失 73 | ''' 74 | with tf.name_scope('TextMatchTask/losses'): 75 | if self.config['model_name'] == 'distill_model': 76 | # mse损失计算 77 | y = tf.reshape(labels, (-1,)) 78 | student_soft_label = model_outputs['student_soft_label'] 79 | teacher_soft_label = model_outputs['teacher_soft_label'] 80 | mse_loss = tf.keras.losses.mean_squared_error(teacher_soft_label, student_soft_label) 81 | 82 | #ce损失计算 83 | similarity = model_outputs['student_hard_label'] 84 | cond = (similarity < self.config["neg_threshold"]) 85 | zeros = tf.zeros_like(similarity, dtype=tf.float32) 86 | ones = tf.ones_like(similarity, dtype=tf.float32) 87 | squre_similarity = tf.square(similarity) 88 | neg_similarity = tf.where(cond, squre_similarity, zeros) 89 | 90 | pos_loss = y * (tf.square(ones - similarity) / 4) 91 | neg_loss = (ones - y) * neg_similarity 92 | ce_loss = pos_loss+neg_loss 93 | losses = self.config['alpha']*mse_loss + (1-self.config['alpha'])*ce_loss 94 | loss = tf.reduce_mean(losses) 95 | return loss 96 | 97 | metrics = dict([(metric.name, metric) for metric in metrics]) 98 | losses = tf.keras.losses.sparse_categorical_crossentropy(labels, 99 | tf.cast(model_outputs['predictions'], tf.float32), 100 | from_logits=True) 101 | 102 | loss = tf.reduce_mean(losses) 103 | 104 | return loss 105 | 106 | def build_inputs(self, inputs): 107 | ''' 108 | 构建输入 109 | ''' 110 | train_input = { 111 | "input_x_ids": tf.convert_to_tensor(inputs['input_word_ids_a']), 112 | "input_y_ids": tf.convert_to_tensor(inputs['input_word_ids_b']), 113 | "input_word_ids_a": tf.convert_to_tensor(inputs['input_word_ids_a']), 114 | "input_mask_a": tf.convert_to_tensor(inputs['input_mask_a']), 115 | "input_type_ids_a": tf.convert_to_tensor(inputs['input_type_ids_a']), 116 | "input_word_ids_b": tf.convert_to_tensor(inputs['input_word_ids_b']), 117 | "input_mask_b": tf.convert_to_tensor(inputs['input_mask_b']), 118 | "input_type_ids_b": tf.convert_to_tensor(inputs['input_type_ids_b']), 119 | "labels": inputs['input_target_ids'] 120 | } 121 | return train_input 122 | 123 | def train_step(self, 124 | inputs, 125 | model: tf.keras.Model, 126 | optimizer: tf.keras.optimizers.Optimizer, 127 | metrics=None): 128 | ''' 129 | 进行训练,前向和后向计算 130 | :param inputs: 131 | :param model: 132 | :param optimizer: 133 | :param metrics: 134 | :return: 135 | ''' 136 | 137 | with tf.GradientTape() as tape: 138 | outputs = model(inputs, training=True) 139 | loss = self.build_losses(inputs["labels"], outputs, metrics, aux_losses=None) 140 | 141 | tvars = model.trainable_variables 142 | grads = tape.gradient(loss, tvars) 143 | grads, _ = tf.clip_by_global_norm(grads, clip_norm=5.0) 144 | optimizer.apply_gradients(list(zip(grads, tvars))) 145 | labels = inputs['labels'] 146 | logs = {self.loss: loss} 147 | if metrics: 148 | self.process_metrics(metrics, labels, outputs['predictions']) 149 | logs.update({m.name: m.result() for m in model.metrics}) 150 | if model.compiled_metrics: 151 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs['predictions']) 152 | logs.update({m.name: m.result() for m in metrics or []}) 153 | logs.update({m.name: m.result() for m in model.metrics}) 154 | return logs 155 | 156 | def validation_step(self, inputs, model: tf.keras.Model, metrics=None): 157 | ''' 158 | 验证集验证模型 159 | :param input: 160 | :param model: 161 | :return: 162 | ''' 163 | labels = inputs['labels'] 164 | outputs = self.inference_step(inputs, model) 165 | loss = self.build_losses(labels, 
outputs, metrics, aux_losses=model.losses) 166 | 167 | logs = {self.loss: loss} 168 | if metrics: 169 | self.process_metrics(metrics, labels, outputs['predictions']) 170 | if model.compiled_metrics: 171 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs['predictions']) 172 | logs.update({m.name: m.result() for m in metrics or []}) 173 | logs.update({m.name: m.result() for m in model.metrics}) 174 | return logs 175 | 176 | def build_metrics(self, training=None): 177 | ''' 178 | 构建评价指标 179 | :param training: 180 | :return: 181 | ''' 182 | # del training 183 | metrics = [ 184 | tf.keras.metrics.SparseCategoricalAccuracy(name='text_match_metrics') 185 | ] 186 | 187 | return metrics 188 | 189 | def check_exist_model(self, model): 190 | ''' 191 | 检查是否存在模型文件 192 | :return: 193 | ''' 194 | # ckpt = tf.train.Checkpoint(models=models) 195 | init_checkpoint = os.path.join(self.config['ckpt_model_path'], self.config['model_name']) 196 | 197 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 198 | model.load_weights(init_checkpoint).assert_existing_objects_matched() 199 | 200 | 201 | if __name__=='__main__': 202 | with open("../model_configs/distill_bert.json", 'r') as fr: 203 | config = json.load(fr) 204 | print(config) 205 | distill_pair = DistillTask(config) 206 | 207 | model = distill_pair.build_model() 208 | bert_encoder = distill_pair.build_encoder() 209 | ckpt = tf.train.Checkpoint(model=bert_encoder) 210 | init_checkpoint = config['bert_model_path'] 211 | ckpt.restore(init_checkpoint).assert_existing_objects_matched() 212 | # config = models.get_config() 213 | # new_model = tf.keras.Model(inputs=model.inputs[0:2], outputs=model.output['predictions']) 214 | # for layer in model.layers: 215 | # if layer.name!='sim_bert': 216 | # new_model.add(layer) 217 | distill_pair.train(model) 218 | # print(new_model.summary()) 219 | 220 | 221 | -------------------------------------------------------------------------------- /tasks/ranking_task.py: -------------------------------------------------------------------------------- 1 | from official.nlp.bert import tokenization 2 | import tensorflow as tf 3 | 4 | 5 | from official.nlp.tasks.tagging import TaggingTask 6 | from trainer.train_base import TrainBase 7 | from official.nlp.modeling.models import BertClassifier 8 | import os 9 | import json 10 | from data_processor.text_match_data_generator_v2 import TextMatchDataGeneratorV2 11 | from official.nlp.modeling.networks import BertEncoder 12 | from official.modeling import tf_utils 13 | from official.nlp.bert import configs as bert_configs 14 | from models.ranking import Ranking 15 | import numpy as np 16 | 17 | 18 | 19 | class RankingTask(TrainBase): 20 | ''' 21 | 基于bert的分类任务 22 | ''' 23 | def __init__(self, task_config): 24 | self.config = task_config 25 | self.loss = 'loss' 26 | super(RankingTask, self).__init__(task_config) 27 | self.data_generator = TextMatchDataGeneratorV2(task_config) 28 | 29 | 30 | def build_model(self): 31 | ''' 32 | 构建模型 33 | ''' 34 | # encoder_network = encoders.build_encoder(encoders.EncoderConfig( 35 | # bert=encoders.BertEncoderConfig(vocab_size=21128))) 36 | encoder_network = self.build_encoder() 37 | model = Ranking(network=encoder_network, config=self.config) 38 | 39 | return model 40 | 41 | def build_encoder(self): 42 | bert_config = bert_configs.BertConfig.from_json_file(self.config['bert_config_path']) 43 | cfg = bert_config 44 | bert_encoder = BertEncoder( 45 | vocab_size=cfg.vocab_size, 46 | hidden_size=cfg.hidden_size, 47 | 
num_layers=cfg.num_hidden_layers, 48 | num_attention_heads=cfg.num_attention_heads, 49 | intermediate_size=cfg.intermediate_size, 50 | activation=tf_utils.get_activation(cfg.hidden_act), 51 | dropout_rate=cfg.hidden_dropout_prob, 52 | attention_dropout_rate=cfg.attention_probs_dropout_prob, 53 | max_sequence_length=cfg.max_position_embeddings, 54 | type_vocab_size=cfg.type_vocab_size, 55 | initializer=tf.keras.initializers.TruncatedNormal( 56 | stddev=cfg.initializer_range), 57 | embedding_width=cfg.embedding_size, 58 | return_all_encoder_outputs=True) 59 | # ckpt = tf.train.Checkpoint(model=bert_encoder) 60 | # init_checkpoint = self.config['bert_model_path'] 61 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 62 | # bert_encoder.load_weights(init_checkpoint) 63 | return bert_encoder 64 | 65 | def lambda_rank_loss(self, scores, labels): 66 | ''' 67 | lambda rank损失 68 | ''' 69 | #delta_lambda计算 70 | rank = tf.range(1., tf.cast(self.config['num_samples'], dtype=tf.float32) + 1) 71 | rank = tf.tile(rank, [self.config['batch_size']]) 72 | rank = tf.reshape(rank, tf.shape(labels)) 73 | rel = 2 ** labels - 1 74 | sorted_label = tf.sort(labels, direction='DESCENDING') 75 | sorted_rel = 2 ** sorted_label - 1 76 | cg_discount = tf.math.log(1. + rank) 77 | dcg_m = rel / cg_discount 78 | dcg = tf.reduce_sum(dcg_m) 79 | stale_ij = dcg_m 80 | new_ij = rel / tf.transpose(cg_discount, perm=[0, 2, 1]) 81 | stale_ji = tf.transpose(stale_ij, perm=[0, 2, 1]) 82 | new_ji = tf.transpose(new_ij, perm=[0, 2, 1]) 83 | #new dcg 84 | dcg_new = dcg - stale_ij + new_ij - stale_ji + new_ji 85 | #delta dcg 86 | dcg_max = tf.reduce_sum(sorted_rel / cg_discount) 87 | ndcg_delta = tf.abs(dcg_new - dcg) / dcg_max 88 | 89 | # 90 | s_i_minus_s_j = scores - tf.transpose(scores, perm=[0, 2, 1]) 91 | #上三角矩阵 92 | mask1 = tf.linalg.band_part(ndcg_delta, 0, -1) 93 | #下三角矩阵 94 | mask2 = tf.linalg.band_part(s_i_minus_s_j, -1, 0) 95 | _loss = mask1 * tf.transpose(mask2, perm=[0, 2, 1]) 96 | loss = tf.reduce_sum(_loss) 97 | return loss 98 | 99 | 100 | def build_losses(self, labels, model_outputs, metrics, aux_losses=None) -> tf.Tensor: 101 | ''' 102 | 构建NDCG损失 103 | ''' 104 | def _ndcg(rank, relations): 105 | _dcg = [(np.power(2, relations[i]) - 1) / np.log2(rank[i] + 1) for i in range(len(relations))] 106 | _sort_similarity = sorted(relations, reverse=True) 107 | _idcg = [(np.power(2, _sort_similarity[i]) - 1) / np.log2(rank[i] + 1) for i in range(len(_sort_similarity))] 108 | _ndcg = tf.reduce_sum(_dcg) / tf.reduce_sum(_idcg) 109 | return _ndcg 110 | 111 | 112 | 113 | with tf.name_scope('TextMatchTask/lambdas'): 114 | # 构建ndcg损失 115 | tf.transpose(labels) 116 | y = tf.reshape(labels, [self.config['batch_size'], 1, self.config['num_samples']]) 117 | similarity = model_outputs['logits'] 118 | 119 | _relations = tf.keras.layers.Activation(tf.nn.sigmoid)(similarity) 120 | relations = tf.reshape(_relations[:, :, 1], tf.shape(y)) 121 | # rank = [i for i in range(1, self.config['num_samples']+1)] 122 | # _dcg = [(np.power(2,relations[i])-1) / np.log2(rank[i]+1) for i in range(len(relations))] 123 | # _sort_similarity = [sorted(item, reverse=True) for item in _dcg] 124 | # _idcg = [(tf.pow(2,r)-1) / (tf.math.log(rank+1)/tf.math.log(2)) for r in _sort_similarity] 125 | # _ndcg = tf.reduce_sum(_dcg) / tf.reduce_sum(_idcg) 126 | # ndcg = [_ndcg(rank, relations[i]) for i in range(len(relations))] 127 | 128 | # y = [_ndcg(rank, y[i]) for i in range(len(y))] 129 | metrics = dict([(metric.name, metric) for metric in metrics]) 
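# The commented-out sparse cross-entropy below is an earlier pointwise
# formulation kept for reference; the value actually returned is
# lambda_rank_loss, which weights the pairwise score differences s_i - s_j
# by the corresponding |delta NDCG| terms and sums them.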
130 | # losses = tf.keras.losses.sparse_categorical_crossentropy(y, 131 | # tf.cast(ndcg, tf.float32), 132 | # from_logits=True) 133 | 134 | loss = self.lambda_rank_loss(relations, y) 135 | 136 | return loss 137 | 138 | def build_inputs(self, inputs): 139 | ''' 140 | 构建输入 141 | ''' 142 | train_input = { 143 | "input_word_ids": tf.convert_to_tensor(inputs['input_word_ids']), 144 | "input_mask": tf.convert_to_tensor(inputs['input_mask']), 145 | "input_type_ids": tf.convert_to_tensor(inputs['input_type_ids']), 146 | "labels": inputs['input_target_ids'] 147 | } 148 | return train_input 149 | 150 | def train_step(self, 151 | inputs, 152 | model: tf.keras.Model, 153 | optimizer: tf.keras.optimizers.Optimizer, 154 | metrics=None): 155 | ''' 156 | 进行训练,前向和后向计算 157 | :param inputs: 158 | :param model: 159 | :param optimizer: 160 | :param metrics: 161 | :return: 162 | ''' 163 | 164 | with tf.GradientTape() as tape: 165 | outputs = model(inputs, training=True) 166 | loss = self.build_losses(inputs["labels"], outputs, metrics, aux_losses=None) 167 | 168 | tvars = model.trainable_variables 169 | grads = tape.gradient(loss, tvars) 170 | grads, _ = tf.clip_by_global_norm(grads, clip_norm=5.0) 171 | optimizer.apply_gradients(list(zip(grads, tvars))) 172 | labels = inputs['labels'] 173 | logs = {self.loss: loss} 174 | if metrics: 175 | self.process_metrics(metrics, tf.reshape(labels, (-1,1)), tf.reshape(outputs['predictions'], (-1,1))) 176 | logs.update({m.name: m.result() for m in model.metrics}) 177 | if model.compiled_metrics: 178 | self.process_compiled_metrics(model.compiled_metrics, tf.reshape(labels, (-1,1)), tf.reshape(outputs['predictions'], (-1,1))) 179 | logs.update({m.name: m.result() for m in metrics or []}) 180 | logs.update({m.name: m.result() for m in model.metrics}) 181 | return logs 182 | 183 | def validation_step(self, inputs, model: tf.keras.Model, metrics=None): 184 | ''' 185 | 验证集验证模型 186 | :param input: 187 | :param model: 188 | :return: 189 | ''' 190 | labels = inputs['labels'] 191 | outputs = self.inference_step(inputs, model) 192 | loss = self.build_losses(labels, outputs, metrics, aux_losses=model.losses) 193 | 194 | logs = {self.loss: loss} 195 | if metrics: 196 | self.process_metrics(metrics, tf.reshape(labels, (-1,1)), tf.reshape(outputs['predictions'], (-1,1))) 197 | if model.compiled_metrics: 198 | self.process_compiled_metrics(model.compiled_metrics, tf.reshape(labels, (-1,1)), tf.reshape(outputs['predictions'], (-1,1))) 199 | logs.update({m.name: m.result() for m in metrics or []}) 200 | logs.update({m.name: m.result() for m in model.metrics}) 201 | return logs 202 | 203 | def build_metrics(self, training=None): 204 | ''' 205 | 构建评价指标 206 | :param training: 207 | :return: 208 | ''' 209 | # del training 210 | metrics = [ 211 | tf.keras.metrics.SparseCategoricalAccuracy(name='text_match_metrics') 212 | ] 213 | 214 | return metrics 215 | 216 | def check_exist_model(self, model): 217 | ''' 218 | 检查是否存在模型文件 219 | :return: 220 | ''' 221 | # ckpt = tf.train.Checkpoint(models=models) 222 | init_checkpoint = os.path.join(self.config['ckpt_model_path'], self.config['model_name']) 223 | 224 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 225 | model.load_weights(init_checkpoint).assert_existing_objects_matched() 226 | 227 | 228 | if __name__=='__main__': 229 | with open("../model_configs/ranking.json", 'r') as fr: 230 | config = json.load(fr) 231 | print(config) 232 | Itr_pair = RankingTask(config) 233 | 234 | model = Itr_pair.build_model() 235 | bert_encoder = 
Itr_pair.build_encoder() 236 | ckpt = tf.train.Checkpoint(model=bert_encoder) 237 | init_checkpoint = config['bert_model_path'] 238 | ckpt.restore(init_checkpoint).assert_existing_objects_matched() 239 | # config = models.get_config() 240 | Itr_pair.train(model) 241 | # print(model.layers) 242 | 243 | 244 | -------------------------------------------------------------------------------- /data_processor/text_match_data_generator.py: -------------------------------------------------------------------------------- 1 | from data_processor.embedding import embedding 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | import os 6 | from random import shuffle 7 | 8 | class TextMatchDataGenerator(embedding): 9 | ''' 10 | 生成训练数据 11 | ''' 12 | def __init__(self, config): 13 | super(TextMatchDataGenerator, self).__init__(config) 14 | self.config = config 15 | self.batch_size = config['batch_size'] 16 | self.load_data() 17 | self.train_data, self.train_label, self.eval_data, self.eval_label = self.train_eval_split(self.query_word_idx, self.query_segment_idx, self.query_word_mask, self.query_sequence_length, \ 18 | self.sim_word_idx, self.sim_segment_idx, self.sim_word_mask, self.sim_sequence_length, self.labels_idx, 0.2) 19 | 20 | def read_data(self, file_path, data_size=100): 21 | ''' 22 | 加载训练数据 23 | ''' 24 | df = pd.read_csv(file_path) 25 | # query = [jieba.lcut(i) for i in df['sentence1'].values[0:data_size]] 26 | # sim = [jieba.lcut(i) for i in df['sentence2'].values[0:data_size]] 27 | query = [list(i) for i in df['sentence1'].values[0:data_size]] 28 | sim = [list(i) for i in df['sentence2'].values[0:data_size]] 29 | label = df['label'].values[0:data_size] 30 | 31 | return query, sim, label 32 | 33 | def save_input_tokens(self, query, sim, labels, label_to_index): 34 | ''' 35 | 保存处理完成的输入tokens,方便后续加载 36 | :param texts: 37 | :return: 38 | ''' 39 | 40 | query_word_ids, query_segment_ids, query_word_mask, query_sequence_length = [], [], [], [] 41 | sim_word_ids, sim_segment_ids, sim_word_mask, sim_sequence_length = [], [], [], [] 42 | 43 | label_ids = [] 44 | for i in range(len(query)): 45 | _query_word_ids, _query_segment_ids, _query_word_mask, _query_sequence_length = self.encode(query[i]) 46 | _sim_word_ids, _sim_segment_ids, _sim_word_mask, _sim_sequence_length = self.encode(sim[i]) 47 | 48 | query_word_ids.append(_query_word_ids) 49 | query_segment_ids.append(_query_segment_ids) 50 | query_word_mask.append(_query_word_mask) 51 | query_sequence_length.append(_query_sequence_length) 52 | 53 | sim_word_ids.append(_sim_word_ids) 54 | sim_segment_ids.append(_sim_segment_ids) 55 | sim_word_mask.append(_sim_word_mask) 56 | sim_sequence_length.append(_sim_sequence_length) 57 | 58 | label_id = self.labels_to_ids([labels[i]], label_to_index) 59 | label_ids.append(label_id) 60 | input_tokens = dict(query_word_ids=query_word_ids, query_segment_ids=query_segment_ids, query_word_mask=query_word_mask, 61 | query_sequence_length=query_sequence_length,sim_word_ids=sim_word_ids, 62 | sim_segment_ids=sim_segment_ids, sim_word_mask=sim_word_mask, 63 | sim_sequence_length=sim_sequence_length,labels_idx=label_ids) 64 | if not os.path.exists(self.config['output_path']): 65 | os.mkdir(self.config['output_path']) 66 | #保存准备训练的tokens数据 67 | with open(os.path.join(self.config['output_path'], 'train_tokens.pkl'), "wb") as fw: 68 | pickle.dump(input_tokens, fw) 69 | # 保存预处理的label_to_index数据 70 | with open(os.path.join(self.config['output_path'], 'label_to_index.pkl'), "wb") as fw: 71 | 
pickle.dump(label_to_index, fw) 72 | return query_word_ids, query_segment_ids, query_word_mask, query_sequence_length,\ 73 | sim_word_ids, sim_segment_ids, sim_word_mask, sim_sequence_length, label_ids 74 | 75 | def load_data(self): 76 | ''' 77 | 加载预处理好的数据 78 | :return: 79 | ''' 80 | 81 | if os.path.exists(os.path.join(self.config['output_path'], "train_tokens.pkl")) and \ 82 | os.path.exists(os.path.join(self.config['output_path'], "label_to_index.pkl")): 83 | print("load existed train data") 84 | # with open(os.path.join(self.config['output_path'], "word_to_index.pkl"), "rb") as f: 85 | # self.word_to_index = pickle.load(f) 86 | with open(os.path.join(self.config['output_path'], "label_to_index.pkl"), "rb") as f: 87 | self.label_to_index = pickle.load(f) 88 | with open(os.path.join(self.config['output_path'], "train_tokens.pkl"), "rb") as f: 89 | train_data = pickle.load(f) 90 | 91 | self.query_word_idx, self.query_segment_idx, self.query_word_mask, self.query_sequence_length, \ 92 | self.sim_word_idx, self.sim_segment_idx, self.sim_word_mask, self.sim_sequence_length, self.labels_idx = np.array(train_data["query_word_ids"]), \ 93 | np.array(train_data["query_segment_ids"]), \ 94 | np.array(train_data["query_word_mask"]), \ 95 | np.array(train_data["query_sequence_length"]), \ 96 | np.array(train_data["sim_word_ids"]), \ 97 | np.array(train_data["sim_segment_ids"]), \ 98 | np.array(train_data["sim_word_mask"]), \ 99 | np.array(train_data["sim_sequence_length"]), \ 100 | np.array(train_data["labels_idx"]) 101 | else: 102 | # 1,读取原始数据 103 | query, sim, labels = self.read_data(self.config['data_path']) 104 | print("read finished") 105 | 106 | label_to_index = self.label_to_index(labels) 107 | 108 | query_word_ids, query_segment_ids, query_word_mask, query_sequence_length, \ 109 | sim_word_ids, sim_segment_ids, sim_word_mask, sim_sequence_length, label_ids = self.save_input_tokens(query, sim, labels, label_to_index) 110 | print('text to tokens process finished') 111 | 112 | # train_data = dict(inputs_idx=inputs_idx, labels_idx=labels_idx) 113 | # with open(os.path.join(self.config['output_path'], "train_data.pkl"), "wb") as fw: 114 | # pickle.dump(train_data, fw) 115 | # labels_idx = labels 116 | self.query_word_idx, self.query_segment_idx, self.query_word_mask, self.query_sequence_length, \ 117 | self.sim_word_idx, self.sim_segment_idx, self.sim_word_mask, self.sim_sequence_length,self.labels_idx = query_word_ids, query_segment_ids, query_word_mask, query_sequence_length,\ 118 | sim_word_ids, sim_segment_ids, sim_word_mask, sim_sequence_length, label_ids 119 | 120 | def train_eval_split(self, query_word_ids, query_segment_ids, query_word_mask, query_sequence_length, 121 | sim_word_ids, sim_segment_ids, sim_word_mask, sim_sequence_length, labels, rate): 122 | 123 | split_index = int(len(query_word_ids) * rate) 124 | train_data = (query_word_ids[split_index:], query_segment_ids[split_index:], query_word_mask[split_index:], 125 | query_sequence_length[split_index:], sim_word_ids[split_index:], sim_segment_ids[split_index:], 126 | sim_word_mask[split_index:], sim_sequence_length[split_index:]) 127 | train_label = labels[split_index:] 128 | eval_data = (query_word_ids[:split_index], query_segment_ids[:split_index], query_word_mask[:split_index], 129 | query_sequence_length[:split_index], sim_word_ids[:split_index], sim_segment_ids[:split_index], 130 | sim_word_mask[:split_index], sim_sequence_length[:split_index]) 131 | eval_label = labels[:split_index] 132 | 133 | return train_data, 
train_label, eval_data, eval_label 134 | 135 | def gen_data(self, inputs_idx, labels_idx): 136 | ''' 137 | 生成批次数据 138 | :return: 139 | ''' 140 | query_word_ids, query_segment_ids, query_word_mask, query_sequence_length, \ 141 | sim_word_ids, sim_segment_ids, sim_word_mask, sim_sequence_length = inputs_idx[0], inputs_idx[1],inputs_idx[2],\ 142 | inputs_idx[3],inputs_idx[4],inputs_idx[5],\ 143 | inputs_idx[6],inputs_idx[7] 144 | batch_word_ids_a, batch_segment_ids_a, batch_word_mask_a, batch_sequence_length_a, \ 145 | batch_word_ids_b, batch_segment_ids_b, batch_word_mask_b, batch_sequence_length_b, batch_output_ids= [], [], [], [], [], [], [], [], [] 146 | 147 | for i in range(len(query_word_ids)): 148 | batch_word_ids_a.append(query_word_ids[i]) 149 | batch_segment_ids_a.append(query_segment_ids[i]) 150 | batch_word_mask_a.append(query_word_mask[i]) 151 | batch_sequence_length_a.append(query_sequence_length[i]) 152 | 153 | batch_word_ids_b.append(sim_word_ids[i]) 154 | batch_segment_ids_b.append(sim_segment_ids[i]) 155 | batch_word_mask_b.append(sim_word_mask[i]) 156 | batch_sequence_length_b.append(sim_sequence_length[i]) 157 | 158 | batch_output_ids.append(labels_idx[i]) 159 | 160 | 161 | if len(batch_output_ids) == self.batch_size: 162 | yield dict( 163 | input_word_ids_a=np.array(batch_word_ids_a, dtype="int32"), 164 | input_mask_a=np.array(batch_word_mask_a, dtype="int32"), 165 | input_type_ids_a=np.array(batch_segment_ids_a, dtype="int32"), 166 | input_word_ids_b=np.array(batch_word_ids_b, dtype="int32"), 167 | input_mask_b=np.array(batch_word_mask_b, dtype="int32"), 168 | input_type_ids_b=np.array(batch_segment_ids_b, dtype="int32"), 169 | input_target_ids=np.array(batch_output_ids, dtype="float32") 170 | ) 171 | batch_word_ids_a, batch_segment_ids_a, batch_word_mask_a, batch_sequence_length_a, \ 172 | batch_word_ids_b, batch_segment_ids_b, batch_word_mask_b, batch_sequence_length_b, batch_output_ids = [], [], [], [], [], [], [], [], [] 173 | 174 | --------------------------------------------------------------------------------
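The generator above only yields padded batches; each task turns them into model inputs via build_inputs and drives optimization via train_step. Below is a minimal sketch of that flow for the distillation task, not a file in the repository: it assumes the paths in distill_bert.json (data_path, output_path, bert_config_path, ckpt_model_path, ...) point at valid local files, and that driving the loop by hand approximates what TrainBase.train does internally.

import json

from tasks.distillation_task import DistillTask

with open("../model_configs/distill_bert.json", "r") as fr:
    config = json.load(fr)

task = DistillTask(config)        # also builds the TextMatchDataGenerator
model = task.build_model()        # Distill_model wrapping a SimBert teacher
optimizer = task.get_optimizer()  # adam / rmsprop / sgd, chosen by the config
metrics = task.build_metrics()

# One pass over the training split: gen_data -> build_inputs -> train_step.
gen = task.data_generator
for batch in gen.gen_data(gen.train_data, gen.train_label):
    inputs = task.build_inputs(batch)
    logs = task.train_step(inputs, model, optimizer, metrics=metrics)
    print({name: float(value) for name, value in logs.items()})

task.save_ckpt_model(model)       # writes weights under ckpt_model_path/model_name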