├── models ├── __init__.py ├── ranking.py ├── sentence_embedding.py ├── sim_bert.py ├── knowledge_distiilation.py └── model_base.py ├── tasks ├── __init__.py ├── classifier.py ├── embedding_task.py ├── Itr_pair_task.py ├── ner_task.py ├── distillation_task.py └── ranking_task.py ├── bert_service ├── __init__.py ├── docker_start.sh ├── model_saving_utils.py └── embedding_serving.py ├── model_configs ├── __init__.py ├── sentence_embedding.json ├── bert_ner.json ├── sim_bert.json ├── ranking.json ├── classifier.json └── distill_bert.json ├── data_processor ├── embedding_data_generator.py ├── __pycache__ │ ├── embedding.cpython-36.pyc │ ├── embedding.cpython-37.pyc │ ├── tokenizer.cpython-36.pyc │ ├── tokenizer.cpython-37.pyc │ ├── base_processor.cpython-36.pyc │ ├── base_processor.cpython-37.pyc │ ├── ner_data_generator.cpython-36.pyc │ ├── ner_data_generator.cpython-37.pyc │ ├── classifier_data_generator.cpython-36.pyc │ ├── classifier_data_generator.cpython-37.pyc │ ├── text_match_data_generator.cpython-36.pyc │ └── text_match_data_generator.cpython-37.pyc ├── embedding.py ├── base_processor.py ├── classifier_data_generator.py ├── ner_data_generator.py ├── text_match_data_generator_v2.py ├── tokenizer.py └── text_match_data_generator.py ├── requirements.txt ├── model └── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── sim_bert.cpython-37.pyc │ ├── model_base.cpython-37.pyc │ └── sentence_embedding.cpython-37.pyc ├── trainer ├── __pycache__ │ ├── train_base.cpython-36.pyc │ └── train_base.cpython-37.pyc └── train_base.py ├── predictor ├── __pycache__ │ └── predict_base.cpython-36.pyc ├── predict_base.py └── predict.py └── README.md /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bert_service/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model_configs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_processor/embedding_data_generator.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==2.4.0 2 | tf-models-official==2.4.0 3 | jieba 4 | gensim 5 | pandas -------------------------------------------------------------------------------- /model/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dextroushands/pretraind_model_for_nlp_tasks/HEAD/model/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /model/__pycache__/sim_bert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dextroushands/pretraind_model_for_nlp_tasks/HEAD/model/__pycache__/sim_bert.cpython-37.pyc 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pretraind_model_for_nlp_tasks 2 | 3 | Builds NLP tasks on top of pretrained models. The work is organized into four main tasks: 4 | 5 | 1. sentence embedding: sentence vector representation 6 | 7 | 2. classifier: text classification 8 | 9 | 3. text match: text matching 10 | 11 | 4. ner: named entity recognition 12 | -------------------------------------------------------------------------------- /model_configs/sentence_embedding.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "sentence_embedding", 3 | "seq_len": 100, 4 | "pooled_output_size": 256, 5 | "vocab_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/vocab.txt", 6 | "bert_config_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_config.json", 7 | "bert_model_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_model.ckpt-1", 8 | "output_path": 
"../output_path/sentence_embedding" 9 | } -------------------------------------------------------------------------------- /bert_service/docker_start.sh: -------------------------------------------------------------------------------- 1 | docker stop new_serve 2 | docker rm new_serve 3 | docker rmi my_img 4 | docker run -d --name serving_base tensorflow/serving:2.4.1 5 | 6 | docker cp /Users/donruo/Desktop/project/bert_tasks/chinese_wwm_ext_L-12_H-768_A-12/serve/versions/ serving_base:/models/my_model 7 | 8 | docker commit --change "ENV MODEL_NAME my_model" serving_base my_img 9 | docker stop serving_base 10 | docker rm serving_base 11 | 12 | docker run --name new_serve -p 8501:8501 -p 8500:8500 my_img 13 | -------------------------------------------------------------------------------- /model_configs/bert_ner.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "bert_ner", 3 | "learning_rate": 1e-3, 4 | "epoches": 10, 5 | "batch_size": 8, 6 | "optimizer": "adam", 7 | "multi_label": 0, 8 | "tag_categories": 9, 9 | "seq_len": 50, 10 | "dropout_rate": 0.2, 11 | "vocab_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/vocab.txt", 12 | "bert_config_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_config.json", 13 | "bert_model_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_model.ckpt-1", 14 | "output_path": "../output_path/ner", 15 | "ckpt_model_path": "output_path/ckpt_model/bert_ner", 16 | "export_model_path": "output_path/export_model", 17 | "data_path": "/Users/donruo/Desktop/project/nlp_models/corpus/pd2014/", 18 | "test_data": "/Users/donruo/Desktop/project/search_algorithm/query_understand/short_text_classify/data/news/test.tsv" 19 | } -------------------------------------------------------------------------------- /model_configs/sim_bert.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "simbert", 3 | "learning_rate": 1e-7, 4 | "epoches": 10, 5 | "batch_size": 8, 6 | "optimizer": "adam", 7 | "multi_label": 0, 8 | "neg_threshold": 0.4, 9 | "freq_filter": 1, 10 | "seq_len": 50, 11 | "dropout_rate": 0.2, 12 | "vocab_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/vocab.txt", 13 | "bert_config_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_config.json", 14 | "bert_model_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_model.ckpt-1", 15 | "output_path": "../output_path/sim", 16 | "ckpt_model_path": "output_path/ckpt_model/simbert", 17 | "export_model_path": "output_path/export_model", 18 | "data_path": "/Users/donruo/Desktop/project/text_match/data/train.csv", 19 | "test_data": "/Users/donruo/Desktop/project/search_algorithm/query_understand/short_text_classify/data/news/test.tsv" 20 | } -------------------------------------------------------------------------------- /model_configs/ranking.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "ranking", 3 | "learning_rate": 1e-5, 4 | "epoches": 10, 5 | "batch_size": 2, 6 | "optimizer": "adam", 7 | "multi_label": 0, 8 | "num_classes": 2, 9 | "embedding_size": 300, 10 | "seq_len": 128, 11 | "dropout_rate": 0.2, 12 | "num_samples": 7, 13 | "vocab_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/vocab.txt", 14 | "bert_config_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_config.json", 15 | "bert_model_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_model.ckpt-1", 16 | "output_path": "../output_path/ranking", 17 | "ckpt_model_path": 
"output_path/ckpt_model/ranking", 18 | "export_model_path": "output_path/export_model", 19 | "data_path": "/Users/donruo/Desktop/project/QA/data/标准FAQ.xlsx", 20 | "test_data": "/Users/donruo/Desktop/project/search_algorithm/query_understand/short_text_classify/data/news/test.tsv" 21 | } -------------------------------------------------------------------------------- /model_configs/classifier.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "bert_classifier", 3 | "learning_rate": 1e-5, 4 | "epoches": 10, 5 | "batch_size": 8, 6 | "optimizer": "adam", 7 | "multi_label": 0, 8 | "num_classes": 15, 9 | "embedding_size": 300, 10 | "seq_len": 128, 11 | "dropout_rate": 0.2, 12 | "vocab_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/vocab.txt", 13 | "bert_config_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_config.json", 14 | "bert_model_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_model.ckpt-1", 15 | "output_path": "../output_path", 16 | "ckpt_model_path": "output_path/ckpt_model/bert_classifier", 17 | "export_model_path": "output_path/export_model", 18 | "data_path": "/Users/donruo/Desktop/project/search_algorithm/query_understand/short_text_classify/data/news/train.tsv", 19 | "test_data": "/Users/donruo/Desktop/project/search_algorithm/query_understand/short_text_classify/data/news/test.tsv" 20 | } -------------------------------------------------------------------------------- /model_configs/distill_bert.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "distill_model", 3 | "learning_rate": 1e-7, 4 | "epoches": 10, 5 | "batch_size": 8, 6 | "optimizer": "adam", 7 | "multi_label": 0, 8 | "neg_threshold": 0.4, 9 | "freq_filter": 1, 10 | "seq_len": 50, 11 | "dropout_rate": 0.2, 12 | "use_word2vec": 0, 13 | "t": 2, 14 | "alpha": 0.4, 15 | "embedding_size": 128, 16 | "hidden_size": 64, 17 | "output_size": 128, 18 | "is_training": 1, 19 | "vocab_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/vocab.txt", 20 | "bert_config_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_config.json", 21 | "bert_model_path": "../chinese_wwm_ext_L-12_H-768_A-12/v2/bert_model.ckpt-1", 22 | "output_path": "../output_path/distill", 23 | "ckpt_model_path": "output_path/ckpt_model/distill_bert", 24 | "export_model_path": "output_path/export_model", 25 | "data_path": "/Users/donruo/Desktop/project/text_match/data/train.csv", 26 | "test_data": "/Users/donruo/Desktop/project/search_algorithm/query_understand/short_text_classify/data/news/test.tsv" 27 | } -------------------------------------------------------------------------------- /models/ranking.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from official.nlp.modeling import layers 3 | from official.nlp.modeling import networks 4 | 5 | 6 | class Ranking(tf.keras.Model): 7 | ''' 8 | bert的排序模型 9 | ''' 10 | def __init__(self, config, network, **kwargs): 11 | self.config = config 12 | # 定义模型输入 13 | word_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids') 14 | mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask') 15 | type_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids') 16 | input = [word_ids, mask, type_ids] 17 | 18 | _output = network(input) 19 | classifier = networks.Classification( 20 | input_width=_output[1].shape[-1], 21 | num_classes=self.config['num_classes'], 22 | output='logits', 23 
| name='sentence_prediction') 24 | _logits = classifier(_output[1]) #[batch_size*samples_num, 1] 25 | logits = tf.split(_logits, num_or_size_splits=self.config['batch_size'], axis=0) 26 | _relations = tf.keras.layers.Activation(tf.nn.sigmoid)(logits) 27 | predictions = tf.reshape(tf.argmax(_relations), [-1]) 28 | outputs = dict(logits=logits, predictions=predictions) 29 | super(Ranking, self).__init__(inputs=input, outputs=outputs, **kwargs) 30 | -------------------------------------------------------------------------------- /predictor/predict_base.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import os 3 | import pickle 4 | import json 5 | from data_processor.tokenizer import tokenizer 6 | from models.model_base import BaseModel 7 | 8 | 9 | class BasePredictor(BaseModel): 10 | ''' 11 | 构建预测的基础对象 12 | ''' 13 | def __init__(self, config): 14 | self.tokenizer = tokenizer(config) 15 | super(BasePredictor, self).__init__(config) 16 | 17 | def load_ckpt_model(self, model, path, model_name): 18 | ''' 19 | 加载ckpt模型 20 | :param model_path: 21 | :return: 22 | ''' 23 | # models = self.create_model() 24 | path = '../'+os.path.join(path, model_name) 25 | model.load_weights(path) 26 | # ckpt = tf.train.Checkpoint(model=model) 27 | # init_checkpoint = path 28 | # 29 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 30 | 31 | return model 32 | 33 | def create_model(self): 34 | ''' 35 | 创建模型 36 | :return: 37 | ''' 38 | raise NotImplemented 39 | 40 | def load_vocab(self): 41 | ''' 42 | 加载词典 43 | :return: 44 | ''' 45 | raise NotImplemented 46 | 47 | def predict(self, sentence): 48 | ''' 49 | 预测句子结果 50 | :param sentence: 51 | :return: 52 | ''' 53 | raise NotImplemented 54 | -------------------------------------------------------------------------------- /models/sentence_embedding.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from official.nlp.keras_nlp.encoders.bert_encoder import BertEncoder 4 | 5 | class SentenceEmbedding(tf.keras.Model): 6 | ''' 7 | 句子向量 8 | ''' 9 | def __init__(self, 10 | encoder_network: tf.keras.Model, 11 | # sequence_length, 12 | config = None, 13 | **kwargs): 14 | # self.encoder_network = encoder_network 15 | self.config = config 16 | # self.sequence_length = sequence_length 17 | 18 | # sequence_length = tf.keras.Input(shape=(None,), dtype=tf.int32, name='seqence_length') 19 | sequence_length = self.config['seq_len'] 20 | inputs = encoder_network.inputs 21 | outputs = encoder_network(inputs) 22 | if isinstance(outputs, list): 23 | sequence_output = outputs[0][-1] 24 | cls_output = outputs[1] 25 | encoder_outputs = outputs[0] 26 | else: 27 | sequence_output = outputs['sequence_output'] 28 | cls_output = outputs['pooled_output'] 29 | encoder_outputs = outputs['encoder_outputs'] 30 | 31 | #取第一层和最后一层的均值作为句子embedding 32 | # if isinstance(sequence_length, int): 33 | first_layer_outputs = encoder_outputs[0][:, :sequence_length, :] 34 | last_layer_outputs = encoder_outputs[-1][:, :sequence_length, :] 35 | average = (first_layer_outputs + last_layer_outputs) / 2.0 36 | sentence_embedding = tf.reduce_mean(average, axis=1) 37 | # else: 38 | # sentence_embedding = [] 39 | # for i in range(self.config['batch_size']): 40 | # first_layer_outputs = encoder_outputs[0][:, :sequence_length[i], :] 41 | # last_layer_outputs = encoder_outputs[-1][:, :sequence_length[i], :] 42 | # average = (first_layer_outputs + last_layer_outputs) / 2.0 43 | 
# sentence_embedding.append(tf.reduce_mean(average, axis=1)) 44 | _pooler_layer = tf.keras.layers.Dense( 45 | units=self.config['pooled_output_size'], 46 | activation='tanh', 47 | kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), 48 | name='pooler_transform') 49 | outputs = _pooler_layer(sentence_embedding) 50 | 51 | super(SentenceEmbedding, self).__init__(inputs=inputs, outputs=outputs, **kwargs) -------------------------------------------------------------------------------- /bert_service/model_saving_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Utilities to save models.""" 16 | 17 | import os 18 | 19 | from absl import logging 20 | import tensorflow as tf 21 | import typing 22 | 23 | 24 | def export_bert_model(model_export_path: typing.Text, 25 | model: tf.keras.Model, 26 | checkpoint_dir: typing.Optional[typing.Text] = None, 27 | restore_model_using_load_weights: bool = False) -> None: 28 | """Export BERT model for serving which does not include the optimizer. 29 | 30 | Args: 31 | model_export_path: Path to which exported model will be saved. 32 | model: Keras model object to export. 33 | checkpoint_dir: Path from which model weights will be loaded, if 34 | specified. 35 | restore_model_using_load_weights: Whether to use checkpoint.restore() API 36 | for custom checkpoint or to use model.load_weights() API. There are 2 37 | different ways to save checkpoints. One is using tf.train.Checkpoint and 38 | another is using Keras model.save_weights(). Custom training loop 39 | implementation uses tf.train.Checkpoint API and Keras ModelCheckpoint 40 | callback internally uses model.save_weights() API. Since these two API's 41 | cannot be used toghether, model loading logic must be take into account 42 | how model checkpoint was saved. 43 | 44 | Raises: 45 | ValueError when either model_export_path or model is not specified. 46 | """ 47 | if not model_export_path: 48 | raise ValueError('model_export_path must be specified.') 49 | if not isinstance(model, tf.keras.Model): 50 | raise ValueError('model must be a tf.keras.Model object.') 51 | 52 | if checkpoint_dir: 53 | if restore_model_using_load_weights: 54 | model_weight_path = os.path.join(checkpoint_dir, 'checkpoint') 55 | assert tf.io.gfile.exists(model_weight_path) 56 | model.load_weights(model_weight_path) 57 | else: 58 | checkpoint = tf.train.Checkpoint(model=model) 59 | 60 | # Restores the model from latest checkpoint. 
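      # tf.train.latest_checkpoint() reads the "checkpoint" state file inside
      # checkpoint_dir and returns the prefix of the most recently written
      # checkpoint, or None if no checkpoint state is present.
      # Checkpoint.restore() matches variables through the object graph rooted
      # at `model`; assert_existing_objects_matched() raises if variables that
      # already exist in that graph were not found in the checkpoint, which
      # surfaces architecture/checkpoint mismatches early.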
61 | latest_checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) 62 | assert latest_checkpoint_file 63 | logging.info('Checkpoint file %s found and restoring from ' 64 | 'checkpoint', latest_checkpoint_file) 65 | checkpoint.restore( 66 | latest_checkpoint_file).assert_existing_objects_matched() 67 | 68 | model.save(model_export_path, include_optimizer=False, save_format='tf') 69 | -------------------------------------------------------------------------------- /trainer/train_base.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from models.model_base import BaseModel 4 | 5 | 6 | class TrainBase(BaseModel): 7 | ''' 8 | 模型训练基础 9 | ''' 10 | def __init__(self, train_config): 11 | self.epoches = train_config['epoches'] 12 | self.data_generator = None 13 | 14 | super(TrainBase, self).__init__(train_config) 15 | 16 | def train(self, model): 17 | ''' 18 | 训练过程 19 | :return: 20 | ''' 21 | model.summary() 22 | optimizer = self.get_optimizer() 23 | metrics = self.build_metrics() 24 | batch_num = 0 25 | valid_loss = 0 26 | best_acc = 0 27 | mean_acc = 0 28 | for i in range(self.epoches): 29 | print("------------start train epoch {}--------------------".format(i)) 30 | for train_batch in self.data_generator.gen_data(self.data_generator.train_data, self.data_generator.train_label): 31 | train_input = self.build_inputs(train_batch) 32 | train_loss = self.train_step(train_input, model, optimizer, metrics) 33 | print(train_loss) 34 | batch_num += 1 35 | 36 | if batch_num % 3 == 0: 37 | print("------------start validation epoch {}--------------".format(i)) 38 | count = 0 39 | sum_acc = 0 40 | for valid_batch in self.data_generator.gen_data(self.data_generator.eval_data, self.data_generator.eval_label): 41 | count += 1 42 | valid_input = self.build_inputs(valid_batch) 43 | valid_loss = self.validation_step(valid_input, model, metrics=metrics) 44 | print("accuracy: {}".format(metrics[0].result().numpy())+'\n') 45 | sum_acc += metrics[0].result().numpy() 46 | mean_acc = sum_acc/count 47 | if mean_acc > best_acc: 48 | best_acc = mean_acc 49 | # print('save models') 50 | self.save_ckpt_model(model) 51 | self.save_pb_model(model) 52 | 53 | def fit_train(self, model): 54 | ''' 55 | 使用fit训练模型 56 | :return: 57 | ''' 58 | optimizer = self.get_optimizer() 59 | metrics = self.build_metrics() 60 | 61 | model = self.compile_model(model, 62 | optimizer=optimizer, 63 | train_step=self.train_step, 64 | validation_step=self.validation_step, 65 | metrics=metrics) 66 | model.summary() 67 | dataset = self.data_generator.gen_data(self.data_generator.train_data, self.data_generator.train_label) 68 | valid_data = self.data_generator.gen_data(self.data_generator.eval_data, self.data_generator.eval_label) 69 | # dataset = dataset.repeat() 70 | # valid_data = valid_data.repeat() 71 | # dataset = self.build_inputs(data_) 72 | logs = model.fit(dataset, epochs=2, steps_per_epoch=3, validation_data=valid_data, validation_steps=1) 73 | # self.assertIn("loss", logs.history) 74 | # self.assertIn("accuracy", logs.history) 75 | 76 | -------------------------------------------------------------------------------- /models/sim_bert.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | # from __future__ import google_type_annotations 4 | from __future__ import print_function 5 | 6 | import tensorflow as tf 7 | from official.nlp.modeling import layers 8 | 
from official.nlp.modeling import networks 9 | 10 | 11 | class SimBert(tf.keras.Model): 12 | """ 13 | bert句子相似度模型 14 | """ 15 | 16 | def __init__(self, 17 | network, 18 | config, 19 | initializer='glorot_uniform', 20 | dropout_rate=0.1, 21 | ): 22 | self._self_setattr_tracking = False 23 | self._network = network 24 | self._config = { 25 | 'network': network, 26 | 'initializer': initializer, 27 | } 28 | self.config = config 29 | #定义两个句子的输入 30 | # 定义输入 31 | word_ids_a = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids_a') 32 | mask_a = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask_a') 33 | type_ids_a = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids_a') 34 | word_ids_b = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids_b') 35 | mask_b = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask_b') 36 | type_ids_b = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids_b') 37 | input_a = [word_ids_a, mask_a, type_ids_a] 38 | input_b = [word_ids_b, mask_b, type_ids_b] 39 | 40 | #计算encoder 41 | outputs_a = network.predict_step(input_a) 42 | outputs_b = network.predict_step(input_b) 43 | 44 | cls_output_a = outputs_a[1] 45 | query_embedding_output = tf.keras.layers.Dropout(rate=dropout_rate)(cls_output_a) 46 | 47 | cls_output_b = outputs_b[1] 48 | sim_query_embedding_output = tf.keras.layers.Dropout(rate=dropout_rate)(cls_output_b) 49 | 50 | # 余弦函数计算相似度 51 | # cos_similarity余弦相似度[batch_size, similarity] 52 | query_norm = tf.sqrt(tf.reduce_sum(tf.square(query_embedding_output), axis=-1), name='query_norm') 53 | sim_query_norm = tf.sqrt(tf.reduce_sum(tf.square(sim_query_embedding_output), axis=-1), name='sim_query_norm') 54 | 55 | dot = tf.reduce_sum(tf.multiply(query_embedding_output, sim_query_embedding_output), axis=-1) 56 | cos_similarity = tf.divide(dot, (query_norm * sim_query_norm), name='cos_similarity') 57 | self.similarity = cos_similarity 58 | 59 | # 预测为正例的概率 60 | cond = (self.similarity > self.config["neg_threshold"]) 61 | pos = tf.where(cond, tf.square(self.similarity), 1 - tf.square(self.similarity)) 62 | neg = tf.where(cond, 1 - tf.square(self.similarity), tf.square(self.similarity)) 63 | predictions = [[neg[i], pos[i]] for i in range(self.config['batch_size'])] 64 | 65 | self.logits = self.similarity 66 | outputs = dict(logits=self.logits, predictions=predictions) 67 | 68 | super(SimBert, self).__init__(inputs=[input_a, input_b], outputs=outputs) 69 | 70 | @property 71 | def checkpoint_items(self): 72 | return dict(encoder=self._network) 73 | 74 | def get_config(self): 75 | return self._config 76 | 77 | @classmethod 78 | def from_config(cls, config, custom_objects=None): 79 | return cls(**config) -------------------------------------------------------------------------------- /data_processor/embedding.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | import os 3 | 4 | from data_processor.tokenizer import tokenizer 5 | import numpy as np 6 | import h5py 7 | import logging 8 | from collections import Counter 9 | import pandas as pd 10 | from itertools import chain 11 | from gensim import corpora, models 12 | import gensim 13 | logger = logging.getLogger(__name__) 14 | 15 | class embedding(tokenizer): 16 | ''' 17 | 文本向量化 18 | ''' 19 | def __init__(self, embedding_config): 20 | self.config = embedding_config 21 | super(embedding, self).__init__(embedding_config) 22 | 23 | def 
load_word2vec_model(self): 24 | ''' 25 | 加载word2vec模型 26 | :return: 27 | ''' 28 | model_path = self.config.get('word2vec_path') 29 | if not os.path.exists(model_path): 30 | raise Exception("model_path did not exit, please check path") 31 | model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=False) 32 | return model 33 | 34 | def load_bert_base(self): 35 | ''' 36 | 加载bert_base模型 37 | ''' 38 | model_path = self.config['bert_model_path'] 39 | 40 | def get_word_vectors(self, tokens): 41 | ''' 42 | 获取词向量 43 | :param tokens: 44 | :return: 45 | ''' 46 | features = [] 47 | embedding_size = self.config['embedding_size'] 48 | word_vectors = np.zeros(embedding_size).tolist() 49 | model = self.load_word2vec_model() 50 | for word in tokens: 51 | if word in model.index2word: 52 | features.append(model.word_vec(word)) 53 | else: 54 | features.append(word_vectors) 55 | print("{} is not in vocabulary!".format(word)) 56 | # print(features) 57 | return features 58 | 59 | def save_vectors(self, vectors, name): 60 | ''' 61 | 保存向量到文件中 62 | :param vectors: 63 | :return: 64 | ''' 65 | file_path = os.path.join(self.config['output_path'], name + '.npy') 66 | np.save(file_path, vectors) 67 | 68 | @staticmethod 69 | def trans_to_tf_idf(inputs, dictionary, tf_idf_model): 70 | vocab_size = len(dictionary) 71 | input_ids = [] 72 | for question in inputs: 73 | # question_ids = [] 74 | # for question in questions: 75 | bow_vec = dictionary.doc2bow(question) 76 | tfidf_vec = tf_idf_model[bow_vec] 77 | vec = [0] * vocab_size 78 | for item in tfidf_vec: 79 | vec[item[0]] = item[1] 80 | # question_ids.append(vec) 81 | input_ids.append(vec) 82 | return input_ids 83 | 84 | @staticmethod 85 | def train_tf_idf(inputs): 86 | sentences = inputs 87 | dictionary = corpora.Dictionary(sentences) 88 | corpus = [dictionary.doc2bow(sentence) for sentence in sentences] 89 | tfidf_model = models.TfidfModel(corpus) 90 | return dictionary, tfidf_model 91 | 92 | def get_one_hot_vectors(self, tokens): 93 | ''' 94 | 获取one-hot向量 95 | :param tokens: 96 | :return: 97 | ''' 98 | raise NotImplemented 99 | 100 | def get_tf_idf_vectors(self, tokens): 101 | ''' 102 | 获取tf-idf向量 103 | :param tokens: 104 | :return: 105 | ''' 106 | raise NotImplemented 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /data_processor/base_processor.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 数据预处理的基础对象 3 | ''' 4 | import os 5 | import jieba 6 | from collections import Counter 7 | import jieba.posseg as pseg 8 | import pandas as pd 9 | 10 | class data_base(object): 11 | ''' 12 | 中文文本处理的基础组件 13 | ''' 14 | def __init__(self, data_config): 15 | self.config = data_config 16 | 17 | 18 | @staticmethod 19 | def read_data(path): 20 | ''' 21 | 读取数据集 22 | :param path: 23 | :return: text, label 24 | ''' 25 | texts = [] 26 | labels = [] 27 | with open(path, "rb", encoding='utf8') as f: 28 | for line in f.readlines(): 29 | text, label = line.strip().split(' ') 30 | texts.append(text.strip()) 31 | labels.append(label.strip()) 32 | return texts, labels 33 | 34 | 35 | @staticmethod 36 | def _read_data(path): 37 | """ 38 | 读取多标签数据 39 | :return: 返回分词后的文本内容和标签,inputs = [[]], labels = [[]] 40 | """ 41 | inputs = [] 42 | labels = [] 43 | train_data = pd.read_csv(path, error_bad_lines=False, sep='\t') 44 | print(train_data.columns) 45 | print(train_data.head(2)) 46 | inputs = train_data['text_a'].values.tolist()[:100] 47 | labels = 
train_data['label'].values.tolist()[:100] 48 | labels = [str(label) for label in labels] 49 | # inputs = [list(i) for i in inputs] 50 | 51 | return inputs, labels 52 | 53 | def get_all_words(self, tokens): 54 | ''' 55 | 对已经分词的数据直接获取所有词 56 | :param tokens: 57 | :return: 58 | ''' 59 | all_words = [] 60 | [all_words.extend(i) for i in tokens] 61 | return all_words 62 | 63 | def cut_words(self, texts): 64 | ''' 65 | 分词 66 | :param text: 67 | :return: 68 | ''' 69 | all_words = [] 70 | for text in texts: 71 | words = jieba.lcut(text) 72 | all_words.extend(words) 73 | return all_words 74 | 75 | def cut_chars(self, texts): 76 | ''' 77 | 将文本分割成字 78 | :param text: 79 | :return: 80 | ''' 81 | all_chars = [] 82 | for text in texts: 83 | chars = list(text) 84 | all_chars.extend(chars) 85 | return all_chars 86 | 87 | def word_pos_filter(self, pos_filter, text): 88 | ''' 89 | 根据词性过滤文本 90 | :param pos: ['nr'...] 91 | :param text: 92 | :return: 93 | ''' 94 | words = [] 95 | pos_text = pseg.lcut(text) 96 | for word, pos in pos_text: 97 | if pos not in pos_filter: 98 | words.append(word) 99 | return words 100 | 101 | def word_freq_filter(self, freq, all_words): 102 | ''' 103 | 词频过滤 104 | :param freq: 105 | :return: 106 | ''' 107 | print(all_words) 108 | word_count = Counter(all_words) # 统计词频 109 | sort_word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True) 110 | 111 | # 去除低频词 112 | words = [item[0] for item in sort_word_count if item[1] >= freq] 113 | return words 114 | 115 | def get_vocab(self, all_words): 116 | ''' 117 | 获取词列表 118 | :param all_words: 119 | :return: 120 | ''' 121 | word_count = Counter(all_words) # 统计词频 122 | sort_word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True) 123 | vocab = [item[0] for item in sort_word_count] 124 | 125 | return vocab 126 | 127 | def remove_stop_words(self, all_words): 128 | ''' 129 | 去除停用词 130 | :param all_words: 131 | :return: 132 | ''' 133 | stop_words = self.load_stop_words(self.config['stop_word_path']) 134 | words = [word for word in all_words if word not in stop_words] 135 | return words 136 | 137 | def load_stop_words(self, stop_word_path): 138 | ''' 139 | 加载停用词表 140 | :param stop_word_path: 141 | :return: 142 | ''' 143 | with open(stop_word_path, "r", encoding="utf8") as fr: 144 | stop_words = [line.strip() for line in fr.readlines()] 145 | return stop_words 146 | 147 | -------------------------------------------------------------------------------- /predictor/predict.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import os 3 | import json 4 | import pickle 5 | import sys 6 | sys.path.append(os.path.abspath(os.path.dirname(os.getcwd()))) 7 | 8 | import numpy as np 9 | from models.sentence_embedding import SentenceEmbedding 10 | from tasks.classifier import ClassifierTask 11 | from tasks.ner_task import NERTask 12 | from tasks.Itr_pair_task import ItrTask 13 | 14 | 15 | from predictor.predict_base import BasePredictor 16 | import pandas as pd 17 | 18 | class Predictor(BasePredictor): 19 | ''' 20 | 预测类 21 | ''' 22 | def __init__(self, config): 23 | 24 | self.config = config 25 | super(Predictor, self).__init__(config) 26 | 27 | self.word_to_index = None 28 | self.label_to_index = None 29 | self.word_vectors = None 30 | self.vocab_size = None 31 | 32 | self.load_vocab() 33 | #创建模型并加载参数 34 | self.model = self.create_model() 35 | self.model = self.load_ckpt_model(self.model, self.config['ckpt_model_path'], self.config['model_name']) 36 | 37 | 38 | def 
create_model(self): 39 | ''' 40 | 创建模型 41 | :return: 42 | ''' 43 | model = None 44 | if self.config['model_name'] == 'bert_classifier': 45 | model = ClassifierTask(self.config).build_model() 46 | 47 | if self.config['model_name'] == 'bert_ner': 48 | model = NERTask(self.config).build_model() 49 | 50 | if self.config['model_name'] == 'simbert': 51 | model = ItrTask(self.config).build_model() 52 | 53 | return model 54 | 55 | def load_vocab(self): 56 | ''' 57 | 加载词典 58 | :return: 59 | ''' 60 | # with open(os.path.join(self.config['output_path'], "word_to_index.pkl"), "rb") as f: 61 | # self.word_to_index = pickle.load(f) 62 | with open(os.path.join(self.config['output_path'], "label_to_index.pkl"), "rb") as f: 63 | self.label_to_index = pickle.load(f) 64 | 65 | # if self.config['use_word2vec']: 66 | # if os.path.exists(os.path.join(self.config['output_path'], "word_vectors.npy")): 67 | # print("load word_vectors") 68 | # self.word_vectors = np.load(os.path.join(self.config['output_path'], "word_vectors.npy"), 69 | # allow_pickle=True) 70 | 71 | def predict(self, sentence): 72 | ''' 73 | 句子预测 74 | :param sentence:list 75 | :return: 76 | ''' 77 | word_ids, segment_ids, word_mask, sequence_length = [], [], [], [] 78 | 79 | _word_ids, _segment_ids, _word_mask, _sequence_length = self.tokenizer.encode(sentence) 80 | word_ids.append(_word_ids) 81 | segment_ids.append(_segment_ids) 82 | word_mask.append(_word_mask) 83 | sequence_length.append(_sequence_length) 84 | inputs = dict( 85 | input_word_ids=word_ids, 86 | input_mask=word_mask, 87 | input_type_ids=segment_ids, 88 | ) 89 | 90 | infer_input = { 91 | "input_word_ids": tf.convert_to_tensor(inputs['input_word_ids']), 92 | "input_mask": tf.convert_to_tensor(inputs['input_mask']), 93 | "input_type_ids": tf.convert_to_tensor(inputs['input_type_ids']), 94 | } 95 | logits = self.model(infer_input, training=False) 96 | predictions = self.get_predictions(logits) 97 | label = self.tokenizer.ids_to_tokens(predictions, self.label_to_index) 98 | return label 99 | 100 | def sequence_predict(self, sentence): 101 | ''' 102 | 序列标注预测 103 | :param sentence: 104 | :return: 105 | ''' 106 | word_ids, segment_ids, word_mask, sequence_length = [], [], [], [] 107 | 108 | _word_ids, _segment_ids, _word_mask, _sequence_length = self.tokenizer.encode(sentence) 109 | word_ids.append(_word_ids) 110 | segment_ids.append(_segment_ids) 111 | word_mask.append(_word_mask) 112 | sequence_length.append(_sequence_length) 113 | inputs = dict( 114 | input_word_ids=word_ids, 115 | input_mask=word_mask, 116 | input_type_ids=segment_ids, 117 | ) 118 | 119 | infer_input = { 120 | "input_word_ids": tf.convert_to_tensor(inputs['input_word_ids']), 121 | "input_mask": tf.convert_to_tensor(inputs['input_mask']), 122 | "input_type_ids": tf.convert_to_tensor(inputs['input_type_ids']), 123 | } 124 | outputs = self.model(infer_input, training=False) 125 | # decode_results = outputs.numpy().tolist() 126 | predictions = self.get_predictions(outputs)[0][1:_sequence_length] 127 | 128 | label = self.tokenizer.ids_to_tokens(predictions, self.label_to_index) 129 | return label 130 | 131 | 132 | 133 | if __name__=='__main__': 134 | with open("../model_configs/bert_ner.json", 'r') as fr: 135 | config = json.load(fr) 136 | predictor = Predictor(config) 137 | test_data = pd.read_csv(config['test_data'], error_bad_lines=False, sep='\t') 138 | inputs = test_data['text_a'].values.tolist()[:10] 139 | labels = test_data['label'].values.tolist()[:10] 140 | labels = [str(label) for label in labels] 141 | 
predictions = [] 142 | count = 0 143 | 144 | for i,sentence in enumerate(inputs): 145 | prediction = predictor.sequence_predict(sentence) 146 | # prediction = predictor.predict(sentence) 147 | print(prediction) 148 | if prediction[0] == labels[i]: 149 | print(sentence) 150 | count += 1 151 | predictions.extend(prediction) 152 | # print(predictions) 153 | print(count/100) 154 | print(inputs[5]) 155 | -------------------------------------------------------------------------------- /tasks/classifier.py: -------------------------------------------------------------------------------- 1 | from official.nlp.bert import tokenization 2 | import tensorflow as tf 3 | from official.nlp.configs import bert 4 | from official.nlp.configs import encoders 5 | from official.nlp.data import pretrain_dataloader 6 | 7 | from official.nlp.tasks.tagging import TaggingTask 8 | from trainer.train_base import TrainBase 9 | from official.nlp.modeling.models import BertClassifier 10 | import os 11 | import json 12 | from data_processor.classifier_data_generator import ClassifierDataGenerator 13 | from official.nlp.modeling.networks import BertEncoder 14 | from official.modeling import tf_utils 15 | from official.nlp.bert import configs as bert_configs 16 | 17 | 18 | 19 | class ClassifierTask(TrainBase): 20 | ''' 21 | 基于bert的分类任务 22 | ''' 23 | def __init__(self, task_config): 24 | self.config = task_config 25 | self.loss = 'loss' 26 | super(ClassifierTask, self).__init__(task_config) 27 | self.data_generator = ClassifierDataGenerator(task_config) 28 | 29 | 30 | def build_model(self): 31 | ''' 32 | 构建模型 33 | ''' 34 | # encoder_network = encoders.build_encoder(encoders.EncoderConfig( 35 | # bert=encoders.BertEncoderConfig(vocab_size=21128))) 36 | encoder_network = self.build_encoder() 37 | 38 | 39 | 40 | model = BertClassifier(network=encoder_network, 41 | num_classes=self.config['num_classes']) 42 | # ckpt = tf.train.Checkpoint(models=models) 43 | 44 | # init_checkpoint = self.config['bert_model_path'] 45 | 46 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 47 | 48 | # models.load_weights(init_checkpoint).assert_existing_objects_matched() 49 | return model 50 | 51 | def build_encoder(self): 52 | bert_config = bert_configs.BertConfig.from_json_file(self.config['bert_config_path']) 53 | cfg = bert_config 54 | bert_encoder = BertEncoder( 55 | vocab_size=cfg.vocab_size, 56 | hidden_size=cfg.hidden_size, 57 | num_layers=cfg.num_hidden_layers, 58 | num_attention_heads=cfg.num_attention_heads, 59 | intermediate_size=cfg.intermediate_size, 60 | activation=tf_utils.get_activation(cfg.hidden_act), 61 | dropout_rate=cfg.hidden_dropout_prob, 62 | attention_dropout_rate=cfg.attention_probs_dropout_prob, 63 | max_sequence_length=cfg.max_position_embeddings, 64 | type_vocab_size=cfg.type_vocab_size, 65 | initializer=tf.keras.initializers.TruncatedNormal( 66 | stddev=cfg.initializer_range), 67 | embedding_width=cfg.embedding_size, 68 | return_all_encoder_outputs=True) 69 | # ckpt = tf.train.Checkpoint(model=bert_encoder) 70 | # init_checkpoint = self.config['bert_model_path'] 71 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 72 | # bert_encoder.load_weights(init_checkpoint) 73 | return bert_encoder 74 | 75 | def build_losses(self, labels, model_outputs, metrics, aux_losses=None) -> tf.Tensor: 76 | ''' 77 | 构建损失 78 | ''' 79 | if self.config['num_classes'] > 1: 80 | losses = tf.keras.losses.sparse_categorical_crossentropy(labels, 81 | tf.cast(model_outputs, tf.float32), 82 | from_logits=True) 83 
| else: 84 | losses = tf.keras.losses.categorical_crossentropy(labels, 85 | tf.cast(model_outputs, tf.float32), 86 | from_logits=True 87 | ) 88 | # metrics['losses'].update_state(losses) 89 | loss = tf.reduce_mean(losses) 90 | 91 | return loss 92 | 93 | def build_inputs(self, inputs): 94 | ''' 95 | 构建输入 96 | ''' 97 | train_input = { 98 | "input_word_ids": tf.convert_to_tensor(inputs['input_word_ids']), 99 | "input_mask": tf.convert_to_tensor(inputs['input_mask']), 100 | "input_type_ids": tf.convert_to_tensor(inputs['input_type_ids']), 101 | "labels": inputs['input_target_ids'] 102 | } 103 | return train_input 104 | 105 | def build_metrics(self, training=None): 106 | ''' 107 | 构建评价指标 108 | :param training: 109 | :return: 110 | ''' 111 | # del training 112 | metrics = [ 113 | tf.keras.metrics.SparseCategoricalAccuracy(name='classifier_metrics') 114 | ] 115 | 116 | # metrics = dict([(metric.name, metric) for metric in metrics]) 117 | 118 | return metrics 119 | 120 | def check_exist_model(self, model): 121 | ''' 122 | 检查是否存在模型文件 123 | :return: 124 | ''' 125 | # ckpt = tf.train.Checkpoint(models=models) 126 | init_checkpoint = os.path.join(self.config['ckpt_model_path'], self.config['model_name']) 127 | 128 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 129 | model.load_weights(init_checkpoint).assert_existing_objects_matched() 130 | 131 | 132 | if __name__=='__main__': 133 | with open("../model_configs/classifier.json", 'r') as fr: 134 | config = json.load(fr) 135 | print(config) 136 | classifier = ClassifierTask(config) 137 | 138 | model = classifier.build_model() 139 | bert_encoder = classifier.build_encoder() 140 | ckpt = tf.train.Checkpoint(model=bert_encoder) 141 | init_checkpoint = config['bert_model_path'] 142 | ckpt.restore(init_checkpoint).assert_existing_objects_matched() 143 | # config = models.get_config() 144 | classifier.train(model) 145 | 146 | 147 | -------------------------------------------------------------------------------- /tasks/embedding_task.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow_addons.layers import crf 3 | 4 | import json 5 | import os 6 | from trainer.train_base import TrainBase 7 | from data_processor.classifier_data_generator import ClassifierDataGenerator 8 | from data_processor.ner_data_generator import NERDataGenerator 9 | 10 | # from official.nlp.keras_nlp.encoders.bert_encoder import BertEncoder 11 | from official.nlp.modeling.networks import BertEncoder 12 | from official.modeling import tf_utils 13 | from official.nlp.bert import configs as bert_configs 14 | from data_processor.tokenizer import tokenizer 15 | from official.nlp.configs import encoders 16 | import dataclasses 17 | from official.modeling.hyperparams import base_config 18 | from official.core import base_task 19 | from official.core import config_definitions as cfg 20 | from official.core import task_factory 21 | from typing import List, Optional, Tuple 22 | 23 | from models.sentence_embedding import SentenceEmbedding 24 | import requests 25 | import numpy as np 26 | 27 | @dataclasses.dataclass 28 | class ModelConfig(base_config.Config): 29 | """A base span labeler configuration.""" 30 | encoder: encoders.EncoderConfig = encoders.EncoderConfig() 31 | head_dropout: float = 0.1 32 | head_initializer_range: float = 0.02 33 | 34 | 35 | @dataclasses.dataclass 36 | class embeddingConfig(cfg.TaskConfig): 37 | """The models config.""" 38 | # At most one of `init_checkpoint` and 
`hub_module_url` can be specified. 39 | init_checkpoint: str = '' 40 | hub_module_url: str = '' 41 | model: ModelConfig = ModelConfig() 42 | 43 | # The real class names, the order of which should match real label id. 44 | # Note that a word may be tokenized into multiple word_pieces tokens, and 45 | # we asssume the real label id (non-negative) is assigned to the first token 46 | # of the word, and a negative label id is assigned to the remaining tokens. 47 | # The negative label id will not contribute to loss and metrics. 48 | class_names: Optional[List[str]] = None 49 | train_data: cfg.DataConfig = cfg.DataConfig() 50 | validation_data: cfg.DataConfig = cfg.DataConfig() 51 | 52 | class EmbeddingTask(object): 53 | ''' 54 | 抽取句子向量任务 55 | ''' 56 | def __init__(self, config): 57 | self.config = config 58 | 59 | def build_model(self): 60 | ''' 61 | 构建模型 62 | ''' 63 | # encoder_network = encoders.build_encoder(encoders.EncoderConfig(bert=encoders.BertEncoderConfig(vocab_size=21128, 64 | # num_layers=1))) 65 | bert_config = bert_configs.BertConfig.from_json_file(self.config['bert_config_path']) 66 | cfg = bert_config 67 | bert_encoder = BertEncoder( 68 | vocab_size=cfg.vocab_size, 69 | hidden_size=cfg.hidden_size, 70 | num_layers=cfg.num_hidden_layers, 71 | num_attention_heads=cfg.num_attention_heads, 72 | intermediate_size=cfg.intermediate_size, 73 | activation=tf_utils.get_activation(cfg.hidden_act), 74 | dropout_rate=cfg.hidden_dropout_prob, 75 | attention_dropout_rate=cfg.attention_probs_dropout_prob, 76 | max_sequence_length=cfg.max_position_embeddings, 77 | type_vocab_size=cfg.type_vocab_size, 78 | initializer=tf.keras.initializers.TruncatedNormal( 79 | stddev=cfg.initializer_range), 80 | embedding_width=cfg.embedding_size, 81 | return_all_encoder_outputs=True) 82 | model = SentenceEmbedding(bert_encoder, self.config) 83 | ckpt = tf.train.Checkpoint(model=bert_encoder) 84 | init_checkpoint = self.config['bert_model_path'] 85 | 86 | ckpt.restore(init_checkpoint).assert_existing_objects_matched() 87 | 88 | return model 89 | 90 | def build_inputs(self, text): 91 | ''' 92 | 构建输入 93 | ''' 94 | tokenize = tokenizer(self.config) 95 | 96 | batch_token_ids, batch_segment_ids, batch_mask, batch_seq_len = [], [], [], [] 97 | word_ids, segment_ids, word_mask, seq_len = tokenize.encode(text) 98 | word_ids = np.array(word_ids, dtype="float32").tolist() 99 | segment_ids = np.array(segment_ids, dtype="float32").tolist() 100 | word_mask = np.array(word_mask, dtype="float32").tolist() 101 | batch_token_ids.append(word_ids) 102 | batch_segment_ids.append(segment_ids) 103 | batch_mask.append(word_mask) 104 | batch_seq_len.append(seq_len) 105 | inputs = dict( 106 | input_word_ids=word_ids, 107 | input_mask=word_mask, 108 | input_type_ids=segment_ids, 109 | ) 110 | 111 | infer_input = { 112 | "input_word_ids": tf.convert_to_tensor(inputs['input_word_ids']), 113 | "input_mask": tf.convert_to_tensor(inputs['input_mask']), 114 | "input_type_ids": tf.convert_to_tensor(inputs['input_type_ids']), 115 | } 116 | 117 | return inputs, infer_input, tf.reshape(tf.convert_to_tensor(batch_seq_len), shape=(-1,)) 118 | 119 | 120 | def inference_one(self, text): 121 | ''' 122 | 推理一条数据 123 | ''' 124 | inputs, infer_inputs, seq_len = self.build_inputs(text) 125 | # model = self.build_model() 126 | # outputs = model(infer_inputs) 127 | data = json.dumps({"signature_name": "serving_default", "inputs":inputs['input_word_ids'], 128 | }) 129 | headers = {"content-type": "application/json"} 130 | json_response = 
requests.post('http://localhost:8501/v1/models/my_model:predict', 131 | data=data, headers=headers) 132 | outputs = json.loads(json_response.text) 133 | print(outputs) 134 | return outputs 135 | 136 | if __name__=='__main__': 137 | with open("../model_configs/sentence_embedding.json", 'r') as fr: 138 | config = json.load(fr) 139 | print(config) 140 | embedding = EmbeddingTask(config) 141 | text = '你好' 142 | result = embedding.inference_one(text) 143 | print(result) 144 | -------------------------------------------------------------------------------- /data_processor/classifier_data_generator.py: -------------------------------------------------------------------------------- 1 | from data_processor.embedding import embedding 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | import os 6 | 7 | 8 | class ClassifierDataGenerator(embedding): 9 | ''' 10 | 生成训练数据 11 | ''' 12 | def __init__(self, config): 13 | super(ClassifierDataGenerator, self).__init__(config) 14 | self.config = config 15 | self.batch_size = config['batch_size'] 16 | self.load_data() 17 | self.train_data, self.train_label, self.eval_data, self.eval_label = self.train_eval_split(self.word_ids, 18 | self.segment_ids, 19 | self.word_mask, 20 | self.sequence_length, 21 | self.labels_idx, 0.2) 22 | 23 | def load_data(self): 24 | ''' 25 | 加载预处理好的数据 26 | :return: 27 | ''' 28 | 29 | if os.path.exists(os.path.join(self.config['output_path'], "train_tokens.pkl")) and \ 30 | os.path.exists(os.path.join(self.config['output_path'], "label_to_index.pkl")): 31 | print("load existed train data") 32 | # with open(os.path.join(self.config['output_path'], "word_to_index.pkl"), "rb") as f: 33 | # self.word_to_index = pickle.load(f) 34 | with open(os.path.join(self.config['output_path'], "label_to_index.pkl"), "rb") as f: 35 | self.label_to_index = pickle.load(f) 36 | with open(os.path.join(self.config['output_path'], "train_tokens.pkl"), "rb") as f: 37 | train_data = pickle.load(f) 38 | 39 | if os.path.exists(os.path.join(self.config['output_path'], "word_vectors.npy")): 40 | print("load word_vectors") 41 | self.word_vectors = np.load(os.path.join(self.config['output_path'], "word_vectors.npy"), 42 | allow_pickle=True) 43 | 44 | self.word_ids, self.segment_ids, self.word_mask, self.sequence_length, self.labels_idx = np.array(train_data["word_ids"]), \ 45 | np.array(train_data["segment_ids"]),\ 46 | np.array(train_data["word_mask"]),\ 47 | np.array(train_data["sequence_length"]),\ 48 | np.array(train_data["labels_idx"]) 49 | 50 | # self.vocab = self.word_to_index.keys() 51 | # self.vocab_size = len(self.vocab) 52 | else: 53 | # 1,读取原始数据 54 | inputs, labels = self._read_data(self.config['data_path']) 55 | print("read finished") 56 | 57 | # 选择分词方式 58 | # if self.config['embedding_type'] == 'char': 59 | # all_words = self.cut_chars(inputs) 60 | # else: 61 | # all_words = self.cut_words(inputs) 62 | # word_to_index = self.word_to_index(all_words) 63 | label_to_index = self.label_to_index(labels) 64 | 65 | word_ids, segment_ids, word_mask, sequence_length, label_ids = self.save_input_tokens(inputs, labels, label_to_index) 66 | print('text to tokens process finished') 67 | 68 | # # 2,得到去除低频词和停用词的词汇表 69 | # word_to_index, all_words = self.word_to_index(inputs) 70 | # print("word process finished") 71 | # 72 | # # 3,得到词汇表 73 | # label_to_index = self.label_to_index(labels) 74 | # print("vocab process finished") 75 | # 76 | # # 4,输入转索引 77 | # inputs_idx = [self.tokens_to_ids(text, word_to_index) for text in all_words] 78 | # print("index 
transform finished") 79 | # 80 | # # 5,对输入做padding 81 | # inputs_idx = self.padding(inputs_idx) 82 | # print("padding finished") 83 | # 84 | # # 6,标签转索引 85 | # labels_idx = self.tokens_to_ids(labels, label_to_index) 86 | # print("label index transform finished") 87 | 88 | # 7, 加载词向量 89 | # if self.config['word2vec_path']: 90 | # word_vectors = self.get_word_vectors(self.vocab) 91 | # self.word_vectors = word_vectors 92 | # 将本项目的词向量保存起来 93 | # self.save_vectors(self.word_vectors, 'word_vectors') 94 | 95 | # train_data = dict(inputs_idx=inputs_idx, labels_idx=labels_idx) 96 | # with open(os.path.join(self.config['output_path'], "train_data.pkl"), "wb") as fw: 97 | # pickle.dump(train_data, fw) 98 | # labels_idx = labels 99 | self.word_ids, self.segment_ids, self.word_mask, self.sequence_length, self.labels_idx = word_ids, segment_ids, word_mask, sequence_length, label_ids 100 | 101 | 102 | def train_eval_split(self, word_ids, segment_ids, word_mask, sequence_length, labels, rate): 103 | ''' 104 | 划分训练和验证集 105 | :param data: 106 | :param labels: 107 | :param rate: 108 | :return: 109 | ''' 110 | # np.random.shuffle(data) 111 | perm = int(len(word_ids) * rate) 112 | train_data = (word_ids[perm:], segment_ids[perm:], word_mask[perm:], sequence_length[perm:]) 113 | eval_data = (word_ids[:perm], segment_ids[:perm], word_mask[:perm], sequence_length[:perm]) 114 | train_label = labels[perm:] 115 | eval_label = labels[:perm] 116 | return train_data, train_label, eval_data, eval_label 117 | 118 | 119 | def gen_data(self, input_idx, labels_idx): 120 | ''' 121 | 生成批次数据 122 | :return: 123 | ''' 124 | word_ids, segment_ids, word_mask, sequence_length = input_idx[0], input_idx[1], input_idx[2], input_idx[3] 125 | batch_word_ids, batch_segment_ids, batch_word_mask, batch_sequence_length, batch_output_ids = [], [], [], [], [] 126 | 127 | for i in range(len(word_ids)): 128 | word_id = word_ids[i] 129 | segment_id = segment_ids[i] 130 | mask = word_mask[i] 131 | seq_len = sequence_length[i] 132 | target_ids = labels_idx[i] 133 | batch_word_ids.append(word_id) 134 | batch_segment_ids.append(segment_id) 135 | batch_word_mask.append(mask) 136 | batch_sequence_length.append(seq_len) 137 | batch_output_ids.extend(target_ids) 138 | 139 | if len(batch_word_ids) == self.batch_size: 140 | yield dict( 141 | input_word_ids=np.array(batch_word_ids, dtype="int64"), 142 | input_mask=np.array(batch_word_mask, dtype="int64"), 143 | input_type_ids=np.array(batch_segment_ids, dtype="int64"), 144 | sequence_length=np.array(batch_sequence_length, dtype="int64"), 145 | input_target_ids=np.array(batch_output_ids, dtype="float32") 146 | ) 147 | batch_word_ids, batch_segment_ids, batch_word_mask, batch_sequence_length, batch_output_ids = [], [], [], [], [] 148 | 149 | -------------------------------------------------------------------------------- /models/knowledge_distiilation.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from official.nlp.modeling import layers 3 | from official.nlp.modeling import networks 4 | 5 | 6 | class Distill_model(tf.keras.Model): 7 | ''' 8 | 使用dssm进行知识蒸馏 9 | ''' 10 | def __init__(self, 11 | config, 12 | teacher_network, 13 | vocab_size, 14 | word_vectors, 15 | **kwargs): 16 | self.config = config 17 | self.vocab_size = vocab_size 18 | self.word_vectors = word_vectors 19 | #冻结teacher network的参数 20 | for layer in teacher_network.layers: 21 | layer.trainable = False 22 | #定义学生模型输入 23 | query = tf.keras.layers.Input(shape=(None,), 
dtype=tf.int64, name='input_x_ids') 24 | sim_query = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='input_y_ids') 25 | #定义老师模型输入 26 | word_ids_a = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids_a') 27 | mask_a = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask_a') 28 | type_ids_a = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids_a') 29 | word_ids_b = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids_b') 30 | mask_b = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask_b') 31 | type_ids_b = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids_b') 32 | input_a = [word_ids_a, mask_a, type_ids_a] 33 | input_b = [word_ids_b, mask_b, type_ids_b] 34 | teacher_input = [input_a, input_b] 35 | 36 | #teacher_softlabel 37 | teacher_output = teacher_network(teacher_input) 38 | 39 | teacher_soft_label = softmax_t(self.config['t'], teacher_output['logits']) 40 | 41 | # embedding层 42 | # 利用词嵌入矩阵将输入数据转成词向量,shape=[batch_size, seq_len, embedding_size] 43 | class GatherLayer(tf.keras.layers.Layer): 44 | def __init__(self, config, vocab_size, word_vectors): 45 | super(GatherLayer, self).__init__() 46 | self.config = config 47 | 48 | self.vocab_size = vocab_size 49 | self.word_vectors = word_vectors 50 | 51 | def build(self, input_shape): 52 | with tf.name_scope('embedding'): 53 | if not self.config['use_word2vec']: 54 | self.embedding_w = tf.Variable(tf.keras.initializers.glorot_normal()( 55 | shape=[self.vocab_size, self.config['embedding_size']], 56 | dtype=tf.float32), trainable=True, name='embedding_w') 57 | else: 58 | self.embedding_w = tf.Variable(tf.cast(self.word_vectors, tf.float32), trainable=True, 59 | name='embedding_w') 60 | self.built = True 61 | 62 | def call(self, inputs, **kwargs): 63 | return tf.gather(self.embedding_w, inputs, name='embedded_words') 64 | 65 | def get_config(self): 66 | config = super(GatherLayer, self).get_config() 67 | 68 | return config 69 | 70 | 71 | shared_net = tf.keras.Sequential([GatherLayer(config, vocab_size, word_vectors), 72 | shared_lstm_layer(config)]) 73 | 74 | query_embedding_output = shared_net.predict_step(query) 75 | sim_query_embedding_output = shared_net.predict_step(sim_query) 76 | 77 | 78 | # 余弦函数计算相似度 79 | # cos_similarity余弦相似度[batch_size, similarity] 80 | query_norm = tf.sqrt(tf.reduce_sum(tf.square(query_embedding_output), axis=-1), name='query_norm') 81 | sim_query_norm = tf.sqrt(tf.reduce_sum(tf.square(sim_query_embedding_output), axis=-1), name='sim_query_norm') 82 | 83 | dot = tf.reduce_sum(tf.multiply(query_embedding_output, sim_query_embedding_output), axis=-1) 84 | cos_similarity = tf.divide(dot, (query_norm * sim_query_norm), name='cos_similarity') 85 | self.similarity = cos_similarity 86 | 87 | # 预测为正例的概率 88 | cond = (self.similarity > self.config["neg_threshold"]) 89 | pos = tf.where(cond, tf.square(self.similarity), 1 - tf.square(self.similarity)) 90 | neg = tf.where(cond, 1 - tf.square(self.similarity), tf.square(self.similarity)) 91 | predictions = [[neg[i], pos[i]] for i in range(self.config['batch_size'])] 92 | 93 | self.logits = self.similarity 94 | student_soft_label = softmax_t(self.config['t'], self.logits) 95 | student_hard_label = self.logits 96 | if self.config['is_training']: 97 | #训练时候蒸馏 98 | outputs = dict(student_soft_label=student_soft_label, student_hard_label=student_hard_label, teacher_soft_label=teacher_soft_label, predictions=predictions) 99 | super(Distill_model,
self).__init__(inputs=[query, sim_query, teacher_input], outputs=outputs, **kwargs) 100 | else: 101 | #预测时候只加载学生模型 102 | outputs = dict(predictions=predictions) 103 | super(Distill_model, self).__init__(inputs=[query, sim_query], outputs=outputs, **kwargs) 104 | 105 | 106 | 107 | def softmax_t(t, logits): 108 | ''' 109 | 带参数t的softmax 110 | ''' 111 | _sum = tf.reduce_sum(tf.exp(logits/t)) 112 | return tf.exp(logits/t) / _sum 113 | 114 | class shared_lstm_layer(tf.keras.layers.Layer): 115 | ''' 116 | 共享lstm层参数 117 | ''' 118 | def __init__(self, config): 119 | self.config = config 120 | super(shared_lstm_layer, self).__init__() 121 | 122 | def build(self, input_shape): 123 | forward_layer_1 = tf.keras.layers.LSTM(self.config['hidden_size'], dropout=self.config['dropout_rate'], 124 | return_sequences=True) 125 | backward_layer_1 = tf.keras.layers.LSTM(self.config['hidden_size'], dropout=self.config['dropout_rate'], 126 | return_sequences=True, go_backwards=True) 127 | forward_layer_2 = tf.keras.layers.LSTM(self.config['hidden_size'], dropout=self.config['dropout_rate'], 128 | return_sequences=True) 129 | backward_layer_2 = tf.keras.layers.LSTM(self.config['hidden_size'], dropout=self.config['dropout_rate'], 130 | return_sequences=True, go_backwards=True) 131 | self.bilstm_1 = tf.keras.layers.Bidirectional(forward_layer_1, backward_layer=backward_layer_1) 132 | self.bilstm_2 = tf.keras.layers.Bidirectional(forward_layer_2, backward_layer=backward_layer_2) 133 | self.layer_dropout = tf.keras.layers.Dropout(0.4) 134 | self.output_dense = tf.keras.layers.Dense(self.config['output_size']) 135 | 136 | super(shared_lstm_layer, self).build(input_shape) 137 | 138 | def get_config(self): 139 | config = {} 140 | return config 141 | 142 | def call(self, inputs, **kwargs): 143 | query_res_1 = self.bilstm_1(inputs) 144 | query_res_1 = self.layer_dropout(query_res_1) 145 | query_res_2 = self.bilstm_2(query_res_1) 146 | 147 | #取时间步的平均值,摊平[batch_size, forward_size+backward_size] 148 | avg_query_embedding = tf.reduce_mean(query_res_2, axis=1) 149 | tmp_query_embedding = tf.reshape(avg_query_embedding, [self.config['batch_size'], self.config['hidden_size']*2]) 150 | # 全连接层[batch_size, dense_dim] 151 | query_embedding_output = self.output_dense(tmp_query_embedding) 152 | query_embedding_output = tf.keras.activations.relu(query_embedding_output) 153 | return query_embedding_output -------------------------------------------------------------------------------- /data_processor/ner_data_generator.py: -------------------------------------------------------------------------------- 1 | from data_processor.embedding import embedding 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | import os 6 | 7 | 8 | class NERDataGenerator(embedding): 9 | ''' 10 | 生成训练数据 11 | ''' 12 | def __init__(self, config): 13 | super(NERDataGenerator, self).__init__(config) 14 | self.config = config 15 | self.batch_size = config['batch_size'] 16 | self.load_data() 17 | self.train_data, self.train_label, self.eval_data, self.eval_label = self.train_eval_split(self.word_ids, 18 | self.segment_ids, 19 | self.word_mask, 20 | self.sequence_length, 21 | self.labels_idx, 0.2) 22 | 23 | def read_data(self, path): 24 | inputs = [] 25 | labels = [] 26 | with open(os.path.join(path, 'source_BIO_2014_cropus.txt'), 'r', encoding='utf-8') as fr: 27 | for line in fr.readlines(): 28 | inputs.append(line.split(sep=' ')) 29 | with open(os.path.join(path, 'target_BIO_2014_cropus.txt'), 'r', encoding='utf-8') as fr: 30 | for line in 
fr.readlines(): 31 | labels.append(line.split(sep=' ')) 32 | return inputs[:100], labels[:100] 33 | 34 | def get_labels(self): 35 | return ['O', 'B_LOC', 'I_LOC', 'B_PER', 'I_PER', 'B_ORG', 'I_ORG', 'B_T', 'I_T'] 36 | 37 | def save_input_tokens(self, texts, labels, label_to_index): 38 | ''' 39 | 保存处理完成的输入tokens,方便后续加载 40 | :param texts: 41 | :return: 42 | ''' 43 | word_ids, segment_ids, word_mask, sequence_length = [], [], [], [] 44 | label_ids = [] 45 | for i, text in enumerate(texts): 46 | _word_ids, _segment_ids, _word_mask, _sequence_length = self.encode(text) 47 | word_ids.append(_word_ids) 48 | segment_ids.append(_segment_ids) 49 | word_mask.append(_word_mask) 50 | sequence_length.append(_sequence_length) 51 | label_id = self.seq_labels_to_ids(labels[i], label_to_index) 52 | label_ids.append(label_id) 53 | input_tokens = dict(word_ids=word_ids, segment_ids=segment_ids, word_mask=word_mask, 54 | sequence_length=sequence_length, labels_idx=label_ids) 55 | if not os.path.exists(self.config['output_path']): 56 | os.mkdir(self.config['output_path']) 57 | # 保存准备训练的tokens数据 58 | with open(os.path.join(self.config['output_path'], 'train_tokens.pkl'), "wb") as fw: 59 | pickle.dump(input_tokens, fw) 60 | # 保存预处理的word_to_index数据 61 | # with open(os.path.join(self.config['output_path'], 'word_to_index.pkl'), "wb") as fw: 62 | # pickle.dump(word_to_index, fw) 63 | # 保存预处理的word_to_index数据 64 | with open(os.path.join(self.config['output_path'], 'label_to_index.pkl'), "wb") as fw: 65 | pickle.dump(label_to_index, fw) 66 | return word_ids, segment_ids, word_mask, sequence_length, label_ids 67 | 68 | def load_data(self): 69 | ''' 70 | 加载预处理好的数据 71 | :return: 72 | ''' 73 | 74 | if os.path.exists(os.path.join(self.config['output_path'], "train_tokens.pkl")) and \ 75 | os.path.exists(os.path.join(self.config['output_path'], "label_to_index.pkl")): 76 | print("load existed train data") 77 | # with open(os.path.join(self.config['output_path'], "word_to_index.pkl"), "rb") as f: 78 | # self.word_to_index = pickle.load(f) 79 | with open(os.path.join(self.config['output_path'], "label_to_index.pkl"), "rb") as f: 80 | self.label_to_index = pickle.load(f) 81 | with open(os.path.join(self.config['output_path'], "train_tokens.pkl"), "rb") as f: 82 | train_data = pickle.load(f) 83 | 84 | if os.path.exists(os.path.join(self.config['output_path'], "word_vectors.npy")): 85 | print("load word_vectors") 86 | self.word_vectors = np.load(os.path.join(self.config['output_path'], "word_vectors.npy"), 87 | allow_pickle=True) 88 | 89 | self.word_ids, self.segment_ids, self.word_mask, self.sequence_length, self.labels_idx = np.array(train_data["word_ids"]), \ 90 | np.array(train_data["segment_ids"]), \ 91 | np.array(train_data["word_mask"]), \ 92 | np.array(train_data["sequence_length"]), \ 93 | np.array(train_data["labels_idx"]) 94 | 95 | # self.vocab = self.word_to_index.keys() 96 | # self.vocab_size = len(self.vocab) 97 | else: 98 | # 1,读取原始数据 99 | inputs, labels = self.read_data(self.config['data_path']) 100 | print("read finished") 101 | targets = self.get_labels() 102 | label_to_index = self.label_to_index(targets) 103 | 104 | word_ids, segment_ids, word_mask, sequence_length, label_ids = self.save_input_tokens(inputs, labels, 105 | label_to_index) 106 | print('text to tokens process finished') 107 | 108 | 109 | self.word_ids, self.segment_ids, self.word_mask, self.sequence_length, self.labels_idx = word_ids, segment_ids, word_mask, sequence_length, label_ids 110 | 111 | def train_eval_split(self, word_ids, segment_ids, 
word_mask, sequence_length, labels, rate): 112 | ''' 113 | 划分训练和验证集 114 | :param data: 115 | :param labels: 116 | :param rate: 117 | :return: 118 | ''' 119 | # np.random.shuffle(data) 120 | perm = int(len(word_ids) * rate) 121 | train_data = (word_ids[perm:], segment_ids[perm:], word_mask[perm:], sequence_length[perm:]) 122 | eval_data = (word_ids[:perm], segment_ids[:perm], word_mask[:perm], sequence_length[:perm]) 123 | train_label = labels[perm:] 124 | eval_label = labels[:perm] 125 | return train_data, train_label, eval_data, eval_label 126 | 127 | 128 | def gen_data(self, input_idx, labels_idx): 129 | ''' 130 | 生成批次数据 131 | :return: 132 | ''' 133 | word_ids, segment_ids, word_mask, sequence_length = input_idx[0], input_idx[1], input_idx[2], input_idx[3] 134 | batch_word_ids, batch_segment_ids, batch_word_mask, batch_sequence_length, batch_output_ids = [], [], [], [], [] 135 | 136 | for i in range(len(word_ids)): 137 | word_id = word_ids[i] 138 | segment_id = segment_ids[i] 139 | mask = word_mask[i] 140 | seq_len = sequence_length[i] 141 | target_ids = labels_idx[i] 142 | batch_word_ids.append(word_id) 143 | batch_segment_ids.append(segment_id) 144 | batch_word_mask.append(mask) 145 | batch_sequence_length.append(seq_len) 146 | batch_output_ids.append(target_ids) 147 | 148 | if len(batch_word_ids) == self.batch_size: 149 | yield dict( 150 | input_word_ids=np.array(batch_word_ids, dtype="int64"), 151 | input_mask=np.array(batch_word_mask, dtype="int64"), 152 | input_type_ids=np.array(batch_segment_ids, dtype="int64"), 153 | sequence_length=np.array(batch_sequence_length, dtype="int64"), 154 | input_target_ids=np.array(batch_output_ids, dtype="float32") 155 | ) 156 | batch_word_ids, batch_segment_ids, batch_word_mask, batch_sequence_length, batch_output_ids = [], [], [], [], [] 157 | 158 | -------------------------------------------------------------------------------- /data_processor/text_match_data_generator_v2.py: -------------------------------------------------------------------------------- 1 | from data_processor.embedding import embedding 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | import os 6 | from random import shuffle 7 | import random 8 | import copy 9 | from itertools import chain 10 | 11 | class TextMatchDataGeneratorV2(embedding): 12 | ''' 13 | 生成训练数据 14 | ''' 15 | def __init__(self, config): 16 | super(TextMatchDataGeneratorV2, self).__init__(config) 17 | self.config = config 18 | self.batch_size = config['batch_size'] 19 | self.load_data() 20 | self.train_data, self.train_label, self.eval_data, self.eval_label = self.train_eval_split(self.word_idx, self.segment_idx, self.word_mask, self.sequence_length,self.labels_idx, 0.2) 21 | 22 | def read_data(self, file_path): 23 | ''' 24 | 加载训练数据 25 | ''' 26 | # df = pd.read_csv(file_path) 27 | # query = [jieba.lcut(i) for i in df['sentence1'].values[0:data_size]] 28 | # sim = [jieba.lcut(i) for i in df['sentence2'].values[0:data_size]] 29 | # query = [list(i) for i in df['sentence1'].values] 30 | # sim = [list(i) for i in df['sentence2'].values] 31 | # import pandas as pd 32 | work_data = pd.read_excel(file_path) 33 | std_query_list = work_data['standard_questions'].tolist() 34 | sim_query_list = work_data['sim_questions'].tolist() 35 | # std_answer_list = work_data['standard_answers'].tolist() 36 | sim = [] 37 | 38 | for i in range(len(std_query_list)): 39 | _sim = sim_query_list[i].split('||') 40 | sim.append(_sim) 41 | 42 | 43 | return std_query_list, sim 44 | 45 | def negative_sampling(self, queries, 
sim): 46 | ''' 47 | 随机负采样 48 | ''' 49 | new_queries = [] 50 | labels = [] 51 | for i, item in enumerate(queries): 52 | copy_questions = copy.copy(queries) 53 | copy_questions.remove(item) 54 | neg_samples = random.sample(copy_questions, 5) 55 | pos_samples = random.sample(sim[i], 2) 56 | new_queries.append([item] + pos_samples + neg_samples) 57 | labels.append([1]*2 + [0]*5) 58 | return new_queries, labels 59 | 60 | def save_ranking_tokens(self, queries, sim): 61 | ''' 62 | 保存处理完成的输入tokens,方便后续加载 63 | :param texts: 64 | :return: 65 | ''' 66 | 67 | word_ids, segment_ids, word_mask, sequence_length = [], [], [], [] 68 | word_ids_list, segment_ids_list, word_mask_list, sequence_length_list = [], [], [], [] 69 | new_queries, label_ids = self.negative_sampling(queries, sim) 70 | 71 | for j, questions in enumerate(new_queries): 72 | for i, query in enumerate(questions[1:]): 73 | 74 | _word_ids, _segment_ids, _word_mask, _sequence_length = self.encode_v2(query[0], query) 75 | 76 | word_ids.append(_word_ids) 77 | segment_ids.append(_segment_ids) 78 | word_mask.append(_word_mask) 79 | sequence_length.append(_sequence_length) 80 | 81 | word_ids_list.append(word_ids) 82 | segment_ids_list.append(segment_ids) 83 | word_mask_list.append(word_mask) 84 | sequence_length_list.append(sequence_length) 85 | word_ids, segment_ids, word_mask, sequence_length = [], [], [], [] 86 | 87 | 88 | # label_id = self.labels_to_ids([labels[i]], label_to_index) 89 | # label_ids_list.append(label_ids) 90 | input_tokens = dict(word_ids=word_ids_list, query_segment_ids=segment_ids_list, query_word_mask=word_mask_list, 91 | sequence_length=sequence_length_list,labels_idx=label_ids) 92 | if not os.path.exists(self.config['output_path']): 93 | os.mkdir(self.config['output_path']) 94 | #保存准备训练的tokens数据 95 | with open(os.path.join(self.config['output_path'], 'train_tokens.pkl'), "wb") as fw: 96 | pickle.dump(input_tokens, fw) 97 | # 保存预处理的label_to_index数据 98 | # with open(os.path.join(self.config['output_path'], 'label_to_index.pkl'), "wb") as fw: 99 | # pickle.dump(label_to_index, fw) 100 | return word_ids_list, segment_ids_list, word_mask_list, sequence_length_list, label_ids 101 | 102 | def load_data(self): 103 | ''' 104 | 加载预处理好的数据 105 | :return: 106 | ''' 107 | 108 | if os.path.exists(os.path.join(self.config['output_path'], "train_tokens.pkl")) or \ 109 | os.path.exists(os.path.join(self.config['output_path'], "label_to_index.pkl")): 110 | print("load existed train data") 111 | # with open(os.path.join(self.config['output_path'], "word_to_index.pkl"), "rb") as f: 112 | # self.word_to_index = pickle.load(f) 113 | # with open(os.path.join(self.config['output_path'], "label_to_index.pkl"), "rb") as f: 114 | # self.label_to_index = pickle.load(f) 115 | with open(os.path.join(self.config['output_path'], "train_tokens.pkl"), "rb") as f: 116 | train_data = pickle.load(f) 117 | 118 | self.word_idx, self.segment_idx, self.word_mask, self.sequence_length, \ 119 | self.labels_idx = np.array(train_data["word_ids"]), \ 120 | np.array(train_data["query_segment_ids"]), \ 121 | np.array(train_data["query_word_mask"]), \ 122 | np.array(train_data["sequence_length"]), \ 123 | np.array(train_data["labels_idx"]) 124 | else: 125 | # 1,读取原始数据 126 | query, sim = self.read_data(self.config['data_path']) 127 | print("read finished") 128 | 129 | # label_to_index = self.label_to_index(labels) 130 | 131 | word_ids, segment_ids, word_mask, sequence_length, label_ids = self.save_ranking_tokens(query, sim) 132 | print('text to tokens process finished') 
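# NOTE: save_ranking_tokens() pairs every standard question with 2 positive and 5
# negatively-sampled candidates (see negative_sampling above) and caches the encoded
# pairs in train_tokens.pkl, so the next run takes the load branch at the top of load_data().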
133 | 134 | # train_data = dict(inputs_idx=inputs_idx, labels_idx=labels_idx) 135 | # with open(os.path.join(self.config['output_path'], "train_data.pkl"), "wb") as fw: 136 | # pickle.dump(train_data, fw) 137 | # labels_idx = labels 138 | self.word_idx, self.segment_idx, self.word_mask, self.sequence_length, \ 139 | self.labels_idx = word_ids, segment_ids, word_mask, sequence_length, label_ids 140 | 141 | def train_eval_split(self, word_ids, segment_ids, word_mask, sequence_length, 142 | labels, rate): 143 | 144 | split_index = int(len(word_ids) * rate) 145 | train_data = (word_ids[split_index:], segment_ids[split_index:], word_mask[split_index:], 146 | sequence_length[split_index:]) 147 | train_label = labels[split_index:] 148 | eval_data = (word_ids[:split_index], segment_ids[:split_index], word_mask[:split_index], 149 | sequence_length[:split_index]) 150 | eval_label = labels[:split_index] 151 | 152 | return train_data, train_label, eval_data, eval_label 153 | 154 | def gen_data(self, inputs_idx, labels_idx): 155 | ''' 156 | 生成批次数据 157 | :return: 158 | ''' 159 | word_ids, segment_ids, word_mask, sequence_length = inputs_idx[0], inputs_idx[1],inputs_idx[2],inputs_idx[3] 160 | batch_word_ids, batch_segment_ids, batch_word_mask, batch_sequence_length, batch_output_ids= [], [], [], [], [] 161 | 162 | for i in range(len(word_ids)): 163 | batch_word_ids.append(word_ids[i]) 164 | batch_segment_ids.append(segment_ids[i]) 165 | batch_word_mask.append(word_mask[i]) 166 | batch_sequence_length.append(sequence_length[i]) 167 | 168 | batch_output_ids.append(labels_idx[i]) 169 | 170 | 171 | if len(batch_output_ids) == self.batch_size: 172 | yield dict( 173 | input_word_ids=np.array(list(chain(*batch_word_ids)), dtype="int32"), 174 | input_mask=np.array(list(chain(*batch_word_mask)), dtype="int32"), 175 | input_type_ids=np.array(list(chain(*batch_segment_ids)), dtype="int32"), 176 | input_target_ids=np.array(list(chain(*batch_output_ids)), dtype="float32") 177 | ) 178 | batch_word_ids, batch_segment_ids, batch_word_mask, batch_sequence_length, batch_output_ids = [], [], [], [], [] 179 | 180 | -------------------------------------------------------------------------------- /tasks/Itr_pair_task.py: -------------------------------------------------------------------------------- 1 | from official.nlp.bert import tokenization 2 | import tensorflow as tf 3 | from official.nlp.configs import bert 4 | from official.nlp.configs import encoders 5 | from official.nlp.data import pretrain_dataloader 6 | 7 | from official.nlp.tasks.tagging import TaggingTask 8 | from trainer.train_base import TrainBase 9 | from official.nlp.modeling.models import BertClassifier 10 | import os 11 | import json 12 | from data_processor.text_match_data_generator import TextMatchDataGenerator 13 | from official.nlp.modeling.networks import BertEncoder 14 | from official.modeling import tf_utils 15 | from official.nlp.bert import configs as bert_configs 16 | from models.sim_bert import SimBert 17 | 18 | 19 | 20 | class ItrTask(TrainBase): 21 | ''' 22 | 基于bert的分类任务 23 | ''' 24 | def __init__(self, task_config): 25 | self.config = task_config 26 | self.loss = 'loss' 27 | super(ItrTask, self).__init__(task_config) 28 | self.data_generator = TextMatchDataGenerator(task_config) 29 | 30 | 31 | def build_model(self): 32 | ''' 33 | 构建模型 34 | ''' 35 | # encoder_network = encoders.build_encoder(encoders.EncoderConfig( 36 | # bert=encoders.BertEncoderConfig(vocab_size=21128))) 37 | encoder_network = self.build_encoder() 38 | model = 
SimBert(network=encoder_network, config=self.config) 39 | 40 | return model 41 | 42 | def build_encoder(self): 43 | bert_config = bert_configs.BertConfig.from_json_file(self.config['bert_config_path']) 44 | cfg = bert_config 45 | bert_encoder = BertEncoder( 46 | vocab_size=cfg.vocab_size, 47 | hidden_size=cfg.hidden_size, 48 | num_layers=cfg.num_hidden_layers, 49 | num_attention_heads=cfg.num_attention_heads, 50 | intermediate_size=cfg.intermediate_size, 51 | activation=tf_utils.get_activation(cfg.hidden_act), 52 | dropout_rate=cfg.hidden_dropout_prob, 53 | attention_dropout_rate=cfg.attention_probs_dropout_prob, 54 | max_sequence_length=cfg.max_position_embeddings, 55 | type_vocab_size=cfg.type_vocab_size, 56 | initializer=tf.keras.initializers.TruncatedNormal( 57 | stddev=cfg.initializer_range), 58 | embedding_width=cfg.embedding_size, 59 | return_all_encoder_outputs=True) 60 | # ckpt = tf.train.Checkpoint(model=bert_encoder) 61 | # init_checkpoint = self.config['bert_model_path'] 62 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 63 | # bert_encoder.load_weights(init_checkpoint) 64 | return bert_encoder 65 | 66 | def build_losses(self, labels, model_outputs, metrics, aux_losses=None) -> tf.Tensor: 67 | ''' 68 | 构建损失 69 | ''' 70 | with tf.name_scope('TextMatchTask/losses'): 71 | if self.config['model_name'] == 'simbert': 72 | # 构建对比损失 73 | y = tf.reshape(labels, (-1,)) 74 | similarity = model_outputs['logits'] 75 | cond = (similarity < self.config["neg_threshold"]) 76 | zeros = tf.zeros_like(similarity, dtype=tf.float32) 77 | ones = tf.ones_like(similarity, dtype=tf.float32) 78 | squre_similarity = tf.square(similarity) 79 | neg_similarity = tf.where(cond, squre_similarity, zeros) 80 | 81 | pos_loss = y * (tf.square(ones - similarity) / 4) 82 | neg_loss = (ones - y) * neg_similarity 83 | losses = pos_loss + neg_loss 84 | loss = tf.reduce_mean(losses) 85 | return loss 86 | 87 | metrics = dict([(metric.name, metric) for metric in metrics]) 88 | losses = tf.keras.losses.sparse_categorical_crossentropy(labels, 89 | tf.cast(model_outputs['predictions'], tf.float32), 90 | from_logits=True) 91 | 92 | loss = tf.reduce_mean(losses) 93 | 94 | return loss 95 | 96 | def build_inputs(self, inputs): 97 | ''' 98 | 构建输入 99 | ''' 100 | train_input = { 101 | "input_word_ids_a": tf.convert_to_tensor(inputs['input_word_ids_a']), 102 | "input_mask_a": tf.convert_to_tensor(inputs['input_mask_a']), 103 | "input_type_ids_a": tf.convert_to_tensor(inputs['input_type_ids_a']), 104 | "input_word_ids_b": tf.convert_to_tensor(inputs['input_word_ids_b']), 105 | "input_mask_b": tf.convert_to_tensor(inputs['input_mask_b']), 106 | "input_type_ids_b": tf.convert_to_tensor(inputs['input_type_ids_b']), 107 | "labels": inputs['input_target_ids'] 108 | } 109 | return train_input 110 | 111 | def train_step(self, 112 | inputs, 113 | model: tf.keras.Model, 114 | optimizer: tf.keras.optimizers.Optimizer, 115 | metrics=None): 116 | ''' 117 | 进行训练,前向和后向计算 118 | :param inputs: 119 | :param model: 120 | :param optimizer: 121 | :param metrics: 122 | :return: 123 | ''' 124 | 125 | with tf.GradientTape() as tape: 126 | outputs = model(inputs, training=True) 127 | loss = self.build_losses(inputs["labels"], outputs, metrics, aux_losses=None) 128 | 129 | tvars = model.trainable_variables 130 | grads = tape.gradient(loss, tvars) 131 | grads, _ = tf.clip_by_global_norm(grads, clip_norm=5.0) 132 | optimizer.apply_gradients(list(zip(grads, tvars))) 133 | labels = inputs['labels'] 134 | logs = {self.loss: loss} 135 | if 
metrics: 136 | self.process_metrics(metrics, labels, outputs['predictions']) 137 | logs.update({m.name: m.result() for m in model.metrics}) 138 | if model.compiled_metrics: 139 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs['predictions']) 140 | logs.update({m.name: m.result() for m in metrics or []}) 141 | logs.update({m.name: m.result() for m in model.metrics}) 142 | return logs 143 | 144 | def validation_step(self, inputs, model: tf.keras.Model, metrics=None): 145 | ''' 146 | 验证集验证模型 147 | :param input: 148 | :param model: 149 | :return: 150 | ''' 151 | labels = inputs['labels'] 152 | outputs = self.inference_step(inputs, model) 153 | loss = self.build_losses(labels, outputs, metrics, aux_losses=model.losses) 154 | 155 | logs = {self.loss: loss} 156 | if metrics: 157 | self.process_metrics(metrics, labels, outputs['predictions']) 158 | if model.compiled_metrics: 159 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs['predictions']) 160 | logs.update({m.name: m.result() for m in metrics or []}) 161 | logs.update({m.name: m.result() for m in model.metrics}) 162 | return logs 163 | 164 | def build_metrics(self, training=None): 165 | ''' 166 | 构建评价指标 167 | :param training: 168 | :return: 169 | ''' 170 | # del training 171 | metrics = [ 172 | tf.keras.metrics.SparseCategoricalAccuracy(name='text_match_metrics') 173 | ] 174 | 175 | return metrics 176 | 177 | def check_exist_model(self, model): 178 | ''' 179 | 检查是否存在模型文件 180 | :return: 181 | ''' 182 | # ckpt = tf.train.Checkpoint(models=models) 183 | init_checkpoint = os.path.join(self.config['ckpt_model_path'], self.config['model_name']) 184 | 185 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 186 | model.load_weights(init_checkpoint).assert_existing_objects_matched() 187 | 188 | 189 | if __name__=='__main__': 190 | with open("../model_configs/sim_bert.json", 'r') as fr: 191 | config = json.load(fr) 192 | print(config) 193 | Itr_pair = ItrTask(config) 194 | 195 | model = Itr_pair.build_model() 196 | bert_encoder = Itr_pair.build_encoder() 197 | ckpt = tf.train.Checkpoint(model=bert_encoder) 198 | init_checkpoint = config['bert_model_path'] 199 | ckpt.restore(init_checkpoint).assert_existing_objects_matched() 200 | # config = models.get_config() 201 | # Itr_pair.train(model) 202 | print(model.layers) 203 | 204 | 205 | -------------------------------------------------------------------------------- /bert_service/embedding_serving.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Examples of SavedModel export for tf-serving.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | import tensorflow as tf 20 | 21 | from official.nlp.bert import bert_models 22 | from official.nlp.bert import configs 23 | from tasks.embedding_task import EmbeddingTask 24 | from official.nlp.modeling.networks import BertEncoder 25 | from official.modeling import tf_utils 26 | from official.nlp.bert import configs as bert_configs 27 | from data_processor.tokenizer import tokenizer 28 | from models.sentence_embedding import SentenceEmbedding 29 | import json 30 | 31 | 32 | root_path = '/Users/donruo/Desktop/project/bert_tasks/' 33 | 34 | flags.DEFINE_string("bert_config_file", root_path+'chinese_wwm_ext_L-12_H-768_A-12/v2/bert_config.json', 35 | "Bert configuration file to define core bert layers.") 36 | flags.DEFINE_string("model_checkpoint_path", root_path+'chinese_wwm_ext_L-12_H-768_A-12/v2/bert_model.ckpt-1', 37 | "File path to TF model checkpoint.") 38 | flags.DEFINE_string("export_path", root_path+'chinese_wwm_ext_L-12_H-768_A-12/serve/versions/1', 39 | "Destination folder to export the serving SavedModel.") 40 | flags.DEFINE_string("config_path", root_path+'model_configs/sentence_embedding.json', "embedding model configurations") 41 | 42 | FLAGS = flags.FLAGS 43 | 44 | 45 | class BertServing(tf.keras.Model): 46 | """Bert transformer encoder model for serving.""" 47 | 48 | def __init__(self, config, bert_config, name_to_features, name="serving_model"): 49 | super(BertServing, self).__init__(name=name) 50 | 51 | cfg = bert_config 52 | self.bert_encoder = BertEncoder( 53 | vocab_size=cfg.vocab_size, 54 | hidden_size=cfg.hidden_size, 55 | num_layers=cfg.num_hidden_layers, 56 | num_attention_heads=cfg.num_attention_heads, 57 | intermediate_size=cfg.intermediate_size, 58 | activation=tf_utils.get_activation(cfg.hidden_act), 59 | dropout_rate=cfg.hidden_dropout_prob, 60 | attention_dropout_rate=cfg.attention_probs_dropout_prob, 61 | max_sequence_length=cfg.max_position_embeddings, 62 | type_vocab_size=cfg.type_vocab_size, 63 | initializer=tf.keras.initializers.TruncatedNormal( 64 | stddev=cfg.initializer_range), 65 | embedding_width=cfg.embedding_size, 66 | return_all_encoder_outputs=True) 67 | self.model = SentenceEmbedding(self.bert_encoder, config) 68 | # ckpt = tf.train.Checkpoint(model=self.bert_encoder) 69 | # init_checkpoint = self.config['bert_model_path'] 70 | # 71 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 72 | self.name_to_features = name_to_features 73 | 74 | def call(self, inputs): 75 | input_word_ids = inputs["input_word_ids"] 76 | input_mask = inputs["input_mask"] 77 | input_type_ids = inputs["input_type_ids"] 78 | infer_input = { 79 | "input_word_ids": input_word_ids, 80 | "input_mask": input_mask, 81 | "input_type_ids": input_type_ids, 82 | } 83 | encoder_outputs = self.model( 84 | infer_input) 85 | return encoder_outputs 86 | 87 | def serve_body(self, input_ids, input_mask=None, segment_ids=None): 88 | if segment_ids is None: 89 | # Requires CLS token is the first token of inputs. 90 | segment_ids = tf.zeros_like(input_ids) 91 | if input_mask is None: 92 | # The mask has 1 for real tokens and 0 for padding tokens. 
93 | input_mask = tf.where( 94 | tf.equal(input_ids, 0), tf.zeros_like(input_ids), 95 | tf.ones_like(input_ids)) 96 | 97 | inputs = dict( 98 | input_word_ids=input_ids, input_mask=input_mask, input_type_ids=segment_ids) 99 | return self.call(inputs) 100 | 101 | @tf.function 102 | def serve(self, input_ids, input_mask=None, segment_ids=None): 103 | outputs = self.serve_body(input_ids, input_mask, segment_ids) 104 | # Returns a dictionary to control SignatureDef output signature. 105 | return {"outputs": outputs} 106 | 107 | @tf.function 108 | def serve_examples(self, inputs): 109 | features = tf.io.parse_example(inputs, self.name_to_features) 110 | for key in list(features.keys()): 111 | t = features[key] 112 | if t.dtype == tf.int64: 113 | t = tf.cast(t, tf.int32) 114 | features[key] = t 115 | return self.serve( 116 | features["input_word_ids"], 117 | input_mask=features["input_mask"] if "input_mask" in features else None, 118 | segment_ids=features["input_type_ids"] 119 | if "input_type_ids" in features else None) 120 | 121 | @classmethod 122 | def export(cls, model, export_dir): 123 | if not isinstance(model, cls): 124 | raise ValueError("Invalid model instance: %s, it should be a %s" % 125 | (model, cls)) 126 | 127 | signatures = { 128 | "serving_default": 129 | model.serve.get_concrete_function( 130 | input_ids=tf.TensorSpec( 131 | shape=[None, None], dtype=tf.float32, name="inputs")), 132 | } 133 | if model.name_to_features: 134 | signatures[ 135 | "serving_examples"] = model.serve_examples.get_concrete_function( 136 | tf.TensorSpec(shape=[None], dtype=tf.string, name="examples")) 137 | tf.saved_model.save(model.model, export_dir=export_dir, signatures=signatures) 138 | 139 | 140 | def main(_): 141 | config_path = FLAGS.config_path 142 | with open(config_path, 'r') as fr: 143 | config = json.load(fr) 144 | sequence_length = config['seq_len'] 145 | if sequence_length is not None and sequence_length > 0: 146 | name_to_features = { 147 | "input_word_ids": tf.io.FixedLenFeature([sequence_length], tf.int64), 148 | "input_mask": tf.io.FixedLenFeature([sequence_length], tf.int64), 149 | "input_type_ids": tf.io.FixedLenFeature([sequence_length], tf.int64), 150 | } 151 | else: 152 | name_to_features = None 153 | bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file) 154 | serving_model = BertServing( 155 | config=config, bert_config=bert_config, name_to_features=name_to_features) 156 | checkpoint = tf.train.Checkpoint(model=serving_model.bert_encoder) 157 | checkpoint.restore(FLAGS.model_checkpoint_path 158 | ).assert_existing_objects_matched() 159 | '''.run_restore_ops()''' 160 | BertServing.export(serving_model, FLAGS.export_path) 161 | 162 | def get_serving_predict(self, sentence): 163 | ''' 164 | 使用tf-serving加载模型 165 | :param sentence: 166 | :return: 167 | ''' 168 | # docker 169 | # run - t - -rm - p 8500: 8500 \ 170 | # - v "/Users/donruo/Desktop/project/search_algorithm/ranking/tf_ranking/examples/output/export/latest_exporter/1614153823/" \ 171 | # - e MODEL_NAME = saved_model \ 172 | # tensorflow / serving: 1.15.0 & 173 | sentence = list(sentence) 174 | sentence_ids = self.sentence_to_idx(sentence) 175 | # print(sentence_ids) 176 | embedded_words = [] 177 | [embedded_words.append(self.word_vectors[i].tolist()) for i in sentence_ids] 178 | # print(len(embedded_words)) 179 | # tf.contrib.util.make_tensor_proto(padding_sentence, 180 | # dtype=tf.int64, 181 | # shape=[1, 50]).SerializeToString()) 182 | 183 | data = json.dumps({"signature_name": "classifier", 
"instances": [{"inputs": sentence_ids, "keep_prob": 1.0}]}) 184 | headers = {"content-type": "application/json"} 185 | json_response = requests.post('http://localhost:8501/v1/models/savedModel:predict', 186 | data=data, headers=headers) 187 | prediction = json.loads(json_response.text) 188 | print(prediction) 189 | 190 | return prediction 191 | 192 | if __name__ == "__main__": 193 | flags.mark_flag_as_required("bert_config_file") 194 | flags.mark_flag_as_required("model_checkpoint_path") 195 | flags.mark_flag_as_required("export_path") 196 | flags.mark_flag_as_required('config_path') 197 | app.run(main) 198 | -------------------------------------------------------------------------------- /tasks/ner_task.py: -------------------------------------------------------------------------------- 1 | from official.nlp.bert import tokenization 2 | import tensorflow as tf 3 | from official.nlp.configs import bert 4 | from official.nlp.configs import encoders 5 | from official.nlp.data import pretrain_dataloader 6 | 7 | from official.nlp.tasks.tagging import TaggingTask 8 | from trainer.train_base import TrainBase 9 | from official.nlp.modeling.models import BertTokenClassifier 10 | import os 11 | import json 12 | from data_processor.ner_data_generator import NERDataGenerator 13 | from official.nlp.modeling.networks import BertEncoder 14 | from official.modeling import tf_utils 15 | from official.nlp.bert import configs as bert_configs 16 | 17 | def _masked_labels_and_weights(y_true): 18 | """Masks negative values from token level labels. 19 | 20 | Args: 21 | y_true: Token labels, typically shape (batch_size, seq_len), where tokens 22 | with negative labels should be ignored during loss/accuracy calculation. 23 | 24 | Returns: 25 | (masked_y_true, masked_weights) where `masked_y_true` is the input 26 | with each negative label replaced with zero and `masked_weights` is 0.0 27 | where negative labels were replaced and 1.0 for original labels. 28 | """ 29 | # Ignore the classes of tokens with negative values. 30 | mask = tf.greater_equal(y_true, 0) 31 | # Replace negative labels, which are out of bounds for some loss functions, 32 | # with zero. 
33 | masked_y_true = tf.where(mask, y_true, 0) 34 | return masked_y_true, tf.cast(mask, tf.float32) 35 | 36 | class NERTask(TrainBase): 37 | ''' 38 | 基于bert的分类任务 39 | ''' 40 | def __init__(self, task_config): 41 | self.config = task_config 42 | self.loss = 'loss' 43 | super(NERTask, self).__init__(task_config) 44 | self.data_generator = NERDataGenerator(task_config) 45 | 46 | 47 | def build_model(self): 48 | ''' 49 | 构建模型 50 | ''' 51 | # encoder_network = encoders.build_encoder(encoders.EncoderConfig( 52 | # bert=encoders.BertEncoderConfig(vocab_size=21128))) 53 | encoder_network = self.build_encoder() 54 | 55 | 56 | 57 | model = BertTokenClassifier(network=encoder_network, 58 | num_classes=self.config['tag_categories'], 59 | dropout_rate=self.config['dropout_rate'], 60 | output='logits') 61 | # ckpt = tf.train.Checkpoint(models=models) 62 | 63 | # init_checkpoint = self.config['bert_model_path'] 64 | 65 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 66 | 67 | # models.load_weights(init_checkpoint).assert_existing_objects_matched() 68 | return model 69 | 70 | def build_encoder(self): 71 | bert_config = bert_configs.BertConfig.from_json_file(self.config['bert_config_path']) 72 | cfg = bert_config 73 | bert_encoder = BertEncoder( 74 | vocab_size=cfg.vocab_size, 75 | hidden_size=cfg.hidden_size, 76 | num_layers=cfg.num_hidden_layers, 77 | num_attention_heads=cfg.num_attention_heads, 78 | intermediate_size=cfg.intermediate_size, 79 | activation=tf_utils.get_activation(cfg.hidden_act), 80 | dropout_rate=cfg.hidden_dropout_prob, 81 | attention_dropout_rate=cfg.attention_probs_dropout_prob, 82 | max_sequence_length=cfg.max_position_embeddings, 83 | type_vocab_size=cfg.type_vocab_size, 84 | initializer=tf.keras.initializers.TruncatedNormal( 85 | stddev=cfg.initializer_range), 86 | embedding_width=cfg.embedding_size, 87 | return_all_encoder_outputs=False) 88 | # ckpt = tf.train.Checkpoint(model=bert_encoder) 89 | # init_checkpoint = self.config['bert_model_path'] 90 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 91 | # bert_encoder.load_weights(init_checkpoint) 92 | return bert_encoder 93 | 94 | def build_losses(self, labels, model_outputs, metrics, aux_losses=None) -> tf.Tensor: 95 | ''' 96 | 构建损失 97 | ''' 98 | masked_labels, masked_weights = _masked_labels_and_weights(labels) 99 | metrics = dict([(metric.name, metric) for metric in metrics]) 100 | losses = tf.keras.losses.sparse_categorical_crossentropy(masked_labels, 101 | tf.cast(model_outputs, tf.float32), 102 | from_logits=True) 103 | # metrics['losses'].update_state(losses) 104 | loss = losses 105 | numerator_loss = tf.reduce_sum(loss * masked_weights) 106 | denominator_loss = tf.reduce_sum(masked_weights) 107 | loss = tf.math.divide_no_nan(numerator_loss, denominator_loss) 108 | 109 | return loss 110 | 111 | def train_step(self, 112 | inputs, 113 | model:tf.keras.Model, 114 | optimizer: tf.keras.optimizers.Optimizer, 115 | metrics=None): 116 | ''' 117 | 进行训练,前向和后向计算 118 | :param inputs: 119 | :param model: 120 | :param optimizer: 121 | :param metrics: 122 | :return: 123 | ''' 124 | with tf.GradientTape() as tape: 125 | outputs = model(inputs, training=True) 126 | outputs = outputs[:, 1:self.config['seq_len'] + 1, :] 127 | loss = self.build_losses(labels=inputs['labels'], model_outputs=outputs, metrics=metrics, aux_losses=None) 128 | tvars = model.trainable_variables 129 | grads = tape.gradient(loss, tvars) 130 | grads, _ = tf.clip_by_global_norm(grads, clip_norm=5.0) 131 | 
optimizer.apply_gradients(list(zip(grads, tvars))) 132 | labels = inputs['labels'] 133 | logs = {self.loss: loss} 134 | if metrics: 135 | self.process_metrics(metrics, labels, outputs) 136 | logs.update({m.name: m.result() for m in model.metrics}) 137 | if model.compiled_metrics: 138 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs) 139 | logs.update({m.name: m.result() for m in metrics or []}) 140 | logs.update({m.name: m.result() for m in model.metrics}) 141 | return logs 142 | 143 | def validation_step(self, inputs, model:tf.keras.Model, metrics=None): 144 | ''' 145 | 验证集验证模型 146 | :param input: 147 | :param model: 148 | :return: 149 | ''' 150 | labels = inputs['labels'] 151 | outputs = self.inference_step(inputs, model) 152 | outputs = outputs[:, 1:self.config['seq_len'] + 1, :] 153 | loss = self.build_losses(labels, outputs, metrics, aux_losses=model.losses) 154 | 155 | logs = {self.loss: loss} 156 | if metrics: 157 | self.process_metrics(metrics, labels, outputs) 158 | if model.compiled_metrics: 159 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs) 160 | logs.update({m.name: m.result() for m in metrics or []}) 161 | logs.update({m.name: m.result() for m in model.metrics}) 162 | return logs 163 | 164 | def build_inputs(self, inputs): 165 | ''' 166 | 构建输入 167 | ''' 168 | train_input = { 169 | "input_word_ids": tf.convert_to_tensor(inputs['input_word_ids']), 170 | "input_mask": tf.convert_to_tensor(inputs['input_mask']), 171 | "input_type_ids": tf.convert_to_tensor(inputs['input_type_ids']), 172 | "labels": inputs['input_target_ids'] 173 | } 174 | return train_input 175 | 176 | def build_metrics(self, training=None): 177 | ''' 178 | 构建评价指标 179 | :param training: 180 | :return: 181 | ''' 182 | # del training 183 | metrics = [ 184 | tf.keras.metrics.SparseCategoricalAccuracy(name='ner_metrics') 185 | ] 186 | 187 | # metrics = dict([(metric.name, metric) for metric in metrics]) 188 | 189 | return metrics 190 | 191 | def check_exist_model(self, model): 192 | ''' 193 | 检查是否存在模型文件 194 | :return: 195 | ''' 196 | # ckpt = tf.train.Checkpoint(models=models) 197 | init_checkpoint = os.path.join(self.config['ckpt_model_path'], self.config['model_name']) 198 | 199 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 200 | model.load_weights(init_checkpoint).assert_existing_objects_matched() 201 | 202 | 203 | if __name__=='__main__': 204 | with open("../model_configs/bert_ner.json", 'r') as fr: 205 | config = json.load(fr) 206 | print(config) 207 | ner = NERTask(config) 208 | 209 | model = ner.build_model() 210 | bert_encoder = ner.build_encoder() 211 | ckpt = tf.train.Checkpoint(model=bert_encoder) 212 | init_checkpoint = config['bert_model_path'] 213 | ckpt.restore(init_checkpoint).assert_existing_objects_matched() 214 | # config = models.get_config() 215 | ner.train(model) 216 | 217 | 218 | -------------------------------------------------------------------------------- /data_processor/tokenizer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 文本转化成tokens 3 | ''' 4 | from data_processor.base_processor import data_base 5 | from itertools import chain 6 | import numpy as np 7 | import pickle 8 | import os 9 | from official.nlp.bert import tokenization 10 | 11 | 12 | class tokenizer(data_base): 13 | ''' 14 | 文本转tokens 15 | ''' 16 | def __init__(self, token_configs): 17 | self.token_configs = token_configs 18 | super(tokenizer, self).__init__(token_configs) 19 | 20 | def tokens_to_ids(self, 
tokens, tokens_to_index): 21 | ''' 22 | token转索引 23 | :param tokens: 24 | :return: 25 | ''' 26 | ids = [tokens_to_index.get(token, 1) for token in tokens] 27 | return ids 28 | 29 | def labels_to_ids(self, labels, labels_to_index): 30 | ''' 31 | token转索引 32 | :param tokens: 33 | :return: 34 | ''' 35 | ids = [labels_to_index.get(token) for token in labels] 36 | return ids 37 | 38 | def seq_labels_to_ids(self, labels, labels_to_index): 39 | ''' 40 | token转索引 41 | :param tokens: 42 | :return: 43 | ''' 44 | if len(labels) < self.config['seq_len']: 45 | labels += ['O'] * (self.config['seq_len'] - len(labels)) 46 | else: 47 | labels = labels[:self.config['seq_len']] 48 | nan_id = labels_to_index.get('O') 49 | ids = [labels_to_index.get(token, nan_id) for token in labels] 50 | return ids 51 | 52 | def seq2seq_label_process(self, labels): 53 | ''' 54 | seq2seq任务处理label数据,在头尾添加, 55 | :param labels: 56 | :return: 57 | ''' 58 | res = [] 59 | for line in labels: 60 | line.insert(0, "") 61 | line.insert(-1, "") 62 | res.append(line) 63 | return res 64 | 65 | def ids_to_tokens(self, ids, tokens_to_index): 66 | ''' 67 | 索引转成token 68 | :param ids: 69 | :return: 70 | ''' 71 | tokens = [list(tokens_to_index.keys())[id] for id in ids] 72 | return tokens 73 | 74 | def multi_label_to_index(self, labels, label_to_index): 75 | ''' 76 | 多标签数据转索引 77 | :param labels: 78 | :return: 79 | ''' 80 | label_idxs = np.zeros((len(labels), len(label_to_index))) 81 | 82 | for i, label in enumerate(labels): 83 | for l in label: 84 | id = label_to_index.get(l) 85 | label_idxs[i, id] = 1 86 | return label_idxs 87 | 88 | def word_to_index(self, all_words): 89 | ''' 90 | 生成词汇-索引字典 91 | :param texts: 92 | :return: 93 | ''' 94 | 95 | #是否过滤低频词 96 | if self.config['freq_filter']: 97 | vocab = self.word_freq_filter(self.config['freq_filter'], all_words) 98 | else: 99 | vocab = self.get_vocab(all_words) 100 | #设置词典大小 101 | vocab = ["", ""] + vocab 102 | self.vocab_size = self.config['vocab_size'] 103 | if len(vocab) < self.vocab_size: 104 | self.vocab_size = len(vocab) 105 | self.vocab = vocab[:self.vocab_size] 106 | #构建词典索引 107 | word_to_index = dict(zip(vocab, list(range(len(vocab))))) 108 | 109 | return word_to_index 110 | 111 | 112 | def label_to_index(self, labels): 113 | ''' 114 | 标签索引字典 115 | :param labels: 116 | :return: 117 | ''' 118 | if not self.config['multi_label']: 119 | unique_labels = list(set(labels)) # 单标签转换 120 | else: 121 | unique_labels = list(set(chain(*labels)))#多标签转换 122 | label_to_index = dict(zip(unique_labels, list(range(len(unique_labels))))) 123 | return label_to_index 124 | 125 | def padding(self, tokens): 126 | ''' 127 | 将输入序列做定长处理 128 | :param tokens: 129 | :return: 130 | ''' 131 | if len(tokens) < self.config['seq_len']: 132 | tokens += [0] * (self.config['seq_len'] - len(tokens)) 133 | else: 134 | tokens = tokens[:self.config['seq_len']] 135 | return tokens 136 | 137 | def encode(self, text): 138 | ''' 139 | 句子转成token 140 | :param file_path: 141 | :return: 142 | ''' 143 | _tokenizer = tokenization.FullTokenizer(self.config['vocab_path'], do_lower_case=True) 144 | if isinstance(text, str): 145 | 146 | split_tokens = _tokenizer.tokenize(text) 147 | else: 148 | split_tokens = text 149 | if len(split_tokens) > self.config['seq_len']: 150 | split_tokens = split_tokens[:self.config['seq_len']] 151 | sequence_length = self.config['seq_len'] 152 | else: 153 | sequence_length = len(split_tokens) 154 | while (len(split_tokens) < self.config['seq_len']): 155 | split_tokens.append("[PAD]") 156 | # word_mask = 
[[1]*(maxlen+2) for i in range(data_len)] 157 | 158 | tokens = [] 159 | tokens.append("[CLS]") 160 | for i in split_tokens: 161 | if i not in _tokenizer.vocab: 162 | tokens.append("[UNK]") 163 | print(i) 164 | continue 165 | tokens.append(i) 166 | tokens.append("[SEP]") 167 | word_ids = _tokenizer.convert_tokens_to_ids(tokens) 168 | word_mask = [] 169 | for token in tokens: # 1 = real token/[CLS]/[SEP], 0 = [PAD] 170 | if token == "[PAD]": 171 | word_mask.append(0) 172 | else: 173 | word_mask.append(1) 174 | segment_ids = [0] * len(word_ids) 175 | return word_ids, segment_ids, word_mask, sequence_length 176 | 177 | def encode_v2(self, text_1, text_2): 178 | ''' 179 | 交互式文本匹配编码 180 | ''' 181 | _tokenizer = tokenization.FullTokenizer(self.config['vocab_path'], do_lower_case=True) 182 | if isinstance(text_1, str): 183 | split_tokens_1 = _tokenizer.tokenize(text_1) 184 | else: 185 | split_tokens_1 = text_1 186 | if isinstance(text_2, str): 187 | split_tokens_2 = _tokenizer.tokenize(text_2) 188 | else: 189 | split_tokens_2 = text_2 190 | 191 | if len(split_tokens_1) + len(split_tokens_2) > self.config['seq_len']: 192 | split_tokens_2 = split_tokens_2[:self.config['seq_len'] - len(split_tokens_1)] 193 | sequence_length = self.config['seq_len'] 194 | else: 195 | sequence_length = len(split_tokens_1) + len(split_tokens_2) 196 | while (len(split_tokens_1) + len(split_tokens_2) < self.config['seq_len']): 197 | split_tokens_2.append("[PAD]") 198 | 199 | tokens = [] 200 | segment_ids = [] 201 | tokens.append("[CLS]") 202 | segment_ids.append(0) 203 | for i in split_tokens_1: 204 | if i not in _tokenizer.vocab: 205 | tokens.append("[UNK]") 206 | print(i) 207 | continue 208 | tokens.append(i) 209 | segment_ids.append(0) 210 | tokens.append("[SEP]") 211 | segment_ids.append(0) 212 | for i in split_tokens_2: 213 | if i not in _tokenizer.vocab: 214 | tokens.append("[UNK]") 215 | print(i) 216 | continue 217 | tokens.append(i) 218 | segment_ids.append(1) 219 | tokens.append("[SEP]") 220 | segment_ids.append(1) 221 | word_ids = _tokenizer.convert_tokens_to_ids(tokens) 222 | word_mask = [] 223 | for token in tokens: # 1 = real token/[CLS]/[SEP], 0 = [PAD] 224 | if token == "[PAD]": 225 | word_mask.append(0) 226 | else: 227 | word_mask.append(1) 228 | return word_ids, segment_ids, word_mask, sequence_length 229 | 230 | def save_input_tokens(self, texts, labels, label_to_index): 231 | ''' 232 | 保存处理完成的输入tokens,方便后续加载 233 | :param texts: 234 | :return: 235 | ''' 236 | 237 | word_ids, segment_ids, word_mask, sequence_length = [], [], [], [] 238 | label_ids = [] 239 | for i,text in enumerate(texts): 240 | 241 | _word_ids, _segment_ids, _word_mask, _sequence_length = self.encode(text) 242 | word_ids.append(_word_ids) 243 | segment_ids.append(_segment_ids) 244 | word_mask.append(_word_mask) 245 | sequence_length.append(_sequence_length) 246 | label_id = self.labels_to_ids([labels[i]], label_to_index) 247 | label_ids.append(label_id) 248 | 249 | 250 | input_tokens = dict(word_ids=word_ids, segment_ids=segment_ids, word_mask=word_mask, sequence_length=sequence_length, labels_idx=label_ids) 251 | if not os.path.exists(self.config['output_path']): 252 | os.mkdir(self.config['output_path']) 253 | #保存准备训练的tokens数据 254 | with open(os.path.join(self.config['output_path'], 'train_tokens.pkl'), "wb") as fw: 255 | pickle.dump(input_tokens, fw) 256 | # 保存预处理的word_to_index数据 257 | # with open(os.path.join(self.config['output_path'], 'word_to_index.pkl'), "wb") as fw: 258 | # pickle.dump(word_to_index, fw) 259 | # 保存预处理的label_to_index数据 260 | with open(os.path.join(self.config['output_path'], 'label_to_index.pkl'), "wb")
as fw: 261 | pickle.dump(label_to_index, fw) 262 | return word_ids, segment_ids, word_mask, sequence_length, label_ids -------------------------------------------------------------------------------- /models/model_base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import functools 4 | from typing import Any, Callable, Optional 5 | import abc 6 | 7 | 8 | class BaseModel(tf.Module, metaclass=abc.ABCMeta): 9 | ''' 10 | 模型的基类 11 | ''' 12 | def __init__(self, config): 13 | self.config = config 14 | super(BaseModel, self).__init__() 15 | 16 | def build_model(self): 17 | ''' 18 | 创建模型 19 | :return: 20 | ''' 21 | raise NotImplementedError 22 | 23 | def build_inputs(self, inputs): 24 | ''' 25 | 创建输入 26 | :return: 27 | ''' 28 | raise NotImplementedError 29 | 30 | def build_losses(self, labels, model_outputs, metrics, aux_losses) -> tf.Tensor: 31 | ''' 32 | 计算loss值 33 | :param labels: 34 | :param model_outputs: 35 | :param metrics: 36 | :return: 37 | ''' 38 | raise NotImplementedError 39 | 40 | def build_metrics(self, training: bool = True): 41 | """ 42 | 获取模型训练/验证的评价指标 43 | :param training: 44 | :return: 45 | """ 46 | del training 47 | return [] 48 | 49 | def compile_model(self, 50 | model: tf.keras.Model, 51 | optimizer: tf.keras.optimizers.Optimizer, 52 | loss=None, 53 | train_step: Optional[Callable[..., Any]] = None, 54 | validation_step: Optional[Callable[..., Any]] = None, 55 | **kwargs) -> tf.keras.Model: 56 | """Compiles the model with objects created by the task. 57 | 58 | The method should not be used in any customized training implementation. 59 | 60 | Args: 61 | model: a keras.Model. 62 | optimizer: the keras optimizer. 63 | loss: a callable/list of losses. 64 | train_step: optional train step function defined by the task. 65 | validation_step: optional validation_step step function defined by the 66 | task. 67 | **kwargs: other kwargs consumed by keras.Model compile(). 68 | 69 | Returns: 70 | a compiled keras.Model.
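Note: exactly one of `loss` and `train_step` should be provided; the check
below raises a ValueError when both or neither are given.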
71 | """ 72 | if bool(loss is None) == bool(train_step is None): 73 | raise ValueError("`loss` and `train_step` should be exclusive to " 74 | "each other.") 75 | model.compile(optimizer=optimizer, loss=loss, **kwargs) 76 | 77 | if train_step: 78 | model.train_step = functools.partial( 79 | train_step, model=model, optimizer=model.optimizer) 80 | if validation_step: 81 | model.test_step = functools.partial(validation_step, model=model) 82 | return model 83 | 84 | def process_metrics(self, metrics, labels, model_outputs): 85 | ''' 86 | 处理并更新评价指标 87 | :param metrics: 88 | :param labels: 89 | :param model_outputs: 90 | :return: 91 | ''' 92 | for metric in metrics: 93 | metric.update_state(labels, model_outputs) 94 | 95 | def process_compiled_metrics(self, compiled_metrics, labels, model_outputs): 96 | ''' 97 | 处理并更新compiled metrics 98 | :param compiled_metrics: 99 | :param labels: 100 | :param model_outputs: 101 | :return: 102 | ''' 103 | compiled_metrics.update_state(labels, model_outputs) 104 | 105 | def get_optimizer(self): 106 | ''' 107 | 选择优化算法 108 | :return: 109 | ''' 110 | option = self.config['optimizer'] 111 | optimizer = None 112 | learning_rate = self.config['learning_rate'] 113 | if option == 'adam': 114 | optimizer = tf.keras.optimizers.Adam(learning_rate) 115 | if option == 'rmsprop': 116 | optimizer = tf.keras.optimizers.RMSprop(learning_rate) 117 | if option == 'sgd': 118 | optimizer = tf.keras.optimizers.SGD(learning_rate) 119 | return optimizer 120 | 121 | def train_step(self, 122 | inputs, 123 | model:tf.keras.Model, 124 | optimizer: tf.keras.optimizers.Optimizer, 125 | metrics=None): 126 | ''' 127 | 进行训练,前向和后向计算 128 | :param inputs: 129 | :param model: 130 | :param optimizer: 131 | :param metrics: 132 | :return: 133 | ''' 134 | with tf.GradientTape() as tape: 135 | outputs = model(inputs, training=True) 136 | loss = self.build_losses(labels=inputs['labels'], model_outputs=outputs, metrics=metrics, aux_losses=None) 137 | tvars = model.trainable_variables 138 | grads = tape.gradient(loss, tvars) 139 | grads, _ = tf.clip_by_global_norm(grads, clip_norm=5.0) 140 | optimizer.apply_gradients(list(zip(grads, tvars))) 141 | labels = inputs['labels'] 142 | logs = {self.loss: loss} 143 | if metrics: 144 | self.process_metrics(metrics, labels, outputs) 145 | logs.update({m.name: m.result() for m in model.metrics}) 146 | if model.compiled_metrics: 147 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs) 148 | logs.update({m.name: m.result() for m in metrics or []}) 149 | logs.update({m.name: m.result() for m in model.metrics}) 150 | return logs 151 | 152 | 153 | def validation_step(self, inputs, model:tf.keras.Model, metrics=None): 154 | ''' 155 | 验证集验证模型 156 | :param input: 157 | :param model: 158 | :return: 159 | ''' 160 | labels = inputs['labels'] 161 | outputs = self.inference_step(inputs, model) 162 | loss = self.build_losses(labels, outputs, metrics, aux_losses=model.losses) 163 | 164 | logs = {self.loss: loss} 165 | if metrics: 166 | self.process_metrics(metrics, labels, outputs) 167 | if model.compiled_metrics: 168 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs) 169 | logs.update({m.name: m.result() for m in metrics or []}) 170 | logs.update({m.name: m.result() for m in model.metrics}) 171 | return logs 172 | 173 | def inference_step(self, inputs, model:tf.keras.Model): 174 | ''' 175 | 模型推理 176 | :param inputs: 177 | :param model: 178 | :return: 179 | ''' 180 | return model(inputs, training=False) 181 | 182 | def 
get_predictions(self, logits): 183 | ''' 184 | 模型预测结果 185 | :param input: 186 | :param models: 187 | :return: 188 | ''' 189 | 190 | predictions = tf.keras.layers.Activation( 191 | tf.nn.log_softmax, dtype=tf.float32)(logits).numpy() 192 | predictions = tf.argmax(predictions, axis=-1, name='predictions') 193 | 194 | return predictions 195 | 196 | def save_ckpt_model(self, model:tf.keras.Model): 197 | ''' 198 | 将模型保存成ckpt格式 199 | :param model: 200 | :return: 201 | ''' 202 | save_path = os.path.join(os.path.abspath(os.path.dirname(os.getcwd())), 203 | self.config["ckpt_model_path"]) 204 | if not os.path.exists(save_path): 205 | os.makedirs(save_path) 206 | model_save_path = os.path.join(save_path, self.config["model_name"]) 207 | 208 | # checkpoint = tf.train.Checkpoint(models) 209 | # checkpoint.save(model_save_path + '/models.ckpt') 210 | model.save_weights(model_save_path) 211 | 212 | def save_pb_model(self, model:tf.keras.Model, checkpoint_dir=None, restore_model_using_load_weights=True): 213 | ''' 214 | 将模型保存成pb格式 215 | :param model: 216 | :return: 217 | ''' 218 | save_path = os.path.join(os.path.abspath(os.path.dirname(os.getcwd())), 219 | self.config["export_model_path"]) 220 | if not os.path.exists(save_path): 221 | os.makedirs(save_path) 222 | model_export_path = os.path.join(save_path, self.config["model_name"]) 223 | 224 | if checkpoint_dir: 225 | # Keras compile/fit() was used to save checkpoint using 226 | # models.save_weights(). 227 | if restore_model_using_load_weights: 228 | model_weight_path = os.path.join(checkpoint_dir, 'checkpoint') 229 | assert tf.io.gfile.exists(model_weight_path) 230 | model.load_weights(model_weight_path) 231 | 232 | # tf.train.Checkpoint API was used via custom training loop logic. 233 | else: 234 | checkpoint = tf.train.Checkpoint(model=model) 235 | 236 | # Restores the models from latest checkpoint. 
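# tf.train.latest_checkpoint() returns the path prefix of the newest checkpoint in
# checkpoint_dir, or None if the directory contains no checkpoint, hence the assert below.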
237 | latest_checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) 238 | assert latest_checkpoint_file 239 | 240 | checkpoint.restore( 241 | latest_checkpoint_file).assert_existing_objects_matched() 242 | 243 | model.save(model_export_path, include_optimizer=False, save_format='tf') 244 | 245 | def load_ckpt_model(self, model, path, model_name): 246 | ''' 247 | 加载ckpt模型 248 | :param model_path: 249 | :return: 250 | ''' 251 | # models = self.create_model() 252 | path = os.path.join(path, model_name) 253 | model.load_weights(path) 254 | return model 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | -------------------------------------------------------------------------------- /tasks/distillation_task.py: -------------------------------------------------------------------------------- 1 | from official.nlp.bert import tokenization 2 | import tensorflow as tf 3 | from official.nlp.configs import bert 4 | from official.nlp.configs import encoders 5 | from official.nlp.data import pretrain_dataloader 6 | 7 | from official.nlp.tasks.tagging import TaggingTask 8 | from trainer.train_base import TrainBase 9 | from official.nlp.modeling.models import BertClassifier 10 | import os 11 | import json 12 | from data_processor.text_match_data_generator import TextMatchDataGenerator 13 | from official.nlp.modeling.networks import BertEncoder 14 | from official.modeling import tf_utils 15 | from official.nlp.bert import configs as bert_configs 16 | from models.knowledge_distiilation import Distill_model 17 | from models.sim_bert import SimBert 18 | 19 | 20 | 21 | class DistillTask(TrainBase): 22 | ''' 23 | 基于bert的知识蒸馏任务 24 | ''' 25 | def __init__(self, task_config): 26 | self.config = task_config 27 | self.loss = 'loss' 28 | super(DistillTask, self).__init__(task_config) 29 | self.data_generator = TextMatchDataGenerator(task_config) 30 | 31 | 32 | def build_model(self): 33 | ''' 34 | 构建模型 35 | ''' 36 | # encoder_network = encoders.build_encoder(encoders.EncoderConfig( 37 | # bert=encoders.BertEncoderConfig(vocab_size=21128))) 38 | bert_config = bert_configs.BertConfig.from_json_file(self.config['bert_config_path']) 39 | encoder_network = self.build_encoder() 40 | teacher_network = SimBert(network=encoder_network, config=self.config) 41 | 42 | model = Distill_model(teacher_network=teacher_network, config=self.config, vocab_size=bert_config.vocab_size, word_vectors=None) 43 | 44 | return model 45 | 46 | def build_encoder(self): 47 | bert_config = bert_configs.BertConfig.from_json_file(self.config['bert_config_path']) 48 | cfg = bert_config 49 | bert_encoder = BertEncoder( 50 | vocab_size=cfg.vocab_size, 51 | hidden_size=cfg.hidden_size, 52 | num_layers=cfg.num_hidden_layers, 53 | num_attention_heads=cfg.num_attention_heads, 54 | intermediate_size=cfg.intermediate_size, 55 | activation=tf_utils.get_activation(cfg.hidden_act), 56 | dropout_rate=cfg.hidden_dropout_prob, 57 | attention_dropout_rate=cfg.attention_probs_dropout_prob, 58 | max_sequence_length=cfg.max_position_embeddings, 59 | type_vocab_size=cfg.type_vocab_size, 60 | initializer=tf.keras.initializers.TruncatedNormal( 61 | stddev=cfg.initializer_range), 62 | embedding_width=cfg.embedding_size, 63 | return_all_encoder_outputs=True) 64 | # ckpt = tf.train.Checkpoint(model=bert_encoder) 65 | # init_checkpoint = self.config['bert_model_path'] 66 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 67 | # bert_encoder.load_weights(init_checkpoint) 68 | return bert_encoder 69 | 70 | def build_losses(self, labels, 
model_outputs, metrics, aux_losses=None) -> tf.Tensor: 71 | ''' 72 | 构建损失 73 | ''' 74 | with tf.name_scope('TextMatchTask/losses'): 75 | if self.config['model_name'] == 'distill_model': 76 | # mse损失计算 77 | y = tf.reshape(labels, (-1,)) 78 | student_soft_label = model_outputs['student_soft_label'] 79 | teacher_soft_label = model_outputs['teacher_soft_label'] 80 | mse_loss = tf.keras.losses.mean_squared_error(teacher_soft_label, student_soft_label) 81 | 82 | #ce损失计算 83 | similarity = model_outputs['student_hard_label'] 84 | cond = (similarity < self.config["neg_threshold"]) 85 | zeros = tf.zeros_like(similarity, dtype=tf.float32) 86 | ones = tf.ones_like(similarity, dtype=tf.float32) 87 | squre_similarity = tf.square(similarity) 88 | neg_similarity = tf.where(cond, squre_similarity, zeros) 89 | 90 | pos_loss = y * (tf.square(ones - similarity) / 4) 91 | neg_loss = (ones - y) * neg_similarity 92 | ce_loss = pos_loss+neg_loss 93 | losses = self.config['alpha']*mse_loss + (1-self.config['alpha'])*ce_loss 94 | loss = tf.reduce_mean(losses) 95 | return loss 96 | 97 | metrics = dict([(metric.name, metric) for metric in metrics]) 98 | losses = tf.keras.losses.sparse_categorical_crossentropy(labels, 99 | tf.cast(model_outputs['predictions'], tf.float32), 100 | from_logits=True) 101 | 102 | loss = tf.reduce_mean(losses) 103 | 104 | return loss 105 | 106 | def build_inputs(self, inputs): 107 | ''' 108 | 构建输入 109 | ''' 110 | train_input = { 111 | "input_x_ids": tf.convert_to_tensor(inputs['input_word_ids_a']), 112 | "input_y_ids": tf.convert_to_tensor(inputs['input_word_ids_b']), 113 | "input_word_ids_a": tf.convert_to_tensor(inputs['input_word_ids_a']), 114 | "input_mask_a": tf.convert_to_tensor(inputs['input_mask_a']), 115 | "input_type_ids_a": tf.convert_to_tensor(inputs['input_type_ids_a']), 116 | "input_word_ids_b": tf.convert_to_tensor(inputs['input_word_ids_b']), 117 | "input_mask_b": tf.convert_to_tensor(inputs['input_mask_b']), 118 | "input_type_ids_b": tf.convert_to_tensor(inputs['input_type_ids_b']), 119 | "labels": inputs['input_target_ids'] 120 | } 121 | return train_input 122 | 123 | def train_step(self, 124 | inputs, 125 | model: tf.keras.Model, 126 | optimizer: tf.keras.optimizers.Optimizer, 127 | metrics=None): 128 | ''' 129 | 进行训练,前向和后向计算 130 | :param inputs: 131 | :param model: 132 | :param optimizer: 133 | :param metrics: 134 | :return: 135 | ''' 136 | 137 | with tf.GradientTape() as tape: 138 | outputs = model(inputs, training=True) 139 | loss = self.build_losses(inputs["labels"], outputs, metrics, aux_losses=None) 140 | 141 | tvars = model.trainable_variables 142 | grads = tape.gradient(loss, tvars) 143 | grads, _ = tf.clip_by_global_norm(grads, clip_norm=5.0) 144 | optimizer.apply_gradients(list(zip(grads, tvars))) 145 | labels = inputs['labels'] 146 | logs = {self.loss: loss} 147 | if metrics: 148 | self.process_metrics(metrics, labels, outputs['predictions']) 149 | logs.update({m.name: m.result() for m in model.metrics}) 150 | if model.compiled_metrics: 151 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs['predictions']) 152 | logs.update({m.name: m.result() for m in metrics or []}) 153 | logs.update({m.name: m.result() for m in model.metrics}) 154 | return logs 155 | 156 | def validation_step(self, inputs, model: tf.keras.Model, metrics=None): 157 | ''' 158 | 验证集验证模型 159 | :param input: 160 | :param model: 161 | :return: 162 | ''' 163 | labels = inputs['labels'] 164 | outputs = self.inference_step(inputs, model) 165 | loss = self.build_losses(labels, 
outputs, metrics, aux_losses=model.losses) 166 | 167 | logs = {self.loss: loss} 168 | if metrics: 169 | self.process_metrics(metrics, labels, outputs['predictions']) 170 | if model.compiled_metrics: 171 | self.process_compiled_metrics(model.compiled_metrics, labels, outputs['predictions']) 172 | logs.update({m.name: m.result() for m in metrics or []}) 173 | logs.update({m.name: m.result() for m in model.metrics}) 174 | return logs 175 | 176 | def build_metrics(self, training=None): 177 | ''' 178 | 构建评价指标 179 | :param training: 180 | :return: 181 | ''' 182 | # del training 183 | metrics = [ 184 | tf.keras.metrics.SparseCategoricalAccuracy(name='text_match_metrics') 185 | ] 186 | 187 | return metrics 188 | 189 | def check_exist_model(self, model): 190 | ''' 191 | 检查是否存在模型文件 192 | :return: 193 | ''' 194 | # ckpt = tf.train.Checkpoint(models=models) 195 | init_checkpoint = os.path.join(self.config['ckpt_model_path'], self.config['model_name']) 196 | 197 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 198 | model.load_weights(init_checkpoint).assert_existing_objects_matched() 199 | 200 | 201 | if __name__=='__main__': 202 | with open("../model_configs/distill_bert.json", 'r') as fr: 203 | config = json.load(fr) 204 | print(config) 205 | distill_pair = DistillTask(config) 206 | 207 | model = distill_pair.build_model() 208 | bert_encoder = distill_pair.build_encoder() 209 | ckpt = tf.train.Checkpoint(model=bert_encoder) 210 | init_checkpoint = config['bert_model_path'] 211 | ckpt.restore(init_checkpoint).assert_existing_objects_matched() 212 | # config = models.get_config() 213 | # new_model = tf.keras.Model(inputs=model.inputs[0:2], outputs=model.output['predictions']) 214 | # for layer in model.layers: 215 | # if layer.name!='sim_bert': 216 | # new_model.add(layer) 217 | distill_pair.train(model) 218 | # print(new_model.summary()) 219 | 220 | 221 | -------------------------------------------------------------------------------- /tasks/ranking_task.py: -------------------------------------------------------------------------------- 1 | from official.nlp.bert import tokenization 2 | import tensorflow as tf 3 | 4 | 5 | from official.nlp.tasks.tagging import TaggingTask 6 | from trainer.train_base import TrainBase 7 | from official.nlp.modeling.models import BertClassifier 8 | import os 9 | import json 10 | from data_processor.text_match_data_generator_v2 import TextMatchDataGeneratorV2 11 | from official.nlp.modeling.networks import BertEncoder 12 | from official.modeling import tf_utils 13 | from official.nlp.bert import configs as bert_configs 14 | from models.ranking import Ranking 15 | import numpy as np 16 | 17 | 18 | 19 | class RankingTask(TrainBase): 20 | ''' 21 | 基于bert的分类任务 22 | ''' 23 | def __init__(self, task_config): 24 | self.config = task_config 25 | self.loss = 'loss' 26 | super(RankingTask, self).__init__(task_config) 27 | self.data_generator = TextMatchDataGeneratorV2(task_config) 28 | 29 | 30 | def build_model(self): 31 | ''' 32 | 构建模型 33 | ''' 34 | # encoder_network = encoders.build_encoder(encoders.EncoderConfig( 35 | # bert=encoders.BertEncoderConfig(vocab_size=21128))) 36 | encoder_network = self.build_encoder() 37 | model = Ranking(network=encoder_network, config=self.config) 38 | 39 | return model 40 | 41 | def build_encoder(self): 42 | bert_config = bert_configs.BertConfig.from_json_file(self.config['bert_config_path']) 43 | cfg = bert_config 44 | bert_encoder = BertEncoder( 45 | vocab_size=cfg.vocab_size, 46 | hidden_size=cfg.hidden_size, 47 | 
num_layers=cfg.num_hidden_layers, 48 | num_attention_heads=cfg.num_attention_heads, 49 | intermediate_size=cfg.intermediate_size, 50 | activation=tf_utils.get_activation(cfg.hidden_act), 51 | dropout_rate=cfg.hidden_dropout_prob, 52 | attention_dropout_rate=cfg.attention_probs_dropout_prob, 53 | max_sequence_length=cfg.max_position_embeddings, 54 | type_vocab_size=cfg.type_vocab_size, 55 | initializer=tf.keras.initializers.TruncatedNormal( 56 | stddev=cfg.initializer_range), 57 | embedding_width=cfg.embedding_size, 58 | return_all_encoder_outputs=True) 59 | # ckpt = tf.train.Checkpoint(model=bert_encoder) 60 | # init_checkpoint = self.config['bert_model_path'] 61 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 62 | # bert_encoder.load_weights(init_checkpoint) 63 | return bert_encoder 64 | 65 | def lambda_rank_loss(self, scores, labels): 66 | ''' 67 | lambda rank损失 68 | ''' 69 | #delta_lambda计算 70 | rank = tf.range(1., tf.cast(self.config['num_samples'], dtype=tf.float32) + 1) 71 | rank = tf.tile(rank, [self.config['batch_size']]) 72 | rank = tf.reshape(rank, tf.shape(labels)) 73 | rel = 2 ** labels - 1 74 | sorted_label = tf.sort(labels, direction='DESCENDING') 75 | sorted_rel = 2 ** sorted_label - 1 76 | cg_discount = tf.math.log(1. + rank) 77 | dcg_m = rel / cg_discount 78 | dcg = tf.reduce_sum(dcg_m) 79 | stale_ij = dcg_m 80 | new_ij = rel / tf.transpose(cg_discount, perm=[0, 2, 1]) 81 | stale_ji = tf.transpose(stale_ij, perm=[0, 2, 1]) 82 | new_ji = tf.transpose(new_ij, perm=[0, 2, 1]) 83 | #new dcg 84 | dcg_new = dcg - stale_ij + new_ij - stale_ji + new_ji 85 | #delta dcg 86 | dcg_max = tf.reduce_sum(sorted_rel / cg_discount) 87 | ndcg_delta = tf.abs(dcg_new - dcg) / dcg_max 88 | 89 | # 90 | s_i_minus_s_j = scores - tf.transpose(scores, perm=[0, 2, 1]) 91 | #上三角矩阵 92 | mask1 = tf.linalg.band_part(ndcg_delta, 0, -1) 93 | #下三角矩阵 94 | mask2 = tf.linalg.band_part(s_i_minus_s_j, -1, 0) 95 | _loss = mask1 * tf.transpose(mask2, perm=[0, 2, 1]) 96 | loss = tf.reduce_sum(_loss) 97 | return loss 98 | 99 | 100 | def build_losses(self, labels, model_outputs, metrics, aux_losses=None) -> tf.Tensor: 101 | ''' 102 | 构建NDCG损失 103 | ''' 104 | def _ndcg(rank, relations): 105 | _dcg = [(np.power(2, relations[i]) - 1) / np.log2(rank[i] + 1) for i in range(len(relations))] 106 | _sort_similarity = sorted(relations, reverse=True) 107 | _idcg = [(np.power(2, _sort_similarity[i]) - 1) / np.log2(rank[i] + 1) for i in range(len(_sort_similarity))] 108 | _ndcg = tf.reduce_sum(_dcg) / tf.reduce_sum(_idcg) 109 | return _ndcg 110 | 111 | 112 | 113 | with tf.name_scope('TextMatchTask/lambdas'): 114 | # 构建ndcg损失 115 | tf.transpose(labels) 116 | y = tf.reshape(labels, [self.config['batch_size'], 1, self.config['num_samples']]) 117 | similarity = model_outputs['logits'] 118 | 119 | _relations = tf.keras.layers.Activation(tf.nn.sigmoid)(similarity) 120 | relations = tf.reshape(_relations[:, :, 1], tf.shape(y)) 121 | # rank = [i for i in range(1, self.config['num_samples']+1)] 122 | # _dcg = [(np.power(2,relations[i])-1) / np.log2(rank[i]+1) for i in range(len(relations))] 123 | # _sort_similarity = [sorted(item, reverse=True) for item in _dcg] 124 | # _idcg = [(tf.pow(2,r)-1) / (tf.math.log(rank+1)/tf.math.log(2)) for r in _sort_similarity] 125 | # _ndcg = tf.reduce_sum(_dcg) / tf.reduce_sum(_idcg) 126 | # ndcg = [_ndcg(rank, relations[i]) for i in range(len(relations))] 127 | 128 | # y = [_ndcg(rank, y[i]) for i in range(len(y))] 129 | metrics = dict([(metric.name, metric) for metric in metrics]) 
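# The commented-out sparse cross-entropy below is an earlier pointwise
# formulation kept for reference; the value actually returned is
# lambda_rank_loss, which weights the pairwise score differences s_i - s_j
# by the corresponding |delta NDCG| terms and sums them.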
130 | # losses = tf.keras.losses.sparse_categorical_crossentropy(y, 131 | # tf.cast(ndcg, tf.float32), 132 | # from_logits=True) 133 | 134 | loss = self.lambda_rank_loss(relations, y) 135 | 136 | return loss 137 | 138 | def build_inputs(self, inputs): 139 | ''' 140 | 构建输入 141 | ''' 142 | train_input = { 143 | "input_word_ids": tf.convert_to_tensor(inputs['input_word_ids']), 144 | "input_mask": tf.convert_to_tensor(inputs['input_mask']), 145 | "input_type_ids": tf.convert_to_tensor(inputs['input_type_ids']), 146 | "labels": inputs['input_target_ids'] 147 | } 148 | return train_input 149 | 150 | def train_step(self, 151 | inputs, 152 | model: tf.keras.Model, 153 | optimizer: tf.keras.optimizers.Optimizer, 154 | metrics=None): 155 | ''' 156 | 进行训练,前向和后向计算 157 | :param inputs: 158 | :param model: 159 | :param optimizer: 160 | :param metrics: 161 | :return: 162 | ''' 163 | 164 | with tf.GradientTape() as tape: 165 | outputs = model(inputs, training=True) 166 | loss = self.build_losses(inputs["labels"], outputs, metrics, aux_losses=None) 167 | 168 | tvars = model.trainable_variables 169 | grads = tape.gradient(loss, tvars) 170 | grads, _ = tf.clip_by_global_norm(grads, clip_norm=5.0) 171 | optimizer.apply_gradients(list(zip(grads, tvars))) 172 | labels = inputs['labels'] 173 | logs = {self.loss: loss} 174 | if metrics: 175 | self.process_metrics(metrics, tf.reshape(labels, (-1,1)), tf.reshape(outputs['predictions'], (-1,1))) 176 | logs.update({m.name: m.result() for m in model.metrics}) 177 | if model.compiled_metrics: 178 | self.process_compiled_metrics(model.compiled_metrics, tf.reshape(labels, (-1,1)), tf.reshape(outputs['predictions'], (-1,1))) 179 | logs.update({m.name: m.result() for m in metrics or []}) 180 | logs.update({m.name: m.result() for m in model.metrics}) 181 | return logs 182 | 183 | def validation_step(self, inputs, model: tf.keras.Model, metrics=None): 184 | ''' 185 | 验证集验证模型 186 | :param input: 187 | :param model: 188 | :return: 189 | ''' 190 | labels = inputs['labels'] 191 | outputs = self.inference_step(inputs, model) 192 | loss = self.build_losses(labels, outputs, metrics, aux_losses=model.losses) 193 | 194 | logs = {self.loss: loss} 195 | if metrics: 196 | self.process_metrics(metrics, tf.reshape(labels, (-1,1)), tf.reshape(outputs['predictions'], (-1,1))) 197 | if model.compiled_metrics: 198 | self.process_compiled_metrics(model.compiled_metrics, tf.reshape(labels, (-1,1)), tf.reshape(outputs['predictions'], (-1,1))) 199 | logs.update({m.name: m.result() for m in metrics or []}) 200 | logs.update({m.name: m.result() for m in model.metrics}) 201 | return logs 202 | 203 | def build_metrics(self, training=None): 204 | ''' 205 | 构建评价指标 206 | :param training: 207 | :return: 208 | ''' 209 | # del training 210 | metrics = [ 211 | tf.keras.metrics.SparseCategoricalAccuracy(name='text_match_metrics') 212 | ] 213 | 214 | return metrics 215 | 216 | def check_exist_model(self, model): 217 | ''' 218 | 检查是否存在模型文件 219 | :return: 220 | ''' 221 | # ckpt = tf.train.Checkpoint(models=models) 222 | init_checkpoint = os.path.join(self.config['ckpt_model_path'], self.config['model_name']) 223 | 224 | # ckpt.restore(init_checkpoint).assert_existing_objects_matched() 225 | model.load_weights(init_checkpoint).assert_existing_objects_matched() 226 | 227 | 228 | if __name__=='__main__': 229 | with open("../model_configs/ranking.json", 'r') as fr: 230 | config = json.load(fr) 231 | print(config) 232 | Itr_pair = RankingTask(config) 233 | 234 | model = Itr_pair.build_model() 235 | bert_encoder = 
Itr_pair.build_encoder() 236 | ckpt = tf.train.Checkpoint(model=bert_encoder) 237 | init_checkpoint = config['bert_model_path'] 238 | ckpt.restore(init_checkpoint).assert_existing_objects_matched() 239 | # config = models.get_config() 240 | Itr_pair.train(model) 241 | # print(model.layers) 242 | 243 | 244 | -------------------------------------------------------------------------------- /data_processor/text_match_data_generator.py: -------------------------------------------------------------------------------- 1 | from data_processor.embedding import embedding 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | import os 6 | from random import shuffle 7 | 8 | class TextMatchDataGenerator(embedding): 9 | ''' 10 | 生成训练数据 11 | ''' 12 | def __init__(self, config): 13 | super(TextMatchDataGenerator, self).__init__(config) 14 | self.config = config 15 | self.batch_size = config['batch_size'] 16 | self.load_data() 17 | self.train_data, self.train_label, self.eval_data, self.eval_label = self.train_eval_split(self.query_word_idx, self.query_segment_idx, self.query_word_mask, self.query_sequence_length, \ 18 | self.sim_word_idx, self.sim_segment_idx, self.sim_word_mask, self.sim_sequence_length, self.labels_idx, 0.2) 19 | 20 | def read_data(self, file_path, data_size=100): 21 | ''' 22 | 加载训练数据 23 | ''' 24 | df = pd.read_csv(file_path) 25 | # query = [jieba.lcut(i) for i in df['sentence1'].values[0:data_size]] 26 | # sim = [jieba.lcut(i) for i in df['sentence2'].values[0:data_size]] 27 | query = [list(i) for i in df['sentence1'].values[0:data_size]] 28 | sim = [list(i) for i in df['sentence2'].values[0:data_size]] 29 | label = df['label'].values[0:data_size] 30 | 31 | return query, sim, label 32 | 33 | def save_input_tokens(self, query, sim, labels, label_to_index): 34 | ''' 35 | 保存处理完成的输入tokens,方便后续加载 36 | :param texts: 37 | :return: 38 | ''' 39 | 40 | query_word_ids, query_segment_ids, query_word_mask, query_sequence_length = [], [], [], [] 41 | sim_word_ids, sim_segment_ids, sim_word_mask, sim_sequence_length = [], [], [], [] 42 | 43 | label_ids = [] 44 | for i in range(len(query)): 45 | _query_word_ids, _query_segment_ids, _query_word_mask, _query_sequence_length = self.encode(query[i]) 46 | _sim_word_ids, _sim_segment_ids, _sim_word_mask, _sim_sequence_length = self.encode(sim[i]) 47 | 48 | query_word_ids.append(_query_word_ids) 49 | query_segment_ids.append(_query_segment_ids) 50 | query_word_mask.append(_query_word_mask) 51 | query_sequence_length.append(_query_sequence_length) 52 | 53 | sim_word_ids.append(_sim_word_ids) 54 | sim_segment_ids.append(_sim_segment_ids) 55 | sim_word_mask.append(_sim_word_mask) 56 | sim_sequence_length.append(_sim_sequence_length) 57 | 58 | label_id = self.labels_to_ids([labels[i]], label_to_index) 59 | label_ids.append(label_id) 60 | input_tokens = dict(query_word_ids=query_word_ids, query_segment_ids=query_segment_ids, query_word_mask=query_word_mask, 61 | query_sequence_length=query_sequence_length,sim_word_ids=sim_word_ids, 62 | sim_segment_ids=sim_segment_ids, sim_word_mask=sim_word_mask, 63 | sim_sequence_length=sim_sequence_length,labels_idx=label_ids) 64 | if not os.path.exists(self.config['output_path']): 65 | os.mkdir(self.config['output_path']) 66 | #保存准备训练的tokens数据 67 | with open(os.path.join(self.config['output_path'], 'train_tokens.pkl'), "wb") as fw: 68 | pickle.dump(input_tokens, fw) 69 | # 保存预处理的label_to_index数据 70 | with open(os.path.join(self.config['output_path'], 'label_to_index.pkl'), "wb") as fw: 71 | 
pickle.dump(label_to_index, fw) 72 | return query_word_ids, query_segment_ids, query_word_mask, query_sequence_length,\ 73 | sim_word_ids, sim_segment_ids, sim_word_mask, sim_sequence_length, label_ids 74 | 75 | def load_data(self): 76 | ''' 77 | 加载预处理好的数据 78 | :return: 79 | ''' 80 | 81 | if os.path.exists(os.path.join(self.config['output_path'], "train_tokens.pkl")) and \ 82 | os.path.exists(os.path.join(self.config['output_path'], "label_to_index.pkl")): 83 | print("load existed train data") 84 | # with open(os.path.join(self.config['output_path'], "word_to_index.pkl"), "rb") as f: 85 | # self.word_to_index = pickle.load(f) 86 | with open(os.path.join(self.config['output_path'], "label_to_index.pkl"), "rb") as f: 87 | self.label_to_index = pickle.load(f) 88 | with open(os.path.join(self.config['output_path'], "train_tokens.pkl"), "rb") as f: 89 | train_data = pickle.load(f) 90 | 91 | self.query_word_idx, self.query_segment_idx, self.query_word_mask, self.query_sequence_length, \ 92 | self.sim_word_idx, self.sim_segment_idx, self.sim_word_mask, self.sim_sequence_length, self.labels_idx = np.array(train_data["query_word_ids"]), \ 93 | np.array(train_data["query_segment_ids"]), \ 94 | np.array(train_data["query_word_mask"]), \ 95 | np.array(train_data["query_sequence_length"]), \ 96 | np.array(train_data["sim_word_ids"]), \ 97 | np.array(train_data["sim_segment_ids"]), \ 98 | np.array(train_data["sim_word_mask"]), \ 99 | np.array(train_data["sim_sequence_length"]), \ 100 | np.array(train_data["labels_idx"]) 101 | else: 102 | # 1,读取原始数据 103 | query, sim, labels = self.read_data(self.config['data_path']) 104 | print("read finished") 105 | 106 | label_to_index = self.label_to_index(labels) 107 | 108 | query_word_ids, query_segment_ids, query_word_mask, query_sequence_length, \ 109 | sim_word_ids, sim_segment_ids, sim_word_mask, sim_sequence_length, label_ids = self.save_input_tokens(query, sim, labels, label_to_index) 110 | print('text to tokens process finished') 111 | 112 | # train_data = dict(inputs_idx=inputs_idx, labels_idx=labels_idx) 113 | # with open(os.path.join(self.config['output_path'], "train_data.pkl"), "wb") as fw: 114 | # pickle.dump(train_data, fw) 115 | # labels_idx = labels 116 | self.query_word_idx, self.query_segment_idx, self.query_word_mask, self.query_sequence_length, \ 117 | self.sim_word_idx, self.sim_segment_idx, self.sim_word_mask, self.sim_sequence_length,self.labels_idx = query_word_ids, query_segment_ids, query_word_mask, query_sequence_length,\ 118 | sim_word_ids, sim_segment_ids, sim_word_mask, sim_sequence_length, label_ids 119 | 120 | def train_eval_split(self, query_word_ids, query_segment_ids, query_word_mask, query_sequence_length, 121 | sim_word_ids, sim_segment_ids, sim_word_mask, sim_sequence_length, labels, rate): 122 | 123 | split_index = int(len(query_word_ids) * rate) 124 | train_data = (query_word_ids[split_index:], query_segment_ids[split_index:], query_word_mask[split_index:], 125 | query_sequence_length[split_index:], sim_word_ids[split_index:], sim_segment_ids[split_index:], 126 | sim_word_mask[split_index:], sim_sequence_length[split_index:]) 127 | train_label = labels[split_index:] 128 | eval_data = (query_word_ids[:split_index], query_segment_ids[:split_index], query_word_mask[:split_index], 129 | query_sequence_length[:split_index], sim_word_ids[:split_index], sim_segment_ids[:split_index], 130 | sim_word_mask[:split_index], sim_sequence_length[:split_index]) 131 | eval_label = labels[:split_index] 132 | 133 | return train_data, 
train_label, eval_data, eval_label 134 | 135 | def gen_data(self, inputs_idx, labels_idx): 136 | ''' 137 | 生成批次数据 138 | :return: 139 | ''' 140 | query_word_ids, query_segment_ids, query_word_mask, query_sequence_length, \ 141 | sim_word_ids, sim_segment_ids, sim_word_mask, sim_sequence_length = inputs_idx[0], inputs_idx[1],inputs_idx[2],\ 142 | inputs_idx[3],inputs_idx[4],inputs_idx[5],\ 143 | inputs_idx[6],inputs_idx[7] 144 | batch_word_ids_a, batch_segment_ids_a, batch_word_mask_a, batch_sequence_length_a, \ 145 | batch_word_ids_b, batch_segment_ids_b, batch_word_mask_b, batch_sequence_length_b, batch_output_ids= [], [], [], [], [], [], [], [], [] 146 | 147 | for i in range(len(query_word_ids)): 148 | batch_word_ids_a.append(query_word_ids[i]) 149 | batch_segment_ids_a.append(query_segment_ids[i]) 150 | batch_word_mask_a.append(query_word_mask[i]) 151 | batch_sequence_length_a.append(query_sequence_length[i]) 152 | 153 | batch_word_ids_b.append(sim_word_ids[i]) 154 | batch_segment_ids_b.append(sim_segment_ids[i]) 155 | batch_word_mask_b.append(sim_word_mask[i]) 156 | batch_sequence_length_b.append(sim_sequence_length[i]) 157 | 158 | batch_output_ids.append(labels_idx[i]) 159 | 160 | 161 | if len(batch_output_ids) == self.batch_size: 162 | yield dict( 163 | input_word_ids_a=np.array(batch_word_ids_a, dtype="int32"), 164 | input_mask_a=np.array(batch_word_mask_a, dtype="int32"), 165 | input_type_ids_a=np.array(batch_segment_ids_a, dtype="int32"), 166 | input_word_ids_b=np.array(batch_word_ids_b, dtype="int32"), 167 | input_mask_b=np.array(batch_word_mask_b, dtype="int32"), 168 | input_type_ids_b=np.array(batch_segment_ids_b, dtype="int32"), 169 | input_target_ids=np.array(batch_output_ids, dtype="float32") 170 | ) 171 | batch_word_ids_a, batch_segment_ids_a, batch_word_mask_a, batch_sequence_length_a, \ 172 | batch_word_ids_b, batch_segment_ids_b, batch_word_mask_b, batch_sequence_length_b, batch_output_ids = [], [], [], [], [], [], [], [], [] 173 | 174 | --------------------------------------------------------------------------------
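The generator above only yields padded batches; each task turns them into model inputs via build_inputs and drives optimization via train_step. Below is a minimal sketch of that flow for the distillation task, not a file in the repository: it assumes the paths in distill_bert.json (data_path, output_path, bert_config_path, ckpt_model_path, ...) point at valid local files, and that driving the loop by hand approximates what TrainBase.train does internally.

import json

from tasks.distillation_task import DistillTask

with open("../model_configs/distill_bert.json", "r") as fr:
    config = json.load(fr)

task = DistillTask(config)        # also builds the TextMatchDataGenerator
model = task.build_model()        # Distill_model wrapping a SimBert teacher
optimizer = task.get_optimizer()  # adam / rmsprop / sgd, chosen by the config
metrics = task.build_metrics()

# One pass over the training split: gen_data -> build_inputs -> train_step.
gen = task.data_generator
for batch in gen.gen_data(gen.train_data, gen.train_label):
    inputs = task.build_inputs(batch)
    logs = task.train_step(inputs, model, optimizer, metrics=metrics)
    print({name: float(value) for name, value in logs.items()})

task.save_ckpt_model(model)       # writes weights under ckpt_model_path/model_name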